diff --git a/.clang-format b/.clang-format new file mode 100644 index 00000000..66248cc0 --- /dev/null +++ b/.clang-format @@ -0,0 +1,23 @@ +BasedOnStyle: LLVM +IndentWidth: 2 +UseTab: Never +IndentCaseLabels: true +AllowShortBlocksOnASingleLine: true +AllowShortCaseLabelsOnASingleLine: true +AllowShortIfStatementsOnASingleLine: true +AllowShortFunctionsOnASingleLine: true +ColumnLimit: 100 +SortIncludes: false +BreakBeforeBraces: Custom +BraceWrapping: + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + BeforeCatch: true + BeforeElse: true + IndentBraces: false diff --git a/mpb/epsilon.c b/mpb/epsilon.c index 79d40a35..5182613a 100644 --- a/mpb/epsilon.c +++ b/mpb/epsilon.c @@ -19,67 +19,54 @@ /**************************************************************************/ -static void material_epsilon(material_type material, - symmetric_matrix *eps, symmetric_matrix *eps_inv) -{ - switch (material.which_subclass) { - case MEDIUM: - { - real eps_val = material.subclass.medium_data->epsilon; - eps->m00 = eps->m11 = eps->m22 = eps_val; - eps_inv->m00 = eps_inv->m11 = eps_inv->m22 = 1.0 / eps_val; +static void material_epsilon(material_type material, symmetric_matrix *eps, + symmetric_matrix *eps_inv) { + switch (material.which_subclass) { + case MEDIUM: { + real eps_val = material.subclass.medium_data->epsilon; + eps->m00 = eps->m11 = eps->m22 = eps_val; + eps_inv->m00 = eps_inv->m11 = eps_inv->m22 = 1.0 / eps_val; #ifdef WITH_HERMITIAN_EPSILON - CASSIGN_ZERO(eps->m01); - CASSIGN_ZERO(eps->m02); - CASSIGN_ZERO(eps->m12); - CASSIGN_ZERO(eps_inv->m01); - CASSIGN_ZERO(eps_inv->m02); - CASSIGN_ZERO(eps_inv->m12); + CASSIGN_ZERO(eps->m01); + CASSIGN_ZERO(eps->m02); + CASSIGN_ZERO(eps->m12); + CASSIGN_ZERO(eps_inv->m01); + CASSIGN_ZERO(eps_inv->m02); + CASSIGN_ZERO(eps_inv->m12); #else - eps->m01 = eps->m02 = eps->m12 = 0.0; - eps_inv->m01 = eps_inv->m02 = eps_inv->m12 = 0.0; + eps->m01 = eps->m02 = eps->m12 = 0.0; + eps_inv->m01 = eps_inv->m02 = eps_inv->m12 = 0.0; #endif - break; - } - case MEDIUM_ANISOTROPIC: - { - medium_anisotropic *d = - material.subclass.medium_anisotropic_data; - eps->m00 = d->epsilon_diag.x; - eps->m11 = d->epsilon_diag.y; - eps->m22 = d->epsilon_diag.z; + break; + } + case MEDIUM_ANISOTROPIC: { + medium_anisotropic *d = material.subclass.medium_anisotropic_data; + eps->m00 = d->epsilon_diag.x; + eps->m11 = d->epsilon_diag.y; + eps->m22 = d->epsilon_diag.z; #ifdef WITH_HERMITIAN_EPSILON - CASSIGN_SCALAR(eps->m01, d->epsilon_offdiag.x.re, - d->epsilon_offdiag.x.im + - d->epsilon_offdiag_imag.x); - CASSIGN_SCALAR(eps->m02, d->epsilon_offdiag.y.re, - d->epsilon_offdiag.y.im + - d->epsilon_offdiag_imag.y); - CASSIGN_SCALAR(eps->m12, d->epsilon_offdiag.z.re, - d->epsilon_offdiag.z.im + - d->epsilon_offdiag_imag.z); + CASSIGN_SCALAR(eps->m01, d->epsilon_offdiag.x.re, + d->epsilon_offdiag.x.im + d->epsilon_offdiag_imag.x); + CASSIGN_SCALAR(eps->m02, d->epsilon_offdiag.y.re, + d->epsilon_offdiag.y.im + d->epsilon_offdiag_imag.y); + CASSIGN_SCALAR(eps->m12, d->epsilon_offdiag.z.re, + d->epsilon_offdiag.z.im + d->epsilon_offdiag_imag.z); #else - eps->m01 = d->epsilon_offdiag.x.re; - eps->m02 = d->epsilon_offdiag.y.re; - eps->m12 = d->epsilon_offdiag.z.re; - CHECK(vector3_norm(vector3_plus( - cvector3_im(d->epsilon_offdiag), - d->epsilon_offdiag_imag)) == 0.0, - "imaginary epsilon-offdiag is only supported when MPB is configured --with-hermitian-eps"); + eps->m01 = d->epsilon_offdiag.x.re; + eps->m02 = d->epsilon_offdiag.y.re; + eps->m12 = d->epsilon_offdiag.z.re; + CHECK(vector3_norm(vector3_plus(cvector3_im(d->epsilon_offdiag), d->epsilon_offdiag_imag)) == + 0.0, + "imaginary epsilon-offdiag is only supported when MPB is configured " + "--with-hermitian-eps"); #endif - maxwell_sym_matrix_invert(eps_inv, eps); - break; - } - case MATERIAL_GRID: - CHECK(0, "invalid use of material-grid"); - break; - case MATERIAL_FUNCTION: - CHECK(0, "invalid use of material-function"); - break; - case MATERIAL_TYPE_SELF: - CHECK(0, "invalid use of material-type"); - break; - } + maxwell_sym_matrix_invert(eps_inv, eps); + break; + } + case MATERIAL_GRID: CHECK(0, "invalid use of material-grid"); break; + case MATERIAL_FUNCTION: CHECK(0, "invalid use of material-function"); break; + case MATERIAL_TYPE_SELF: CHECK(0, "invalid use of material-type"); break; + } } /* Given a position r in the basis of the lattice vectors, return the @@ -90,135 +77,141 @@ static void material_epsilon(material_type material, This function is passed to set_maxwell_dielectric to initialize the dielectric tensor array for eigenvector calculations. */ -static void epsilon_func(symmetric_matrix *eps, symmetric_matrix *eps_inv, - const real r[3], void *edata) -{ - medium_func_data *d = (medium_func_data *) edata; - geom_box_tree tp; - int oi; - material_type material; - vector3 p; - boolean inobject; - - /* p needs to be in the lattice *unit* vector basis, while r is - in the lattice vector basis. Also, shift origin to the center - of the grid. */ - p.x = no_size_x ? 0 : (r[0] - 0.5) * geometry_lattice.size.x; - p.y = no_size_y ? 0 : (r[1] - 0.5) * geometry_lattice.size.y; - p.z = no_size_z ? 0 : (r[2] - 0.5) * geometry_lattice.size.z; - - /* call search routine from libctl/utils/libgeom/geom.c: - (we have to use the lower-level geom_tree_search to - support material-grid types, which have funny semantics) */ - tp = geom_tree_search(p = shift_to_unit_cell(p), geometry_tree, &oi); - if (tp) { - inobject = 1; - material = tp->objects[oi].o->material; - } - else { - inobject = 0; - material = default_material; - } +static void epsilon_func(symmetric_matrix *eps, symmetric_matrix *eps_inv, const real r[3], + void *edata) { + medium_func_data *d = (medium_func_data *)edata; + geom_box_tree tp; + int oi; + material_type material; + vector3 p; + boolean inobject; + + /* p needs to be in the lattice *unit* vector basis, while r is + in the lattice vector basis. Also, shift origin to the center + of the grid. */ + p.x = no_size_x ? 0 : (r[0] - 0.5) * geometry_lattice.size.x; + p.y = no_size_y ? 0 : (r[1] - 0.5) * geometry_lattice.size.y; + p.z = no_size_z ? 0 : (r[2] - 0.5) * geometry_lattice.size.z; + + /* call search routine from libctl/utils/libgeom/geom.c: + (we have to use the lower-level geom_tree_search to + support material-grid types, which have funny semantics) */ + tp = geom_tree_search(p = shift_to_unit_cell(p), geometry_tree, &oi); + if (tp) { + inobject = 1; + material = tp->objects[oi].o->material; + } + else { + inobject = 0; + material = default_material; + } #ifdef DEBUG_GEOMETRY_TREE - { - material_type m2 = material_of_point_inobject(p, &inobject); - CHECK(m2.which_subclass == material.which_subclass && - m2.subclass.medium_data == - material.subclass.medium_data, - "material_of_point & material_of_point_in_tree don't agree!"); - } + { + material_type m2 = material_of_point_inobject(p, &inobject); + CHECK(m2.which_subclass == material.which_subclass && + m2.subclass.medium_data == material.subclass.medium_data, + "material_of_point & material_of_point_in_tree don't agree!"); + } #endif - if (material.which_subclass == MATERIAL_TYPE_SELF) { - material = default_material; - tp = 0; inobject = 0; /* treat as a "nothing" object */ - } - - /* if we aren't in any geometric object and we have an epsilon - file, use that. */ - if (!inobject && d->epsilon_file_func) { - d->epsilon_file_func(eps, eps_inv, r, d->epsilon_file_func_data); - } - else { - boolean destroy_material = 0; - - while (material.which_subclass == MATERIAL_FUNCTION) { - material_type m; - SCM mo; - /* material_func is a Scheme function, taking a position - vector and returning a material at that point: */ - mo = gh_call1(material.subclass. - material_function_data->material_func, - ctl_convert_vector3_to_scm(p)); - material_type_input(mo, &m); - if (destroy_material) - material_type_destroy(material); - material = m; - destroy_material = 1; - } - - /* For a material grid, we interpolate the point (in "object" - coordinates) into the grid. More than that, however, - we check if the same point intersects the *same* material grid - from multiple objects -- if so, we take the product of - the interpolated grid values. */ - if (material.which_subclass == MATERIAL_GRID) { - material_type mat_eps; - mat_eps = make_epsilon( - matgrid_val(p, tp, oi, - material.subclass.material_grid_data) - * (material.subclass.material_grid_data->epsilon_max - - material.subclass.material_grid_data->epsilon_min) + - material.subclass.material_grid_data->epsilon_min); - if (destroy_material) - material_type_destroy(material); - material = mat_eps; - destroy_material = 1; - } - - material_epsilon(material, eps, eps_inv); - if (destroy_material) - material_type_destroy(material); - } + if (material.which_subclass == MATERIAL_TYPE_SELF) { + material = default_material; + tp = 0; + inobject = 0; /* treat as a "nothing" object */ + } + + /* if we aren't in any geometric object and we have an epsilon + file, use that. */ + if (!inobject && d->epsilon_file_func) { + d->epsilon_file_func(eps, eps_inv, r, d->epsilon_file_func_data); + } + else { + boolean destroy_material = 0; + + while (material.which_subclass == MATERIAL_FUNCTION) { + material_type m; + SCM mo; + /* material_func is a Scheme function, taking a position + vector and returning a material at that point: */ + mo = gh_call1(material.subclass.material_function_data->material_func, + ctl_convert_vector3_to_scm(p)); + material_type_input(mo, &m); + if (destroy_material) material_type_destroy(material); + material = m; + destroy_material = 1; + } + + /* For a material grid, we interpolate the point (in "object" + coordinates) into the grid. More than that, however, + we check if the same point intersects the *same* material grid + from multiple objects -- if so, we take the product of + the interpolated grid values. */ + if (material.which_subclass == MATERIAL_GRID) { + material_type mat_eps; + mat_eps = make_epsilon(matgrid_val(p, tp, oi, material.subclass.material_grid_data) * + (material.subclass.material_grid_data->epsilon_max - + material.subclass.material_grid_data->epsilon_min) + + material.subclass.material_grid_data->epsilon_min); + if (destroy_material) material_type_destroy(material); + material = mat_eps; + destroy_material = 1; + } + + material_epsilon(material, eps, eps_inv); + if (destroy_material) material_type_destroy(material); + } } -static int mean_epsilon_func(symmetric_matrix *meps, - symmetric_matrix *meps_inv, - real n[3], - real d1, real d2, real d3, real tol, - const real r[3], void *edata) -{ - medium_func_data *d = (medium_func_data *) edata; - vector3 p; - const geometric_object *o1 = 0, *o2 = 0; - vector3 shiftby1, shiftby2, normal; - geom_box pixel; - double fill; - material_type mat1, mat2; - int id1 = -1, id2 = -1; - int i; - const int num_neighbors[3] = { 3, 5, 9 }; - const int neighbors[3][9][3] = { - { {0,0,0}, {-1,0,0}, {1,0,0}, - {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0} }, - { {0,0,0}, - {-1,-1,0}, {1,1,0}, {-1,1,0}, {1,-1,0}, - {0,0,0},{0,0,0},{0,0,0},{0,0,0} }, - { {0,0,0}, - {1,1,1},{1,1,-1},{1,-1,1},{1,-1,-1}, - {-1,1,1},{-1,1,-1},{-1,-1,1},{-1,-1,-1} } - }; - - /* p needs to be in the lattice *unit* vector basis, while r is - in the lattice vector basis. Also, shift origin to the center - of the grid. */ - p.x = no_size_x ? 0 : (r[0] - 0.5) * geometry_lattice.size.x; - p.y = no_size_y ? 0 : (r[1] - 0.5) * geometry_lattice.size.y; - p.z = no_size_z ? 0 : (r[2] - 0.5) * geometry_lattice.size.z; - d1 *= no_size_x ? 0 : geometry_lattice.size.x * 0.5; - d2 *= no_size_y ? 0 : geometry_lattice.size.y * 0.5; - d3 *= no_size_z ? 0 : geometry_lattice.size.z * 0.5; +static int mean_epsilon_func(symmetric_matrix *meps, symmetric_matrix *meps_inv, real n[3], real d1, + real d2, real d3, real tol, const real r[3], void *edata) { + medium_func_data *d = (medium_func_data *)edata; + vector3 p; + const geometric_object *o1 = 0, *o2 = 0; + vector3 shiftby1, shiftby2, normal; + geom_box pixel; + double fill; + material_type mat1, mat2; + int id1 = -1, id2 = -1; + int i; + const int num_neighbors[3] = {3, 5, 9}; + const int neighbors[3][9][3] = {{{0, 0, 0}, + {-1, 0, 0}, + {1, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}}, + {{0, 0, 0}, + {-1, -1, 0}, + {1, 1, 0}, + {-1, 1, 0}, + {1, -1, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}}, + {{0, 0, 0}, + {1, 1, 1}, + {1, 1, -1}, + {1, -1, 1}, + {1, -1, -1}, + {-1, 1, 1}, + {-1, 1, -1}, + {-1, -1, 1}, + {-1, -1, -1}}}; + + /* p needs to be in the lattice *unit* vector basis, while r is + in the lattice vector basis. Also, shift origin to the center + of the grid. */ + p.x = no_size_x ? 0 : (r[0] - 0.5) * geometry_lattice.size.x; + p.y = no_size_y ? 0 : (r[1] - 0.5) * geometry_lattice.size.y; + p.z = no_size_z ? 0 : (r[2] - 0.5) * geometry_lattice.size.z; + d1 *= no_size_x ? 0 : geometry_lattice.size.x * 0.5; + d2 *= no_size_y ? 0 : geometry_lattice.size.y * 0.5; + d3 *= no_size_z ? 0 : geometry_lattice.size.z * 0.5; #if 0 /* no averaging */ epsilon_func(meps, meps_inv, r, edata); @@ -226,357 +219,321 @@ static int mean_epsilon_func(symmetric_matrix *meps, return 1; #endif - for (i = 0; i < num_neighbors[dimensions - 1]; ++i) { - const geometric_object *o; - material_type mat; - vector3 q, z, shiftby; - int id; - q.x = p.x + neighbors[dimensions - 1][i][0] * d1; - q.y = p.y + neighbors[dimensions - 1][i][1] * d2; - q.z = p.z + neighbors[dimensions - 1][i][2] * d3; - z = shift_to_unit_cell(q); - o = object_of_point_in_tree(z, geometry_tree, &shiftby, &id); - shiftby = vector3_plus(shiftby, vector3_minus(q, z)); - if ((id == id1 && vector3_equal(shiftby, shiftby1)) || - (id == id2 && vector3_equal(shiftby, shiftby2))) - continue; - mat = (o && o->material.which_subclass != MATERIAL_TYPE_SELF) - ? o->material : default_material; - if (id1 == -1) { - o1 = o; - shiftby1 = shiftby; - id1 = id; - mat1 = mat; - } - else if (id2 == -1 || ((id >= id1 && id >= id2) && - (id1 == id2 - || material_type_equal(&mat1,&mat2)))) { - o2 = o; - shiftby2 = shiftby; - id2 = id; - mat2 = mat; - } - else if (!(id1 < id2 && - (id1 == id || material_type_equal(&mat1,&mat))) && - !(id2 < id1 && - (id2 == id || material_type_equal(&mat2,&mat)))) - return 0; /* too many nearby objects for analysis */ - } - - CHECK(id1 > -1, "bug in object_of_point_in_tree?"); - if (id2 == -1) { /* only one nearby object/material */ - id2 = id1; - o2 = o1; - mat2 = mat1; - shiftby2 = shiftby1; - } - - if ((o1 && variable_material(o1->material.which_subclass)) || - (o2 && variable_material(o2->material.which_subclass)) || - ((variable_material(default_material.which_subclass) - || d->epsilon_file_func) - && (!o1 || !o2 || - o1->material.which_subclass == MATERIAL_TYPE_SELF || - o2->material.which_subclass == MATERIAL_TYPE_SELF))) - return 0; /* arbitrary material functions are non-analyzable */ - - material_epsilon(mat1, meps, meps_inv); - - /* check for trivial case of only one object/material */ - if (id1 == id2 || material_type_equal(&mat1, &mat2)) { - n[0] = n[1] = n[2] = 0; - return 1; - } - - if (id1 > id2) - normal = normal_to_fixed_object(vector3_minus(p, shiftby1), *o1); - else - normal = normal_to_fixed_object(vector3_minus(p, shiftby2), *o2); - - n[0] = no_size_x ? 0 : normal.x / geometry_lattice.size.x; - n[1] = no_size_y ? 0 : normal.y / geometry_lattice.size.y; - n[2] = no_size_z ? 0 : normal.z / geometry_lattice.size.z; - - pixel.low.x = p.x - d1; - pixel.high.x = p.x + d1; - pixel.low.y = p.y - d2; - pixel.high.y = p.y + d2; - pixel.low.z = p.z - d3; - pixel.high.z = p.z + d3; - - tol = tol > 0.01 ? 0.01 : tol; - if (id1 > id2) { - pixel.low = vector3_minus(pixel.low, shiftby1); - pixel.high = vector3_minus(pixel.high, shiftby1); - fill = box_overlap_with_object(pixel, *o1, tol, 100/tol); - } - else { - pixel.low = vector3_minus(pixel.low, shiftby2); - pixel.high = vector3_minus(pixel.high, shiftby2); - fill = 1 - box_overlap_with_object(pixel, *o2, tol, 100/tol); - } - - { - symmetric_matrix eps2, epsinv2; - symmetric_matrix eps1, delta; - double Rot[3][3], norm, n0, n1, n2; - material_epsilon(mat2, &eps2, &epsinv2); - eps1 = *meps; - - /* make Cartesian orthonormal frame relative to interface */ - n0 = R[0][0] * n[0] + R[1][0] * n[1] + R[2][0] * n[2]; - n1 = R[0][1] * n[0] + R[1][1] * n[1] + R[2][1] * n[2]; - n2 = R[0][2] * n[0] + R[1][2] * n[1] + R[2][2] * n[2]; - norm = sqrt(n0*n0 + n1*n1 + n2*n2); - if (norm == 0.0) - return 0; - norm = 1.0 / norm; - Rot[0][0] = n0 = n0 * norm; - Rot[1][0] = n1 = n1 * norm; - Rot[2][0] = n2 = n2 * norm; - if (fabs(n0) > 1e-2 || fabs(n1) > 1e-2) { /* (z x n) */ - Rot[0][2] = n1; - Rot[1][2] = -n0; - Rot[2][2] = 0; - } - else { /* n is ~ parallel to z direction, use (x x n) instead */ - Rot[0][2] = 0; - Rot[1][2] = -n2; - Rot[2][2] = n1; - } - { /* normalize second column */ - double s = Rot[0][2]*Rot[0][2]+Rot[1][2]*Rot[1][2]+Rot[2][2]*Rot[2][2]; - s = 1.0 / sqrt(s); - Rot[0][2] *= s; - Rot[1][2] *= s; - Rot[2][2] *= s; - } - /* 1st column is 2nd column x 0th column */ - Rot[0][1] = Rot[1][2] * Rot[2][0] - Rot[2][2] * Rot[1][0]; - Rot[1][1] = Rot[2][2] * Rot[0][0] - Rot[0][2] * Rot[2][0]; - Rot[2][1] = Rot[0][2] * Rot[1][0] - Rot[1][2] * Rot[0][0]; - - /* rotate epsilon tensors to surface parallel/perpendicular axes */ - maxwell_sym_matrix_rotate(&eps1, &eps1, Rot); - maxwell_sym_matrix_rotate(&eps2, &eps2, Rot); - -#define AVG (fill * (EXPR(eps1)) + (1-fill) * (EXPR(eps2))) + for (i = 0; i < num_neighbors[dimensions - 1]; ++i) { + const geometric_object *o; + material_type mat; + vector3 q, z, shiftby; + int id; + q.x = p.x + neighbors[dimensions - 1][i][0] * d1; + q.y = p.y + neighbors[dimensions - 1][i][1] * d2; + q.z = p.z + neighbors[dimensions - 1][i][2] * d3; + z = shift_to_unit_cell(q); + o = object_of_point_in_tree(z, geometry_tree, &shiftby, &id); + shiftby = vector3_plus(shiftby, vector3_minus(q, z)); + if ((id == id1 && vector3_equal(shiftby, shiftby1)) || + (id == id2 && vector3_equal(shiftby, shiftby2))) + continue; + mat = (o && o->material.which_subclass != MATERIAL_TYPE_SELF) ? o->material : default_material; + if (id1 == -1) { + o1 = o; + shiftby1 = shiftby; + id1 = id; + mat1 = mat; + } + else if (id2 == -1 || + ((id >= id1 && id >= id2) && (id1 == id2 || material_type_equal(&mat1, &mat2)))) { + o2 = o; + shiftby2 = shiftby; + id2 = id; + mat2 = mat; + } + else if (!(id1 < id2 && (id1 == id || material_type_equal(&mat1, &mat))) && + !(id2 < id1 && (id2 == id || material_type_equal(&mat2, &mat)))) + return 0; /* too many nearby objects for analysis */ + } + + CHECK(id1 > -1, "bug in object_of_point_in_tree?"); + if (id2 == -1) { /* only one nearby object/material */ + id2 = id1; + o2 = o1; + mat2 = mat1; + shiftby2 = shiftby1; + } + + if ((o1 && variable_material(o1->material.which_subclass)) || + (o2 && variable_material(o2->material.which_subclass)) || + ((variable_material(default_material.which_subclass) || d->epsilon_file_func) && + (!o1 || !o2 || o1->material.which_subclass == MATERIAL_TYPE_SELF || + o2->material.which_subclass == MATERIAL_TYPE_SELF))) + return 0; /* arbitrary material functions are non-analyzable */ + + material_epsilon(mat1, meps, meps_inv); + + /* check for trivial case of only one object/material */ + if (id1 == id2 || material_type_equal(&mat1, &mat2)) { + n[0] = n[1] = n[2] = 0; + return 1; + } + + if (id1 > id2) + normal = normal_to_fixed_object(vector3_minus(p, shiftby1), *o1); + else + normal = normal_to_fixed_object(vector3_minus(p, shiftby2), *o2); + + n[0] = no_size_x ? 0 : normal.x / geometry_lattice.size.x; + n[1] = no_size_y ? 0 : normal.y / geometry_lattice.size.y; + n[2] = no_size_z ? 0 : normal.z / geometry_lattice.size.z; + + pixel.low.x = p.x - d1; + pixel.high.x = p.x + d1; + pixel.low.y = p.y - d2; + pixel.high.y = p.y + d2; + pixel.low.z = p.z - d3; + pixel.high.z = p.z + d3; + + tol = tol > 0.01 ? 0.01 : tol; + if (id1 > id2) { + pixel.low = vector3_minus(pixel.low, shiftby1); + pixel.high = vector3_minus(pixel.high, shiftby1); + fill = box_overlap_with_object(pixel, *o1, tol, 100 / tol); + } + else { + pixel.low = vector3_minus(pixel.low, shiftby2); + pixel.high = vector3_minus(pixel.high, shiftby2); + fill = 1 - box_overlap_with_object(pixel, *o2, tol, 100 / tol); + } + + { + symmetric_matrix eps2, epsinv2; + symmetric_matrix eps1, delta; + double Rot[3][3], norm, n0, n1, n2; + material_epsilon(mat2, &eps2, &epsinv2); + eps1 = *meps; + + /* make Cartesian orthonormal frame relative to interface */ + n0 = R[0][0] * n[0] + R[1][0] * n[1] + R[2][0] * n[2]; + n1 = R[0][1] * n[0] + R[1][1] * n[1] + R[2][1] * n[2]; + n2 = R[0][2] * n[0] + R[1][2] * n[1] + R[2][2] * n[2]; + norm = sqrt(n0 * n0 + n1 * n1 + n2 * n2); + if (norm == 0.0) return 0; + norm = 1.0 / norm; + Rot[0][0] = n0 = n0 * norm; + Rot[1][0] = n1 = n1 * norm; + Rot[2][0] = n2 = n2 * norm; + if (fabs(n0) > 1e-2 || fabs(n1) > 1e-2) { /* (z x n) */ + Rot[0][2] = n1; + Rot[1][2] = -n0; + Rot[2][2] = 0; + } + else { /* n is ~ parallel to z direction, use (x x n) instead */ + Rot[0][2] = 0; + Rot[1][2] = -n2; + Rot[2][2] = n1; + } + { /* normalize second column */ + double s = Rot[0][2] * Rot[0][2] + Rot[1][2] * Rot[1][2] + Rot[2][2] * Rot[2][2]; + s = 1.0 / sqrt(s); + Rot[0][2] *= s; + Rot[1][2] *= s; + Rot[2][2] *= s; + } + /* 1st column is 2nd column x 0th column */ + Rot[0][1] = Rot[1][2] * Rot[2][0] - Rot[2][2] * Rot[1][0]; + Rot[1][1] = Rot[2][2] * Rot[0][0] - Rot[0][2] * Rot[2][0]; + Rot[2][1] = Rot[0][2] * Rot[1][0] - Rot[1][2] * Rot[0][0]; + + /* rotate epsilon tensors to surface parallel/perpendicular axes */ + maxwell_sym_matrix_rotate(&eps1, &eps1, Rot); + maxwell_sym_matrix_rotate(&eps2, &eps2, Rot); + +#define AVG (fill * (EXPR(eps1)) + (1 - fill) * (EXPR(eps2))) #define EXPR(eps) (-1 / eps.m00) - delta.m00 = AVG; + delta.m00 = AVG; #undef EXPR #define EXPR(eps) (eps.m11 - ESCALAR_NORMSQR(eps.m01) / eps.m00) - delta.m11 = AVG; + delta.m11 = AVG; #undef EXPR #define EXPR(eps) (eps.m22 - ESCALAR_NORMSQR(eps.m02) / eps.m00) - delta.m22 = AVG; + delta.m22 = AVG; #undef EXPR #define EXPR(eps) (ESCALAR_RE(eps.m01) / eps.m00) - ESCALAR_RE(delta.m01) = AVG; + ESCALAR_RE(delta.m01) = AVG; #undef EXPR #define EXPR(eps) (ESCALAR_RE(eps.m02) / eps.m00) - ESCALAR_RE(delta.m02) = AVG; + ESCALAR_RE(delta.m02) = AVG; #undef EXPR #define EXPR(eps) (ESCALAR_RE(eps.m12) - ESCALAR_MULT_CONJ_RE(eps.m02, eps.m01) / eps.m00) - ESCALAR_RE(delta.m12) = AVG; + ESCALAR_RE(delta.m12) = AVG; #undef EXPR #ifdef WITH_HERMITIAN_EPSILON -# define EXPR(eps) (ESCALAR_IM(eps.m01) / eps.m00) - ESCALAR_IM(delta.m01) = AVG; -# undef EXPR -# define EXPR(eps) (ESCALAR_IM(eps.m02) / eps.m00) - ESCALAR_IM(delta.m02) = AVG; -# undef EXPR -# define EXPR(eps) (ESCALAR_IM(eps.m12) - ESCALAR_MULT_CONJ_IM(eps.m02, eps.m01) / eps.m00) - ESCALAR_IM(delta.m12) = AVG; -# undef EXPR +#define EXPR(eps) (ESCALAR_IM(eps.m01) / eps.m00) + ESCALAR_IM(delta.m01) = AVG; +#undef EXPR +#define EXPR(eps) (ESCALAR_IM(eps.m02) / eps.m00) + ESCALAR_IM(delta.m02) = AVG; +#undef EXPR +#define EXPR(eps) (ESCALAR_IM(eps.m12) - ESCALAR_MULT_CONJ_IM(eps.m02, eps.m01) / eps.m00) + ESCALAR_IM(delta.m12) = AVG; +#undef EXPR #endif /* WITH_HERMITIAN_EPSILON */ - meps->m00 = -1/delta.m00; - meps->m11 = delta.m11 - ESCALAR_NORMSQR(delta.m01) / delta.m00; - meps->m22 = delta.m22 - ESCALAR_NORMSQR(delta.m02) / delta.m00; - ASSIGN_ESCALAR(meps->m01, -ESCALAR_RE(delta.m01)/delta.m00, - -ESCALAR_IM(delta.m01)/delta.m00); - ASSIGN_ESCALAR(meps->m02, -ESCALAR_RE(delta.m02)/delta.m00, - -ESCALAR_IM(delta.m02)/delta.m00); - ASSIGN_ESCALAR(meps->m12, - ESCALAR_RE(delta.m12) - - ESCALAR_MULT_CONJ_RE(delta.m02, delta.m01)/delta.m00, - ESCALAR_IM(delta.m12) - - ESCALAR_MULT_CONJ_IM(delta.m02, delta.m01)/delta.m00); - -#define SWAP(a,b) { double xxx = a; a = b; b = xxx; } - /* invert rotation matrix = transpose */ - SWAP(Rot[0][1], Rot[1][0]); - SWAP(Rot[0][2], Rot[2][0]); - SWAP(Rot[2][1], Rot[1][2]); - maxwell_sym_matrix_rotate(meps, meps, Rot); /* rotate back */ + meps->m00 = -1 / delta.m00; + meps->m11 = delta.m11 - ESCALAR_NORMSQR(delta.m01) / delta.m00; + meps->m22 = delta.m22 - ESCALAR_NORMSQR(delta.m02) / delta.m00; + ASSIGN_ESCALAR(meps->m01, -ESCALAR_RE(delta.m01) / delta.m00, + -ESCALAR_IM(delta.m01) / delta.m00); + ASSIGN_ESCALAR(meps->m02, -ESCALAR_RE(delta.m02) / delta.m00, + -ESCALAR_IM(delta.m02) / delta.m00); + ASSIGN_ESCALAR(meps->m12, + ESCALAR_RE(delta.m12) - ESCALAR_MULT_CONJ_RE(delta.m02, delta.m01) / delta.m00, + ESCALAR_IM(delta.m12) - ESCALAR_MULT_CONJ_IM(delta.m02, delta.m01) / delta.m00); + +#define SWAP(a, b) \ + { \ + double xxx = a; \ + a = b; \ + b = xxx; \ + } + /* invert rotation matrix = transpose */ + SWAP(Rot[0][1], Rot[1][0]); + SWAP(Rot[0][2], Rot[2][0]); + SWAP(Rot[2][1], Rot[1][2]); + maxwell_sym_matrix_rotate(meps, meps, Rot); /* rotate back */ #undef SWAP -# ifdef DEBUG - CHECK(negative_epsilon_okp - || maxwell_sym_matrix_positive_definite(meps), - "negative mean epsilon from Kottke algorithm"); -# endif - } +#ifdef DEBUG + CHECK(negative_epsilon_okp || maxwell_sym_matrix_positive_definite(meps), + "negative mean epsilon from Kottke algorithm"); +#endif + } - return 1; + return 1; } /****************************************************************************/ /* epsilon output functions (see also fields.c) */ /* get the epsilon function, and compute some statistics */ -void get_epsilon(void) -{ - int i, N, last_dim, last_dim_stored, nx, nz, local_y_start; - real *epsilon; - real eps_mean = 0, eps_inv_mean = 0, eps_high = -1e20, eps_low = 1e20; - int fill_count = 0; - - if (!mdata) { - mpi_one_fprintf(stderr, - "init-params must be called before get-epsilon!\n"); - return; - } - - curfield = (scalar_complex *) mdata->fft_data; - epsilon = (real *) curfield; - curfield_band = 0; - curfield_type = epsilon_CURFIELD_TYPE; - - /* get epsilon. Recall that we actually have an inverse - dielectric tensor at each point; define an average index by - the inverse of the average eigenvalue of the 1/eps tensor. - i.e. 3/(trace 1/eps). */ - - N = mdata->fft_output_size; - last_dim = mdata->last_dim; - last_dim_stored = - mdata->last_dim_size / (sizeof(scalar_complex)/sizeof(scalar)); - nx = mdata->nx; nz = mdata->nz; local_y_start = mdata->local_y_start; - - for (i = 0; i < N; ++i) { - if (mdata->eps_inv == NULL) - epsilon[i] = 1.0; - else - epsilon[i] = mean_medium_from_matrix(mdata->eps_inv + i); - if (epsilon[i] < eps_low) - eps_low = epsilon[i]; - if (epsilon[i] > eps_high) - eps_high = epsilon[i]; - eps_mean += epsilon[i]; - eps_inv_mean += 1/epsilon[i]; - if (epsilon[i] > 1.0001) - ++fill_count; +void get_epsilon(void) { + int i, N, last_dim, last_dim_stored, nx, nz, local_y_start; + real *epsilon; + real eps_mean = 0, eps_inv_mean = 0, eps_high = -1e20, eps_low = 1e20; + int fill_count = 0; + + if (!mdata) { + mpi_one_fprintf(stderr, "init-params must be called before get-epsilon!\n"); + return; + } + + curfield = (scalar_complex *)mdata->fft_data; + epsilon = (real *)curfield; + curfield_band = 0; + curfield_type = epsilon_CURFIELD_TYPE; + + /* get epsilon. Recall that we actually have an inverse + dielectric tensor at each point; define an average index by + the inverse of the average eigenvalue of the 1/eps tensor. + i.e. 3/(trace 1/eps). */ + + N = mdata->fft_output_size; + last_dim = mdata->last_dim; + last_dim_stored = mdata->last_dim_size / (sizeof(scalar_complex) / sizeof(scalar)); + nx = mdata->nx; + nz = mdata->nz; + local_y_start = mdata->local_y_start; + + for (i = 0; i < N; ++i) { + if (mdata->eps_inv == NULL) + epsilon[i] = 1.0; + else + epsilon[i] = mean_medium_from_matrix(mdata->eps_inv + i); + if (epsilon[i] < eps_low) eps_low = epsilon[i]; + if (epsilon[i] > eps_high) eps_high = epsilon[i]; + eps_mean += epsilon[i]; + eps_inv_mean += 1 / epsilon[i]; + if (epsilon[i] > 1.0001) ++fill_count; #ifndef SCALAR_COMPLEX - /* most points need to be counted twice, by rfftw output symmetry: */ - { - int last_index; -# ifdef HAVE_MPI - if (nz == 1) /* 2d calculation: 1st dim. is truncated one */ - last_index = i / nx + local_y_start; - else - last_index = i % last_dim_stored; -# else - last_index = i % last_dim_stored; -# endif - if (last_index != 0 && 2*last_index != last_dim) { - eps_mean += epsilon[i]; - eps_inv_mean += 1/epsilon[i]; - if (epsilon[i] > 1.0001) - ++fill_count; - } - } + /* most points need to be counted twice, by rfftw output symmetry: */ + { + int last_index; +#ifdef HAVE_MPI + if (nz == 1) /* 2d calculation: 1st dim. is truncated one */ + last_index = i / nx + local_y_start; + else + last_index = i % last_dim_stored; +#else + last_index = i % last_dim_stored; #endif - } - - mpi_allreduce_1(&eps_mean, real, SCALAR_MPI_TYPE, - MPI_SUM, mpb_comm); - mpi_allreduce_1(&eps_inv_mean, real, SCALAR_MPI_TYPE, - MPI_SUM, mpb_comm); - mpi_allreduce_1(&eps_low, real, SCALAR_MPI_TYPE, - MPI_MIN, mpb_comm); - mpi_allreduce_1(&eps_high, real, SCALAR_MPI_TYPE, - MPI_MAX, mpb_comm); - mpi_allreduce_1(&fill_count, int, MPI_INT, - MPI_SUM, mpb_comm); - N = mdata->nx * mdata->ny * mdata->nz; - eps_mean /= N; - eps_inv_mean = N/eps_inv_mean; - - mpi_one_printf("epsilon: %g-%g, mean %g, harm. mean %g, " - "%g%% > 1, %g%% \"fill\"\n", - eps_low, eps_high, eps_mean, eps_inv_mean, - (100.0 * fill_count) / N, - eps_high == eps_low ? 100.0 : - 100.0 * (eps_mean-eps_low) / (eps_high-eps_low)); + if (last_index != 0 && 2 * last_index != last_dim) { + eps_mean += epsilon[i]; + eps_inv_mean += 1 / epsilon[i]; + if (epsilon[i] > 1.0001) ++fill_count; + } + } +#endif + } + + mpi_allreduce_1(&eps_mean, real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm); + mpi_allreduce_1(&eps_inv_mean, real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm); + mpi_allreduce_1(&eps_low, real, SCALAR_MPI_TYPE, MPI_MIN, mpb_comm); + mpi_allreduce_1(&eps_high, real, SCALAR_MPI_TYPE, MPI_MAX, mpb_comm); + mpi_allreduce_1(&fill_count, int, MPI_INT, MPI_SUM, mpb_comm); + N = mdata->nx * mdata->ny * mdata->nz; + eps_mean /= N; + eps_inv_mean = N / eps_inv_mean; + + mpi_one_printf("epsilon: %g-%g, mean %g, harm. mean %g, " + "%g%% > 1, %g%% \"fill\"\n", + eps_low, eps_high, eps_mean, eps_inv_mean, (100.0 * fill_count) / N, + eps_high == eps_low ? 100.0 : 100.0 * (eps_mean - eps_low) / (eps_high - eps_low)); } /* get the specified component of the dielectric tensor, or the inverse tensor if inv != 0 */ -void get_epsilon_tensor(int c1, int c2, int imag, int inv) -{ - int i, N; - real *epsilon; - int conj = 0, offset = 0; - - curfield_type = '-'; /* only used internally, for now */ - epsilon = (real *) mdata->fft_data; - N = mdata->fft_output_size; - - switch (c1 * 3 + c2) { - case 0: - offset = offsetof(symmetric_matrix, m00); - break; - case 1: - offset = offsetof(symmetric_matrix, m01); - break; - case 2: - offset = offsetof(symmetric_matrix, m02); - break; - case 3: - offset = offsetof(symmetric_matrix, m01); /* = conj(m10) */ - conj = imag; - break; - case 4: - offset = offsetof(symmetric_matrix, m11); - break; - case 5: - offset = offsetof(symmetric_matrix, m12); - break; - case 6: - offset = offsetof(symmetric_matrix, m02); /* = conj(m20) */ - conj = imag; - break; - case 7: - offset = offsetof(symmetric_matrix, m12); /* = conj(m21) */ - conj = imag; - break; - case 8: - offset = offsetof(symmetric_matrix, m22); - break; - } +void get_epsilon_tensor(int c1, int c2, int imag, int inv) { + int i, N; + real *epsilon; + int conj = 0, offset = 0; + + curfield_type = '-'; /* only used internally, for now */ + epsilon = (real *)mdata->fft_data; + N = mdata->fft_output_size; + + switch (c1 * 3 + c2) { + case 0: offset = offsetof(symmetric_matrix, m00); break; + case 1: offset = offsetof(symmetric_matrix, m01); break; + case 2: offset = offsetof(symmetric_matrix, m02); break; + case 3: + offset = offsetof(symmetric_matrix, m01); /* = conj(m10) */ + conj = imag; + break; + case 4: offset = offsetof(symmetric_matrix, m11); break; + case 5: offset = offsetof(symmetric_matrix, m12); break; + case 6: + offset = offsetof(symmetric_matrix, m02); /* = conj(m20) */ + conj = imag; + break; + case 7: + offset = offsetof(symmetric_matrix, m12); /* = conj(m21) */ + conj = imag; + break; + case 8: offset = offsetof(symmetric_matrix, m22); break; + } #ifdef WITH_HERMITIAN_EPSILON - if (c1 != c2 && imag) - offset += offsetof(scalar_complex, im); + if (c1 != c2 && imag) offset += offsetof(scalar_complex, im); #endif - for (i = 0; i < N; ++i) { - if (inv) { - epsilon[i] = - *((real *) (((char *) &mdata->eps_inv[i]) + offset)); - } - else { - symmetric_matrix eps; - maxwell_sym_matrix_invert(&eps, &mdata->eps_inv[i]); - epsilon[i] = *((real *) (((char *) &eps) + offset)); - } - if (conj) - epsilon[i] = -epsilon[i]; - } + for (i = 0; i < N; ++i) { + if (inv) { epsilon[i] = *((real *)(((char *)&mdata->eps_inv[i]) + offset)); } + else { + symmetric_matrix eps; + maxwell_sym_matrix_invert(&eps, &mdata->eps_inv[i]); + epsilon[i] = *((real *)(((char *)&eps) + offset)); + } + if (conj) epsilon[i] = -epsilon[i]; + } } - diff --git a/mpb/epsilon_file.c b/mpb/epsilon_file.c index 57747808..0288e43f 100644 --- a/mpb/epsilon_file.c +++ b/mpb/epsilon_file.c @@ -41,137 +41,150 @@ #include "mpb.h" typedef struct { - int nx, ny, nz; - real *data; + int nx, ny, nz; + real *data; } epsilon_file_data; /* Linearly interpolate a given point in a 3d grid of data. The point coordinates should be in the range [0,1], or at the very least [-1,2] ... anything outside [0,1] is *mirror* reflected into [0,1] */ -real linear_interpolate(real rx, real ry, real rz, - real *data, int nx, int ny, int nz, int stride) -{ - int x, y, z, x2, y2, z2; - real dx, dy, dz; - - /* mirror boundary conditions for r just beyond the boundary */ - if (rx < 0.0) rx = -rx; else if (rx > 1.0) rx = 1.0 - rx; - if (ry < 0.0) ry = -ry; else if (ry > 1.0) ry = 1.0 - ry; - if (rz < 0.0) rz = -rz; else if (rz > 1.0) rz = 1.0 - rz; - - /* get the point corresponding to r in the epsilon array grid: */ - x = rx * nx; if (x == nx) --x; - y = ry * ny; if (y == ny) --y; - z = rz * nz; if (z == nz) --z; - - /* get the difference between (x,y,z) and the actual point - ... we shift by 0.5 to center the data points in the pixels */ - dx = rx * nx - x - 0.5; - dy = ry * ny - y - 0.5; - dz = rz * nz - z - 0.5; - - /* get the other closest point in the grid, with mirror boundaries: */ - x2 = (dx >= 0.0 ? x + 1 : x - 1); - if (x2 < 0) x2++; else if (x2 == nx) x2--; - y2 = (dy >= 0.0 ? y + 1 : y - 1); - if (y2 < 0) y2++; else if (y2 == ny) y2--; - z2 = (dz >= 0.0 ? z + 1 : z - 1); - if (z2 < 0) z2++; else if (z2 == nz) z2--; - - /* take abs(d{xyz}) to get weights for {xyz} and {xyz}2: */ - dx = fabs(dx); - dy = fabs(dy); - dz = fabs(dz); - - /* define a macro to give us data(x,y,z) on the grid, - in row-major order (the order used by HDF5): */ -#define D(x,y,z) (data[(((x)*ny + (y))*nz + (z)) * stride]) - - return(((D(x,y,z)*(1.0-dx) + D(x2,y,z)*dx) * (1.0-dy) + - (D(x,y2,z)*(1.0-dx) + D(x2,y2,z)*dx) * dy) * (1.0-dz) + - ((D(x,y,z2)*(1.0-dx) + D(x2,y,z2)*dx) * (1.0-dy) + - (D(x,y2,z2)*(1.0-dx) + D(x2,y2,z2)*dx) * dy) * dz); +real linear_interpolate(real rx, real ry, real rz, real *data, int nx, int ny, int nz, int stride) { + int x, y, z, x2, y2, z2; + real dx, dy, dz; + + /* mirror boundary conditions for r just beyond the boundary */ + if (rx < 0.0) + rx = -rx; + else if (rx > 1.0) + rx = 1.0 - rx; + if (ry < 0.0) + ry = -ry; + else if (ry > 1.0) + ry = 1.0 - ry; + if (rz < 0.0) + rz = -rz; + else if (rz > 1.0) + rz = 1.0 - rz; + + /* get the point corresponding to r in the epsilon array grid: */ + x = rx * nx; + if (x == nx) --x; + y = ry * ny; + if (y == ny) --y; + z = rz * nz; + if (z == nz) --z; + + /* get the difference between (x,y,z) and the actual point + ... we shift by 0.5 to center the data points in the pixels */ + dx = rx * nx - x - 0.5; + dy = ry * ny - y - 0.5; + dz = rz * nz - z - 0.5; + + /* get the other closest point in the grid, with mirror boundaries: */ + x2 = (dx >= 0.0 ? x + 1 : x - 1); + if (x2 < 0) + x2++; + else if (x2 == nx) + x2--; + y2 = (dy >= 0.0 ? y + 1 : y - 1); + if (y2 < 0) + y2++; + else if (y2 == ny) + y2--; + z2 = (dz >= 0.0 ? z + 1 : z - 1); + if (z2 < 0) + z2++; + else if (z2 == nz) + z2--; + + /* take abs(d{xyz}) to get weights for {xyz} and {xyz}2: */ + dx = fabs(dx); + dy = fabs(dy); + dz = fabs(dz); + + /* define a macro to give us data(x,y,z) on the grid, + in row-major order (the order used by HDF5): */ +#define D(x, y, z) (data[(((x)*ny + (y)) * nz + (z)) * stride]) + + return (((D(x, y, z) * (1.0 - dx) + D(x2, y, z) * dx) * (1.0 - dy) + + (D(x, y2, z) * (1.0 - dx) + D(x2, y2, z) * dx) * dy) * + (1.0 - dz) + + ((D(x, y, z2) * (1.0 - dx) + D(x2, y, z2) * dx) * (1.0 - dy) + + (D(x, y2, z2) * (1.0 - dx) + D(x2, y2, z2) * dx) * dy) * + dz); #undef D } -static void epsilon_file_func(symmetric_matrix *eps, symmetric_matrix *eps_inv, - const real r[3], void *edata) -{ - epsilon_file_data *d = (epsilon_file_data *) edata; - real rx, ry, rz; - real eps_val; - - /* make sure r is positive: */ - rx = r[0] >= 0.0 ? r[0] : (r[0] + (1 + (int) (-r[0]))); - ry = r[1] >= 0.0 ? r[1] : (r[1] + (1 + (int) (-r[1]))); - rz = r[2] >= 0.0 ? r[2] : (r[2] + (1 + (int) (-r[2]))); - - /* make sure r is in [0,1) */ - rx = rx < 1.0 ? rx : rx - ((int) rx); - ry = ry < 1.0 ? ry : ry - ((int) ry); - rz = rz < 1.0 ? rz : rz - ((int) rz); - - eps_val = linear_interpolate(rx,ry,rz, d->data, d->nx,d->ny,d->nz, 1); - eps->m00 = eps->m11 = eps->m22 = eps_val; - eps_inv->m00 = eps_inv->m11 = eps_inv->m22 = 1.0 / eps_val; +static void epsilon_file_func(symmetric_matrix *eps, symmetric_matrix *eps_inv, const real r[3], + void *edata) { + epsilon_file_data *d = (epsilon_file_data *)edata; + real rx, ry, rz; + real eps_val; + + /* make sure r is positive: */ + rx = r[0] >= 0.0 ? r[0] : (r[0] + (1 + (int)(-r[0]))); + ry = r[1] >= 0.0 ? r[1] : (r[1] + (1 + (int)(-r[1]))); + rz = r[2] >= 0.0 ? r[2] : (r[2] + (1 + (int)(-r[2]))); + + /* make sure r is in [0,1) */ + rx = rx < 1.0 ? rx : rx - ((int)rx); + ry = ry < 1.0 ? ry : ry - ((int)ry); + rz = rz < 1.0 ? rz : rz - ((int)rz); + + eps_val = linear_interpolate(rx, ry, rz, d->data, d->nx, d->ny, d->nz, 1); + eps->m00 = eps->m11 = eps->m22 = eps_val; + eps_inv->m00 = eps_inv->m11 = eps_inv->m22 = 1.0 / eps_val; #ifdef WITH_HERMITIAN_EPSILON - CASSIGN_ZERO(eps->m01); - CASSIGN_ZERO(eps->m02); - CASSIGN_ZERO(eps->m12); - CASSIGN_ZERO(eps_inv->m01); - CASSIGN_ZERO(eps_inv->m02); - CASSIGN_ZERO(eps_inv->m12); + CASSIGN_ZERO(eps->m01); + CASSIGN_ZERO(eps->m02); + CASSIGN_ZERO(eps->m12); + CASSIGN_ZERO(eps_inv->m01); + CASSIGN_ZERO(eps_inv->m02); + CASSIGN_ZERO(eps_inv->m12); #else - eps->m01 = eps->m02 = eps->m12 = 0.0; - eps_inv->m01 = eps_inv->m02 = eps_inv->m12 = 0.0; + eps->m01 = eps->m02 = eps->m12 = 0.0; + eps_inv->m01 = eps_inv->m02 = eps_inv->m12 = 0.0; #endif } -void get_epsilon_file_func(const char *fname, - maxwell_dielectric_function *func, - void **func_data) -{ - if (fname && fname[0]) { - char *eps_fname; - matrixio_id file_id; - epsilon_file_data *d; - int rank = 3, dims[3]; - - CHK_MALLOC(d, epsilon_file_data, 1); - - eps_fname = ctl_fix_path(fname); - mpi_one_printf("Using background dielectric from file \"%s\"...\n", - eps_fname); - file_id = matrixio_open(eps_fname, 1); - free(eps_fname); - - d->data = matrixio_read_real_data(file_id, NULL, &rank, dims, - 0,0,0, NULL); - CHECK(d->data, "couldn't find dataset in dielectric file"); - matrixio_close(file_id); - - d->nx = rank >= 1 ? dims[0] : 1; - d->ny = rank >= 2 ? dims[1] : 1; - d->nz = rank >= 3 ? dims[2] : 1; - - mpi_one_printf(" ...read %dx%dx%d dielectric function\n", - d->nx, d->ny, d->nz); - - *func = epsilon_file_func; - *func_data = (void*) d; - } - else { - *func = NULL; - *func_data = NULL; - } +void get_epsilon_file_func(const char *fname, maxwell_dielectric_function *func, void **func_data) { + if (fname && fname[0]) { + char *eps_fname; + matrixio_id file_id; + epsilon_file_data *d; + int rank = 3, dims[3]; + + CHK_MALLOC(d, epsilon_file_data, 1); + + eps_fname = ctl_fix_path(fname); + mpi_one_printf("Using background dielectric from file \"%s\"...\n", eps_fname); + file_id = matrixio_open(eps_fname, 1); + free(eps_fname); + + d->data = matrixio_read_real_data(file_id, NULL, &rank, dims, 0, 0, 0, NULL); + CHECK(d->data, "couldn't find dataset in dielectric file"); + matrixio_close(file_id); + + d->nx = rank >= 1 ? dims[0] : 1; + d->ny = rank >= 2 ? dims[1] : 1; + d->nz = rank >= 3 ? dims[2] : 1; + + mpi_one_printf(" ...read %dx%dx%d dielectric function\n", d->nx, d->ny, d->nz); + + *func = epsilon_file_func; + *func_data = (void *)d; + } + else { + *func = NULL; + *func_data = NULL; + } } -void destroy_epsilon_file_func_data(void *func_data) -{ - epsilon_file_data *d = (epsilon_file_data *) func_data; - if (d) { - free(d->data); - free(d); - } +void destroy_epsilon_file_func_data(void *func_data) { + epsilon_file_data *d = (epsilon_file_data *)func_data; + if (d) { + free(d->data); + free(d); + } } diff --git a/mpb/field-smob.c b/mpb/field-smob.c index 2e568f3e..878e0c35 100644 --- a/mpb/field-smob.c +++ b/mpb/field-smob.c @@ -30,602 +30,540 @@ #ifndef HAVE_SCM_MAKE_SMOB_TYPE /* null mark function, for smobs containing no SCM objects */ -static SCM mark_null(SCM obj) { (void) obj; return SCM_BOOL_F; } +static SCM mark_null(SCM obj) { + (void)obj; + return SCM_BOOL_F; +} #endif /*************************************************************************/ long scm_tc16_smob_field_smob = 0; -static SCM field_p(SCM obj) -{ - return ctl_convert_boolean_to_scm(FIELD_P(obj)); -} - -static SCM rscalar_field_p(SCM obj) -{ - return ctl_convert_boolean_to_scm(RSCALAR_FIELD_P(obj)); -} - -static SCM cscalar_field_p(SCM obj) -{ - return ctl_convert_boolean_to_scm(CSCALAR_FIELD_P(obj)); -} - -static SCM cvector_field_p(SCM obj) -{ - return ctl_convert_boolean_to_scm(CVECTOR_FIELD_P(obj)); -} - -static int print_field_smob(SCM obj, SCM port, scm_print_state *pstate) -{ - char buf[256]; - field_smob *pf = FIELD(obj); - (void) pstate; /* unused argument */ - - scm_puts("#nx, pf->ny, pf->nz); - scm_puts(buf, port); - switch (pf->type) { - case RSCALAR_FIELD_SMOB: - scm_puts(" real scalar field", port); - break; - case CSCALAR_FIELD_SMOB: - scm_puts(" complex scalar field", port); - break; - case CVECTOR_FIELD_SMOB: - scm_puts(" complex vector field", port); - break; - } - if (pf->local_ny < pf->ny) { - sprintf(buf, ", y=%d-%d local", - pf->local_y_start, pf->local_y_start + pf->local_ny - 1); - scm_puts(buf, port); - } - scm_putc('>', port); - return 1; +static SCM field_p(SCM obj) { return ctl_convert_boolean_to_scm(FIELD_P(obj)); } + +static SCM rscalar_field_p(SCM obj) { return ctl_convert_boolean_to_scm(RSCALAR_FIELD_P(obj)); } + +static SCM cscalar_field_p(SCM obj) { return ctl_convert_boolean_to_scm(CSCALAR_FIELD_P(obj)); } + +static SCM cvector_field_p(SCM obj) { return ctl_convert_boolean_to_scm(CVECTOR_FIELD_P(obj)); } + +static int print_field_smob(SCM obj, SCM port, scm_print_state *pstate) { + char buf[256]; + field_smob *pf = FIELD(obj); + (void)pstate; /* unused argument */ + + scm_puts("#nx, pf->ny, pf->nz); + scm_puts(buf, port); + switch (pf->type) { + case RSCALAR_FIELD_SMOB: scm_puts(" real scalar field", port); break; + case CSCALAR_FIELD_SMOB: scm_puts(" complex scalar field", port); break; + case CVECTOR_FIELD_SMOB: scm_puts(" complex vector field", port); break; + } + if (pf->local_ny < pf->ny) { + sprintf(buf, ", y=%d-%d local", pf->local_y_start, pf->local_y_start + pf->local_ny - 1); + scm_puts(buf, port); + } + scm_putc('>', port); + return 1; } -static size_t free_field_smob(SCM obj) -{ - field_smob *pf = FIELD(obj); - free(pf->f.rs); - free(pf); - return 0; +static size_t free_field_smob(SCM obj) { + field_smob *pf = FIELD(obj); + free(pf->f.rs); + free(pf); + return 0; } #define mark_field_smob mark_null -SCM field2scm(field_smob *pf) -{ - SCM obj; - NEWCELL_SMOB(obj, field_smob, pf); - return obj; +SCM field2scm(field_smob *pf) { + SCM obj; + NEWCELL_SMOB(obj, field_smob, pf); + return obj; } /*************************************************************************/ -void register_field_smobs(void) -{ +void register_field_smobs(void) { #ifdef HAVE_SCM_MAKE_SMOB_TYPE - scm_tc16_smob_field_smob = scm_make_smob_type("field", 0); - scm_set_smob_free(scm_tc16_smob_field_smob, free_field_smob); - scm_set_smob_print(scm_tc16_smob_field_smob, print_field_smob); + scm_tc16_smob_field_smob = scm_make_smob_type("field", 0); + scm_set_smob_free(scm_tc16_smob_field_smob, free_field_smob); + scm_set_smob_print(scm_tc16_smob_field_smob, print_field_smob); #else /* old way to register smobs */ - MAKE_SMOBFUNS(field_smob); - REGISTER_SMOBFUNS(field_smob); + MAKE_SMOBFUNS(field_smob); + REGISTER_SMOBFUNS(field_smob); #endif - gh_new_procedure("field?", field_p, 1, 0, 0); - gh_new_procedure("rscalar-field?", rscalar_field_p, 1, 0, 0); - gh_new_procedure("cscalar-field?", cscalar_field_p, 1, 0, 0); - gh_new_procedure("cvector-field?", cvector_field_p, 1, 0, 0); + gh_new_procedure("field?", field_p, 1, 0, 0); + gh_new_procedure("rscalar-field?", rscalar_field_p, 1, 0, 0); + gh_new_procedure("cscalar-field?", cscalar_field_p, 1, 0, 0); + gh_new_procedure("cvector-field?", cvector_field_p, 1, 0, 0); } /*************************************************************************/ static field_smob curfield_smob; -field_smob *update_curfield_smob(void) -{ - CHECK(mdata, "init-params must be called before manipulating fields"); - curfield_smob.nx = mdata->nx; - curfield_smob.ny = mdata->ny; - curfield_smob.nz = mdata->nz; - curfield_smob.N = mdata->fft_output_size; - curfield_smob.local_ny = mdata->local_ny; - curfield_smob.local_y_start = mdata->local_y_start; - curfield_smob.last_dim = mdata->last_dim; - curfield_smob.last_dim_size = mdata->last_dim_size; - curfield_smob.other_dims = mdata->other_dims; - curfield_smob.type_char = curfield_type; - if (strchr("dhbecv", curfield_type)) { /* complex vector field */ - curfield_smob.type = CVECTOR_FIELD_SMOB; - curfield_smob.f.cv = curfield; - } - else if (strchr("DHBnR", curfield_type)) { /* real scalar field */ - curfield_smob.type = RSCALAR_FIELD_SMOB; - curfield_smob.f.rs = (real *) curfield; - } - else if (strchr("C", curfield_type)) { /* complex scalar field */ - curfield_smob.type = CSCALAR_FIELD_SMOB; - curfield_smob.f.cs = curfield; - } - else { - curfield_smob.type = RSCALAR_FIELD_SMOB; /* arbitrary */ - curfield_smob.f.rs = (real *) curfield; - if (!curfield_smob.f.rs) - curfield_smob.f.rs = (real *) mdata->fft_data; - return 0; - } - return &curfield_smob; +field_smob *update_curfield_smob(void) { + CHECK(mdata, "init-params must be called before manipulating fields"); + curfield_smob.nx = mdata->nx; + curfield_smob.ny = mdata->ny; + curfield_smob.nz = mdata->nz; + curfield_smob.N = mdata->fft_output_size; + curfield_smob.local_ny = mdata->local_ny; + curfield_smob.local_y_start = mdata->local_y_start; + curfield_smob.last_dim = mdata->last_dim; + curfield_smob.last_dim_size = mdata->last_dim_size; + curfield_smob.other_dims = mdata->other_dims; + curfield_smob.type_char = curfield_type; + if (strchr("dhbecv", curfield_type)) { /* complex vector field */ + curfield_smob.type = CVECTOR_FIELD_SMOB; + curfield_smob.f.cv = curfield; + } + else if (strchr("DHBnR", curfield_type)) { /* real scalar field */ + curfield_smob.type = RSCALAR_FIELD_SMOB; + curfield_smob.f.rs = (real *)curfield; + } + else if (strchr("C", curfield_type)) { /* complex scalar field */ + curfield_smob.type = CSCALAR_FIELD_SMOB; + curfield_smob.f.cs = curfield; + } + else { + curfield_smob.type = RSCALAR_FIELD_SMOB; /* arbitrary */ + curfield_smob.f.rs = (real *)curfield; + if (!curfield_smob.f.rs) curfield_smob.f.rs = (real *)mdata->fft_data; + return 0; + } + return &curfield_smob; } -static void update_curfield(field_smob *pf) -{ - if (pf == &curfield_smob) { - curfield_type = curfield_smob.type_char; - curfield = curfield_smob.f.cv; - } +static void update_curfield(field_smob *pf) { + if (pf == &curfield_smob) { + curfield_type = curfield_smob.type_char; + curfield = curfield_smob.f.cv; + } } -boolean cur_fieldp(SCM obj) -{ - if (SCM_NIMP(obj) && SCM_SYMBOLP(obj)) { - char *s = ctl_symbol2newstr(obj); - int ret = !strcmp(s, "cur-field"); - free(s); - return ret; - } - return 0; +boolean cur_fieldp(SCM obj) { + if (SCM_NIMP(obj) && SCM_SYMBOLP(obj)) { + char *s = ctl_symbol2newstr(obj); + int ret = !strcmp(s, "cur-field"); + free(s); + return ret; + } + return 0; } /*************************************************************************/ -field_smob *assert_field_smob(SCM fo) -{ - field_smob *f = SAFE_FIELD(fo); - CHECK(f, "wrong type argument: expecting field"); - return f; +field_smob *assert_field_smob(SCM fo) { + field_smob *f = SAFE_FIELD(fo); + CHECK(f, "wrong type argument: expecting field"); + return f; } /*************************************************************************/ -SCM rscalar_field_make(SCM f0) -{ - int i; - field_smob *pf; - field_smob *pf0 = assert_field_smob(f0); - CHK_MALLOC(pf, field_smob, 1); - *pf = *pf0; - pf->type = RSCALAR_FIELD_SMOB; - pf->type_char = 'R'; - CHK_MALLOC(pf->f.rs, real, pf->N); - for (i = 0; i < pf->N; ++i) - pf->f.rs[i] = 0.0; - scm_remember_upto_here_1(f0); - return field2scm(pf); +SCM rscalar_field_make(SCM f0) { + int i; + field_smob *pf; + field_smob *pf0 = assert_field_smob(f0); + CHK_MALLOC(pf, field_smob, 1); + *pf = *pf0; + pf->type = RSCALAR_FIELD_SMOB; + pf->type_char = 'R'; + CHK_MALLOC(pf->f.rs, real, pf->N); + for (i = 0; i < pf->N; ++i) + pf->f.rs[i] = 0.0; + scm_remember_upto_here_1(f0); + return field2scm(pf); } -SCM cscalar_field_make(SCM f0) -{ - int i; - field_smob *pf; - field_smob *pf0 = assert_field_smob(f0); - CHK_MALLOC(pf, field_smob, 1); - *pf = *pf0; - pf->type = CSCALAR_FIELD_SMOB; - pf->type_char = 'C'; - CHK_MALLOC(pf->f.cs, scalar_complex, pf->N); - for (i = 0; i < pf->N; ++i) - CASSIGN_ZERO(pf->f.cs[i]); - scm_remember_upto_here_1(f0); - return field2scm(pf); +SCM cscalar_field_make(SCM f0) { + int i; + field_smob *pf; + field_smob *pf0 = assert_field_smob(f0); + CHK_MALLOC(pf, field_smob, 1); + *pf = *pf0; + pf->type = CSCALAR_FIELD_SMOB; + pf->type_char = 'C'; + CHK_MALLOC(pf->f.cs, scalar_complex, pf->N); + for (i = 0; i < pf->N; ++i) + CASSIGN_ZERO(pf->f.cs[i]); + scm_remember_upto_here_1(f0); + return field2scm(pf); } -SCM cvector_field_make(SCM f0) -{ - int i; - field_smob *pf; - field_smob *pf0 = assert_field_smob(f0); - CHECK(mdata, "init-params must be called before cvector-field-make"); - CHK_MALLOC(pf, field_smob, 1); - *pf = *pf0; - pf->type = CVECTOR_FIELD_SMOB; - pf->type_char = 'c'; - CHK_MALLOC(pf->f.cv, scalar_complex, 3 * pf->N); - for (i = 0; i < pf->N * 3; ++i) - CASSIGN_ZERO(pf->f.cv[i]); - scm_remember_upto_here_1(f0); - return field2scm(pf); +SCM cvector_field_make(SCM f0) { + int i; + field_smob *pf; + field_smob *pf0 = assert_field_smob(f0); + CHECK(mdata, "init-params must be called before cvector-field-make"); + CHK_MALLOC(pf, field_smob, 1); + *pf = *pf0; + pf->type = CVECTOR_FIELD_SMOB; + pf->type_char = 'c'; + CHK_MALLOC(pf->f.cv, scalar_complex, 3 * pf->N); + for (i = 0; i < pf->N * 3; ++i) + CASSIGN_ZERO(pf->f.cv[i]); + scm_remember_upto_here_1(f0); + return field2scm(pf); } -void cvector_field_nonblochB(SCM f) -{ - field_smob *pf = assert_field_smob(f); - pf->type_char = 'v'; - update_curfield(pf); - scm_remember_upto_here_1(f); +void cvector_field_nonblochB(SCM f) { + field_smob *pf = assert_field_smob(f); + pf->type_char = 'v'; + update_curfield(pf); + scm_remember_upto_here_1(f); } -SCM field_make(SCM f0) -{ - field_smob *pf0 = assert_field_smob(f0); - switch (pf0->type) { - case RSCALAR_FIELD_SMOB: - return rscalar_field_make(f0); - case CSCALAR_FIELD_SMOB: - return cscalar_field_make(f0); - case CVECTOR_FIELD_SMOB: - return cvector_field_make(f0); - } - scm_remember_upto_here_1(f0); - return SCM_UNDEFINED; +SCM field_make(SCM f0) { + field_smob *pf0 = assert_field_smob(f0); + switch (pf0->type) { + case RSCALAR_FIELD_SMOB: return rscalar_field_make(f0); + case CSCALAR_FIELD_SMOB: return cscalar_field_make(f0); + case CVECTOR_FIELD_SMOB: return cvector_field_make(f0); + } + scm_remember_upto_here_1(f0); + return SCM_UNDEFINED; } -static boolean fields_conform(field_smob *f1, field_smob *f2) -{ +static boolean fields_conform(field_smob *f1, field_smob *f2) { #define EQF(field) (f1->field == f2->field) - return (EQF(nx) && EQF(ny) && EQF(nz) && - EQF(N) && EQF(local_ny) && EQF(local_y_start) && - EQF(last_dim) && EQF(last_dim_size) && EQF(other_dims)); + return (EQF(nx) && EQF(ny) && EQF(nz) && EQF(N) && EQF(local_ny) && EQF(local_y_start) && + EQF(last_dim) && EQF(last_dim_size) && EQF(other_dims)); #undef EQF } -boolean fields_conformp(SCM f1o, SCM f2o) -{ - field_smob *f1 = assert_field_smob(f1o); - field_smob *f2 = assert_field_smob(f2o); - boolean ret = fields_conform(f1, f2); - scm_remember_upto_here_2(f1o, f2o); - return ret; +boolean fields_conformp(SCM f1o, SCM f2o) { + field_smob *f1 = assert_field_smob(f1o); + field_smob *f2 = assert_field_smob(f2o); + boolean ret = fields_conform(f1, f2); + scm_remember_upto_here_2(f1o, f2o); + return ret; } -static void field_set(field_smob *fd, field_smob *fs) -{ - int i; - - CHECK(fd->type == fs->type && fields_conform(fd, fs), - "fields for field-set! must conform"); - switch (fs->type) { - case RSCALAR_FIELD_SMOB: - CHECK(fs->type_char != '-', "must load field for field-set!"); - for (i = 0; i < fs->N; ++i) - fd->f.rs[i] = fs->f.rs[i]; - break; - case CSCALAR_FIELD_SMOB: - CHECK(fs->type_char != '-', "must load field for field-set!"); - for (i = 0; i < fs->N; ++i) - fd->f.cs[i] = fs->f.cs[i]; - break; - case CVECTOR_FIELD_SMOB: - CHECK(fs->type_char != '-', "must load field for field-set!"); - for (i = 0; i < fs->N * 3; ++i) - fd->f.cv[i] = fs->f.cv[i]; - break; - } - fd->type_char = fs->type_char; - update_curfield(fd); +static void field_set(field_smob *fd, field_smob *fs) { + int i; + + CHECK(fd->type == fs->type && fields_conform(fd, fs), "fields for field-set! must conform"); + switch (fs->type) { + case RSCALAR_FIELD_SMOB: + CHECK(fs->type_char != '-', "must load field for field-set!"); + for (i = 0; i < fs->N; ++i) + fd->f.rs[i] = fs->f.rs[i]; + break; + case CSCALAR_FIELD_SMOB: + CHECK(fs->type_char != '-', "must load field for field-set!"); + for (i = 0; i < fs->N; ++i) + fd->f.cs[i] = fs->f.cs[i]; + break; + case CVECTOR_FIELD_SMOB: + CHECK(fs->type_char != '-', "must load field for field-set!"); + for (i = 0; i < fs->N * 3; ++i) + fd->f.cv[i] = fs->f.cv[i]; + break; + } + fd->type_char = fs->type_char; + update_curfield(fd); } -void field_setB(SCM dest, SCM src) -{ - field_smob *fd = assert_field_smob(dest); - field_smob *fs = assert_field_smob(src); - field_set(fd, fs); - scm_remember_upto_here_2(dest, src); +void field_setB(SCM dest, SCM src) { + field_smob *fd = assert_field_smob(dest); + field_smob *fs = assert_field_smob(src); + field_set(fd, fs); + scm_remember_upto_here_2(dest, src); } -void field_load(SCM src) -{ - field_smob *fs = assert_field_smob(src); - CHECK(mdata, "init-params must be called before field-load"); - update_curfield_smob(); - CHECK(fields_conform(fs, &curfield_smob), - "argument for field-load must conform to current size"); - curfield_smob.type = fs->type; - field_set(&curfield_smob, fs); - scm_remember_upto_here_1(src); +void field_load(SCM src) { + field_smob *fs = assert_field_smob(src); + CHECK(mdata, "init-params must be called before field-load"); + update_curfield_smob(); + CHECK(fields_conform(fs, &curfield_smob), "argument for field-load must conform to current size"); + curfield_smob.type = fs->type; + field_set(&curfield_smob, fs); + scm_remember_upto_here_1(src); } -void field_mapLB(SCM dest, function f, SCM_list src) -{ - field_smob *pd = assert_field_smob(dest); - field_smob **ps; - int i, j; - CHK_MALLOC(ps, field_smob *, src.num_items); - for (j = 0; j < src.num_items; ++j) { - ps[j] = assert_field_smob(src.items[j]); - CHECK(fields_conform(pd, ps[j]), - "fields for field-map! must conform"); - } - for (i = 0; i < pd->N; ++i) { - list arg_list = SCM_EOL; - SCM result; - for (j = src.num_items - 1; j >= 0; --j) { - SCM item = SCM_EOL; - switch (ps[j]->type) { - case RSCALAR_FIELD_SMOB: - item = ctl_convert_number_to_scm(ps[j]->f.rs[i]); - break; - case CSCALAR_FIELD_SMOB: - item = cnumber2scm(cscalar2cnumber(ps[j]->f.cs[i])); - break; - case CVECTOR_FIELD_SMOB: - item = - cvector32scm(cscalar32cvector3(ps[j]->f.cv+3*i)); - break; - } - arg_list = gh_cons(item, arg_list); - } - result = gh_apply(f, arg_list); - switch (pd->type) { - case RSCALAR_FIELD_SMOB: - pd->f.rs[i] = ctl_convert_number_to_c(result); - break; - case CSCALAR_FIELD_SMOB: - pd->f.cs[i] = cnumber2cscalar(scm2cnumber(result)); - break; - case CVECTOR_FIELD_SMOB: - cvector32cscalar3(pd->f.cv+3*i, scm2cvector3(result)); - break; - } - } - if (src.num_items == 1 && ps[0]->type == pd->type) - pd->type_char = ps[0]->type_char; - else if (src.num_items > 1) - switch (pd->type) { - case RSCALAR_FIELD_SMOB: - pd->type_char = 'R'; - break; - case CSCALAR_FIELD_SMOB: - pd->type_char = 'C'; - break; - case CVECTOR_FIELD_SMOB: - pd->type_char = 'c'; - break; - } - free(ps); - update_curfield(pd); - scm_remember_upto_here_1(dest); +void field_mapLB(SCM dest, function f, SCM_list src) { + field_smob *pd = assert_field_smob(dest); + field_smob **ps; + int i, j; + CHK_MALLOC(ps, field_smob *, src.num_items); + for (j = 0; j < src.num_items; ++j) { + ps[j] = assert_field_smob(src.items[j]); + CHECK(fields_conform(pd, ps[j]), "fields for field-map! must conform"); + } + for (i = 0; i < pd->N; ++i) { + list arg_list = SCM_EOL; + SCM result; + for (j = src.num_items - 1; j >= 0; --j) { + SCM item = SCM_EOL; + switch (ps[j]->type) { + case RSCALAR_FIELD_SMOB: item = ctl_convert_number_to_scm(ps[j]->f.rs[i]); break; + case CSCALAR_FIELD_SMOB: item = cnumber2scm(cscalar2cnumber(ps[j]->f.cs[i])); break; + case CVECTOR_FIELD_SMOB: item = cvector32scm(cscalar32cvector3(ps[j]->f.cv + 3 * i)); break; + } + arg_list = gh_cons(item, arg_list); + } + result = gh_apply(f, arg_list); + switch (pd->type) { + case RSCALAR_FIELD_SMOB: pd->f.rs[i] = ctl_convert_number_to_c(result); break; + case CSCALAR_FIELD_SMOB: pd->f.cs[i] = cnumber2cscalar(scm2cnumber(result)); break; + case CVECTOR_FIELD_SMOB: cvector32cscalar3(pd->f.cv + 3 * i, scm2cvector3(result)); break; + } + } + if (src.num_items == 1 && ps[0]->type == pd->type) + pd->type_char = ps[0]->type_char; + else if (src.num_items > 1) + switch (pd->type) { + case RSCALAR_FIELD_SMOB: pd->type_char = 'R'; break; + case CSCALAR_FIELD_SMOB: pd->type_char = 'C'; break; + case CVECTOR_FIELD_SMOB: pd->type_char = 'c'; break; + } + free(ps); + update_curfield(pd); + scm_remember_upto_here_1(dest); } /*************************************************************************/ -static cvector3 cvector3_conj(cvector3 c) -{ - cvector3 cc; - cc.x = cnumber_conj(c.x); - cc.y = cnumber_conj(c.y); - cc.z = cnumber_conj(c.z); - return cc; +static cvector3 cvector3_conj(cvector3 c) { + cvector3 cc; + cc.x = cnumber_conj(c.x); + cc.y = cnumber_conj(c.y); + cc.z = cnumber_conj(c.z); + return cc; } /* Compute the integral of f(r, {fields}) over the cell. */ -cnumber integrate_fieldL(function f, SCM_list fields) -{ - int i, j, k, n1, n2, n3, n_other, n_last, rank, last_dim; +cnumber integrate_fieldL(function f, SCM_list fields) { + int i, j, k, n1, n2, n3, n_other, n_last, rank, last_dim; #ifdef HAVE_MPI - int local_n2, local_y_start, local_n3; + int local_n2, local_y_start, local_n3; #endif - real s1, s2, s3, c1, c2, c3; - int ifield; - field_smob **pf; - cnumber integral = {0,0}; - - CHK_MALLOC(pf, field_smob *, fields.num_items); - for (ifield = 0; ifield < fields.num_items; ++ifield) { - pf[ifield] = assert_field_smob(fields.items[ifield]); - CHECK(fields_conform(pf[0], pf[ifield]), - "fields for integrate-fields must conform"); - } - - if (fields.num_items > 0) { - n1 = pf[0]->nx; n2 = pf[0]->ny; n3 = pf[0]->nz; - n_other = pf[0]->other_dims; - n_last = pf[0]->last_dim_size - / (sizeof(scalar_complex)/sizeof(scalar)); - last_dim = pf[0]->last_dim; - } - else { - n1 = mdata->nx; n2 = mdata->ny; n3 = mdata->nz; - n_other = mdata->other_dims; - n_last = mdata->last_dim_size - / (sizeof(scalar_complex)/sizeof(scalar)); - last_dim = mdata->last_dim; - } - rank = (n3 == 1) ? (n2 == 1 ? 1 : 2) : 3; - - s1 = geometry_lattice.size.x / n1; - s2 = geometry_lattice.size.y / n2; - s3 = geometry_lattice.size.z / n3; - c1 = n1 <= 1 ? 0 : geometry_lattice.size.x * 0.5; - c2 = n2 <= 1 ? 0 : geometry_lattice.size.y * 0.5; - c3 = n3 <= 1 ? 0 : geometry_lattice.size.z * 0.5; - - /* Here we have different loops over the coordinates, depending - upon whether we are using complex or real and serial or - parallel transforms. Each loop must define, in its body, - variables (i2,j2,k2) describing the coordinate of the current - point, and "index" describing the corresponding index in - the curfield array. - - This was all stolen from maxwell_eps.c...it would be better - if we didn't have to cut and paste, sigh. */ + real s1, s2, s3, c1, c2, c3; + int ifield; + field_smob **pf; + cnumber integral = {0, 0}; + + CHK_MALLOC(pf, field_smob *, fields.num_items); + for (ifield = 0; ifield < fields.num_items; ++ifield) { + pf[ifield] = assert_field_smob(fields.items[ifield]); + CHECK(fields_conform(pf[0], pf[ifield]), "fields for integrate-fields must conform"); + } + + if (fields.num_items > 0) { + n1 = pf[0]->nx; + n2 = pf[0]->ny; + n3 = pf[0]->nz; + n_other = pf[0]->other_dims; + n_last = pf[0]->last_dim_size / (sizeof(scalar_complex) / sizeof(scalar)); + last_dim = pf[0]->last_dim; + } + else { + n1 = mdata->nx; + n2 = mdata->ny; + n3 = mdata->nz; + n_other = mdata->other_dims; + n_last = mdata->last_dim_size / (sizeof(scalar_complex) / sizeof(scalar)); + last_dim = mdata->last_dim; + } + rank = (n3 == 1) ? (n2 == 1 ? 1 : 2) : 3; + + s1 = geometry_lattice.size.x / n1; + s2 = geometry_lattice.size.y / n2; + s3 = geometry_lattice.size.z / n3; + c1 = n1 <= 1 ? 0 : geometry_lattice.size.x * 0.5; + c2 = n2 <= 1 ? 0 : geometry_lattice.size.y * 0.5; + c3 = n3 <= 1 ? 0 : geometry_lattice.size.z * 0.5; + + /* Here we have different loops over the coordinates, depending + upon whether we are using complex or real and serial or + parallel transforms. Each loop must define, in its body, + variables (i2,j2,k2) describing the coordinate of the current + point, and "index" describing the corresponding index in + the curfield array. + + This was all stolen from maxwell_eps.c...it would be better + if we didn't have to cut and paste, sigh. */ #ifdef SCALAR_COMPLEX -# ifndef HAVE_MPI +#ifndef HAVE_MPI - for (i = 0; i < n1; ++i) - for (j = 0; j < n2; ++j) - for (k = 0; k < n3; ++k) - { - int i2 = i, j2 = j, k2 = k; - int index = ((i * n2 + j) * n3 + k); + for (i = 0; i < n1; ++i) + for (j = 0; j < n2; ++j) + for (k = 0; k < n3; ++k) { + int i2 = i, j2 = j, k2 = k; + int index = ((i * n2 + j) * n3 + k); -# else /* HAVE_MPI */ +#else /* HAVE_MPI */ - if (fields.num_items > 0) { - local_n2 = pf[0]->local_ny; - local_y_start = pf[0]->local_y_start; - } - else { - local_n2 = mdata->local_ny; - local_y_start = mdata->local_y_start; - } + if (fields.num_items > 0) { + local_n2 = pf[0]->local_ny; + local_y_start = pf[0]->local_y_start; + } + else { + local_n2 = mdata->local_ny; + local_y_start = mdata->local_y_start; + } - /* first two dimensions are transposed in MPI output: */ - for (j = 0; j < local_n2; ++j) - for (i = 0; i < n1; ++i) - for (k = 0; k < n3; ++k) - { - int i2 = i, j2 = j + local_y_start, k2 = k; - int index = ((j * n1 + i) * n3 + k); + /* first two dimensions are transposed in MPI output: */ + for (j = 0; j < local_n2; ++j) + for (i = 0; i < n1; ++i) + for (k = 0; k < n3; ++k) { + int i2 = i, j2 = j + local_y_start, k2 = k; + int index = ((j * n1 + i) * n3 + k); -# endif /* HAVE_MPI */ +#endif /* HAVE_MPI */ #else /* not SCALAR_COMPLEX */ -# ifndef HAVE_MPI - - for (i = 0; i < n_other; ++i) - for (j = 0; j < n_last; ++j) - { - int index = i * n_last + j; - int i2, j2, k2; - switch (rank) { - case 2: i2 = i; j2 = j; k2 = 0; break; - case 3: i2 = i / n2; j2 = i % n2; k2 = j; break; - default: i2 = j; j2 = k2 = 0; break; - } - -# else /* HAVE_MPI */ - - if (fields.num_items > 0) { - local_n2 = pf[0]->local_ny; - local_y_start = pf[0]->local_y_start; - } - else { - local_n2 = mdata->local_ny; - local_y_start = mdata->local_y_start; - } - - /* For a real->complex transform, the last dimension is cut in - half. For a 2d transform, this is taken into account in local_ny - already, but for a 3d transform we must compute the new n3: */ - if (n3 > 1) { - if (fields.num_items > 0) - local_n3 = pf[0]->last_dim_size / 2; - else - local_n3 = mdata->last_dim_size / 2; - } - else - local_n3 = 1; - - /* first two dimensions are transposed in MPI output: */ - for (j = 0; j < local_n2; ++j) - for (i = 0; i < n1; ++i) - for (k = 0; k < local_n3; ++k) - { -# define i2 i - int j2 = j + local_y_start; -# define k2 k - int index = ((j * n1 + i) * local_n3 + k); - -# endif /* HAVE_MPI */ +#ifndef HAVE_MPI + + for (i = 0; i < n_other; ++i) + for (j = 0; j < n_last; ++j) { + int index = i * n_last + j; + int i2, j2, k2; + switch (rank) { + case 2: + i2 = i; + j2 = j; + k2 = 0; + break; + case 3: + i2 = i / n2; + j2 = i % n2; + k2 = j; + break; + default: + i2 = j; + j2 = k2 = 0; + break; + } + +#else /* HAVE_MPI */ + + if (fields.num_items > 0) { + local_n2 = pf[0]->local_ny; + local_y_start = pf[0]->local_y_start; + } + else { + local_n2 = mdata->local_ny; + local_y_start = mdata->local_y_start; + } + + /* For a real->complex transform, the last dimension is cut in + half. For a 2d transform, this is taken into account in local_ny + already, but for a 3d transform we must compute the new n3: */ + if (n3 > 1) { + if (fields.num_items > 0) + local_n3 = pf[0]->last_dim_size / 2; + else + local_n3 = mdata->last_dim_size / 2; + } + else + local_n3 = 1; + + /* first two dimensions are transposed in MPI output: */ + for (j = 0; j < local_n2; ++j) + for (i = 0; i < n1; ++i) + for (k = 0; k < local_n3; ++k) { +#define i2 i + int j2 = j + local_y_start; +#define k2 k + int index = ((j * n1 + i) * local_n3 + k); + +#endif /* HAVE_MPI */ #endif /* not SCALAR_COMPLEX */ - { - list arg_list = SCM_EOL; - cnumber integrand; - vector3 p; - - p.x = i2 * s1 - c1; p.y = j2 * s2 - c2; p.z = k2 * s3 - c3; - - for (ifield = fields.num_items - 1; ifield >= 0; --ifield) { - SCM item = SCM_EOL; - switch (pf[ifield]->type) { - case RSCALAR_FIELD_SMOB: - item = ctl_convert_number_to_scm(pf[ifield]->f.rs[index]); - break; - case CSCALAR_FIELD_SMOB: - item = cnumber2scm(cscalar2cnumber( - pf[ifield]->f.cs[index])); - break; - case CVECTOR_FIELD_SMOB: - item = cvector32scm(cscalar32cvector3( - pf[ifield]->f.cv+3*index)); - break; - } - arg_list = gh_cons(item, arg_list); - } - arg_list = gh_cons(vector32scm(p), arg_list); - integrand = ctl_convert_cnumber_to_c(gh_apply(f, arg_list)); - integral.re += integrand.re; - integral.im += integrand.im; + { + list arg_list = SCM_EOL; + cnumber integrand; + vector3 p; + + p.x = i2 * s1 - c1; + p.y = j2 * s2 - c2; + p.z = k2 * s3 - c3; + + for (ifield = fields.num_items - 1; ifield >= 0; --ifield) { + SCM item = SCM_EOL; + switch (pf[ifield]->type) { + case RSCALAR_FIELD_SMOB: + item = ctl_convert_number_to_scm(pf[ifield]->f.rs[index]); + break; + case CSCALAR_FIELD_SMOB: + item = cnumber2scm(cscalar2cnumber(pf[ifield]->f.cs[index])); + break; + case CVECTOR_FIELD_SMOB: + item = cvector32scm(cscalar32cvector3(pf[ifield]->f.cv + 3 * index)); + break; + } + arg_list = gh_cons(item, arg_list); + } + arg_list = gh_cons(vector32scm(p), arg_list); + integrand = ctl_convert_cnumber_to_c(gh_apply(f, arg_list)); + integral.re += integrand.re; + integral.im += integrand.im; #ifndef SCALAR_COMPLEX - { - int last_index; -# ifdef HAVE_MPI - if (n3 == 1) - last_index = j + local_y_start; - else - last_index = k; -# else - last_index = j; -# endif - - if (last_index != 0 && 2*last_index != last_dim) { - int i2c, j2c, k2c; - i2c = i2 ? (n1 - i2) : 0; - j2c = j2 ? (n2 - j2) : 0; - k2c = k2 ? (n3 - k2) : 0; - p.x = i2c * s1 - c1; - p.y = j2c * s2 - c2; - p.z = k2c * s3 - c3; - arg_list = SCM_EOL; - for (ifield = fields.num_items - 1; - ifield >= 0; --ifield) { - SCM item = SCM_UNDEFINED; - switch (pf[ifield]->type) { - case RSCALAR_FIELD_SMOB: - item = ctl_convert_number_to_scm( - pf[ifield]->f.rs[index]); - break; - case CSCALAR_FIELD_SMOB: - item = cnumber2scm(cscalar2cnumber( - pf[ifield]->f.cs[index])); - break; - case CVECTOR_FIELD_SMOB: - item = cvector32scm( - cvector3_conj(cscalar32cvector3( - pf[ifield]->f.cv+3*index))); - break; - } - arg_list = gh_cons(item, arg_list); - } - arg_list = gh_cons(vector32scm(p), arg_list); - integrand = - ctl_convert_cnumber_to_c(gh_apply(f, arg_list)); - integral.re += integrand.re; - integral.im += integrand.im; - } - } + { + int last_index; +#ifdef HAVE_MPI + if (n3 == 1) + last_index = j + local_y_start; + else + last_index = k; +#else + last_index = j; +#endif + + if (last_index != 0 && 2 * last_index != last_dim) { + int i2c, j2c, k2c; + i2c = i2 ? (n1 - i2) : 0; + j2c = j2 ? (n2 - j2) : 0; + k2c = k2 ? (n3 - k2) : 0; + p.x = i2c * s1 - c1; + p.y = j2c * s2 - c2; + p.z = k2c * s3 - c3; + arg_list = SCM_EOL; + for (ifield = fields.num_items - 1; ifield >= 0; --ifield) { + SCM item = SCM_UNDEFINED; + switch (pf[ifield]->type) { + case RSCALAR_FIELD_SMOB: + item = ctl_convert_number_to_scm(pf[ifield]->f.rs[index]); + break; + case CSCALAR_FIELD_SMOB: + item = cnumber2scm(cscalar2cnumber(pf[ifield]->f.cs[index])); + break; + case CVECTOR_FIELD_SMOB: + item = cvector32scm( + cvector3_conj(cscalar32cvector3(pf[ifield]->f.cv + 3 * index))); + break; + } + arg_list = gh_cons(item, arg_list); + } + arg_list = gh_cons(vector32scm(p), arg_list); + integrand = ctl_convert_cnumber_to_c(gh_apply(f, arg_list)); + integral.re += integrand.re; + integral.im += integrand.im; + } + } #endif - } - } - - free(pf); - - integral.re *= Vol / (n1 * n2 * n3); - integral.im *= Vol / (n1 * n2 * n3); - { - cnumber integral_sum; - mpi_allreduce(&integral, &integral_sum, 2, number, - MPI_DOUBLE, MPI_SUM, mpb_comm); - return integral_sum; - } + } + } + + free(pf); + + integral.re *= Vol / (n1 * n2 * n3); + integral.im *= Vol / (n1 * n2 * n3); + { + cnumber integral_sum; + mpi_allreduce(&integral, &integral_sum, 2, number, MPI_DOUBLE, MPI_SUM, mpb_comm); + return integral_sum; + } } diff --git a/mpb/field-smob.h b/mpb/field-smob.h index 61d7a330..619fa1d8 100644 --- a/mpb/field-smob.h +++ b/mpb/field-smob.h @@ -25,21 +25,19 @@ extern long scm_tc16_smob_field_smob; -typedef enum { - RSCALAR_FIELD_SMOB, CVECTOR_FIELD_SMOB, CSCALAR_FIELD_SMOB -} field_smob_type; +typedef enum { RSCALAR_FIELD_SMOB, CVECTOR_FIELD_SMOB, CSCALAR_FIELD_SMOB } field_smob_type; typedef struct { - field_smob_type type; - char type_char; - union { - real *rs; - scalar_complex *cv; - scalar_complex *cs; - } f; - int nx, ny, nz, N; - int local_ny, local_y_start; - int last_dim, last_dim_size, other_dims; + field_smob_type type; + char type_char; + union { + real *rs; + scalar_complex *cv; + scalar_complex *cs; + } f; + int nx, ny, nz, N; + int local_ny, local_y_start; + int last_dim, last_dim_size, other_dims; } field_smob; #define FIELD_P(X) T_SMOB_P(field_smob, X) diff --git a/mpb/fields.c b/mpb/fields.c index bc6c2359..7ea12442 100644 --- a/mpb/fields.c +++ b/mpb/fields.c @@ -47,319 +47,295 @@ All of these functions are designed to be called by the user via Guile. */ -void get_dfield(int which_band) -{ - if (!mdata) { - mpi_one_fprintf(stderr, - "init-params must be called before get-dfield!\n"); - return; - } - if (!kpoint_index) { - mpi_one_fprintf(stderr, - "solve-kpoint must be called before get-dfield!\n"); - return; - } - if (which_band < 1 || which_band > H.p) { - mpi_one_fprintf(stderr, - "must have 1 <= band index <= num_bands (%d)\n",H.p); - return; - } - - curfield = (scalar_complex *) mdata->fft_data; - curfield_band = which_band; - curfield_type = 'd'; - if (mdata->mu_inv == NULL) - maxwell_compute_d_from_H(mdata, H, curfield, which_band - 1, 1); - else { - evectmatrix_resize(&W[0], 1, 0); - maxwell_compute_H_from_B(mdata, H, W[0], curfield, which_band-1,0, 1); - maxwell_compute_d_from_H(mdata, W[0], curfield, 0, 1); - evectmatrix_resize(&W[0], W[0].alloc_p, 0); - } - - /* Here, we correct for the fact that compute_d_from_H actually - computes just (k+G) x H, whereas the actual D field is - i/omega i(k+G) x H...so, there is an added factor of -1/omega. - - We also divide by the cell volume so that the integral of H*B - or of D*E is unity. (From the eigensolver + FFT, they are - initially normalized to sum to nx*ny*nz.) */ - { - int i, N; - double scale; - N = mdata->fft_output_size; - - if (freqs.items[which_band - 1] != 0.0) { - scale = -1.0 / freqs.items[which_band - 1]; - } - else - scale = -1.0; /* arbitrary */ - - scale /= sqrt(Vol); - - for (i = 0; i < 3*N; ++i) { - curfield[i].re *= scale; - curfield[i].im *= scale; - } - } +void get_dfield(int which_band) { + if (!mdata) { + mpi_one_fprintf(stderr, "init-params must be called before get-dfield!\n"); + return; + } + if (!kpoint_index) { + mpi_one_fprintf(stderr, "solve-kpoint must be called before get-dfield!\n"); + return; + } + if (which_band < 1 || which_band > H.p) { + mpi_one_fprintf(stderr, "must have 1 <= band index <= num_bands (%d)\n", H.p); + return; + } + + curfield = (scalar_complex *)mdata->fft_data; + curfield_band = which_band; + curfield_type = 'd'; + if (mdata->mu_inv == NULL) + maxwell_compute_d_from_H(mdata, H, curfield, which_band - 1, 1); + else { + evectmatrix_resize(&W[0], 1, 0); + maxwell_compute_H_from_B(mdata, H, W[0], curfield, which_band - 1, 0, 1); + maxwell_compute_d_from_H(mdata, W[0], curfield, 0, 1); + evectmatrix_resize(&W[0], W[0].alloc_p, 0); + } + + /* Here, we correct for the fact that compute_d_from_H actually + computes just (k+G) x H, whereas the actual D field is + i/omega i(k+G) x H...so, there is an added factor of -1/omega. + + We also divide by the cell volume so that the integral of H*B + or of D*E is unity. (From the eigensolver + FFT, they are + initially normalized to sum to nx*ny*nz.) */ + { + int i, N; + double scale; + N = mdata->fft_output_size; + + if (freqs.items[which_band - 1] != 0.0) { scale = -1.0 / freqs.items[which_band - 1]; } + else + scale = -1.0; /* arbitrary */ + + scale /= sqrt(Vol); + + for (i = 0; i < 3 * N; ++i) { + curfield[i].re *= scale; + curfield[i].im *= scale; + } + } } -void get_hfield(integer which_band) -{ - if (!mdata) { - mpi_one_fprintf(stderr, - "init-params must be called before get-hfield!\n"); - return; - } - if (!kpoint_index) { - mpi_one_fprintf(stderr, - "solve-kpoint must be called before get-hfield!\n"); - return; - } - if (which_band < 1 || which_band > H.p) { - mpi_one_fprintf(stderr, - "must have 1 <= band index <= num_bands (%d)\n",H.p); - return; - } - - curfield = (scalar_complex *) mdata->fft_data; - curfield_band = which_band; - curfield_type = 'h'; - if (mdata->mu_inv == NULL) - maxwell_compute_h_from_H(mdata, H, curfield, which_band - 1, 1); - else { - evectmatrix_resize(&W[0], 1, 0); - maxwell_compute_H_from_B(mdata, H, W[0], curfield, which_band-1,0, 1); - maxwell_compute_h_from_H(mdata, W[0], curfield, 0, 1); - evectmatrix_resize(&W[0], W[0].alloc_p, 0); - } - - /* Divide by the cell volume so that the integral of H*B - or of D*E is unity. (From the eigensolver + FFT, they are - initially normalized to sum to nx*ny*nz.) */ - { - int i, N; - double scale; - N = mdata->fft_output_size; - - scale = 1.0 / sqrt(Vol); - for (i = 0; i < 3*N; ++i) { - curfield[i].re *= scale; - curfield[i].im *= scale; - } - } +void get_hfield(integer which_band) { + if (!mdata) { + mpi_one_fprintf(stderr, "init-params must be called before get-hfield!\n"); + return; + } + if (!kpoint_index) { + mpi_one_fprintf(stderr, "solve-kpoint must be called before get-hfield!\n"); + return; + } + if (which_band < 1 || which_band > H.p) { + mpi_one_fprintf(stderr, "must have 1 <= band index <= num_bands (%d)\n", H.p); + return; + } + + curfield = (scalar_complex *)mdata->fft_data; + curfield_band = which_band; + curfield_type = 'h'; + if (mdata->mu_inv == NULL) + maxwell_compute_h_from_H(mdata, H, curfield, which_band - 1, 1); + else { + evectmatrix_resize(&W[0], 1, 0); + maxwell_compute_H_from_B(mdata, H, W[0], curfield, which_band - 1, 0, 1); + maxwell_compute_h_from_H(mdata, W[0], curfield, 0, 1); + evectmatrix_resize(&W[0], W[0].alloc_p, 0); + } + + /* Divide by the cell volume so that the integral of H*B + or of D*E is unity. (From the eigensolver + FFT, they are + initially normalized to sum to nx*ny*nz.) */ + { + int i, N; + double scale; + N = mdata->fft_output_size; + + scale = 1.0 / sqrt(Vol); + for (i = 0; i < 3 * N; ++i) { + curfield[i].re *= scale; + curfield[i].im *= scale; + } + } } -void get_bfield(integer which_band) -{ - if (!mdata) { - mpi_one_fprintf(stderr, - "init-params must be called before get-bfield!\n"); - return; - } - if (!kpoint_index) { - mpi_one_fprintf(stderr, - "solve-kpoint must be called before get-bfield!\n"); - return; - } - if (which_band < 1 || which_band > H.p) { - mpi_one_fprintf(stderr, - "must have 1 <= band index <= num_bands (%d)\n",H.p); - return; - } - - curfield = (scalar_complex *) mdata->fft_data; - curfield_band = which_band; - curfield_type = 'b'; - maxwell_compute_h_from_H(mdata, H, curfield, which_band - 1, 1); - - /* Divide by the cell volume so that the integral of H*B - or of D*E is unity. (From the eigensolver + FFT, they are - initially normalized to sum to nx*ny*nz.) */ - { - int i, N; - double scale; - N = mdata->fft_output_size; - - scale = 1.0 / sqrt(Vol); - for (i = 0; i < 3*N; ++i) { - curfield[i].re *= scale; - curfield[i].im *= scale; - } - } +void get_bfield(integer which_band) { + if (!mdata) { + mpi_one_fprintf(stderr, "init-params must be called before get-bfield!\n"); + return; + } + if (!kpoint_index) { + mpi_one_fprintf(stderr, "solve-kpoint must be called before get-bfield!\n"); + return; + } + if (which_band < 1 || which_band > H.p) { + mpi_one_fprintf(stderr, "must have 1 <= band index <= num_bands (%d)\n", H.p); + return; + } + + curfield = (scalar_complex *)mdata->fft_data; + curfield_band = which_band; + curfield_type = 'b'; + maxwell_compute_h_from_H(mdata, H, curfield, which_band - 1, 1); + + /* Divide by the cell volume so that the integral of H*B + or of D*E is unity. (From the eigensolver + FFT, they are + initially normalized to sum to nx*ny*nz.) */ + { + int i, N; + double scale; + N = mdata->fft_output_size; + + scale = 1.0 / sqrt(Vol); + for (i = 0; i < 3 * N; ++i) { + curfield[i].re *= scale; + curfield[i].im *= scale; + } + } } -void get_efield_from_dfield(void) -{ - if (!curfield || curfield_type != 'd') { - mpi_one_fprintf(stderr, "get-dfield must be called before " - "get-efield-from-dfield!\n"); - return; - } - CHECK(mdata, "unexpected NULL mdata"); - maxwell_compute_e_from_d(mdata, curfield, 1); - curfield_type = 'e'; +void get_efield_from_dfield(void) { + if (!curfield || curfield_type != 'd') { + mpi_one_fprintf(stderr, "get-dfield must be called before " + "get-efield-from-dfield!\n"); + return; + } + CHECK(mdata, "unexpected NULL mdata"); + maxwell_compute_e_from_d(mdata, curfield, 1); + curfield_type = 'e'; } -void get_efield(integer which_band) -{ - get_dfield(which_band); - get_efield_from_dfield(); +void get_efield(integer which_band) { + get_dfield(which_band); + get_efield_from_dfield(); } /* Extract the mean epsilon from the effective inverse dielectric tensor, which contains two eigenvalues that correspond to the mean epsilon, and one which corresponds to the harmonic mean. */ -real mean_medium_from_matrix(const symmetric_matrix *eps_inv) -{ - real eps_eigs[3]; - maxwell_sym_matrix_eigs(eps_eigs, eps_inv); - /* the harmonic mean should be the largest eigenvalue (smallest - epsilon), so we'll ignore it and average the other two: */ - return 2.0 / (eps_eigs[0] + eps_eigs[1]); +real mean_medium_from_matrix(const symmetric_matrix *eps_inv) { + real eps_eigs[3]; + maxwell_sym_matrix_eigs(eps_eigs, eps_inv); + /* the harmonic mean should be the largest eigenvalue (smallest + epsilon), so we'll ignore it and average the other two: */ + return 2.0 / (eps_eigs[0] + eps_eigs[1]); } /**************************************************************************/ /* internal function for compute_field_energy, below */ -double compute_field_energy_internal(real comp_sum[6]) -{ - int i, N, last_dim, last_dim_stored, nx, nz, local_y_start; - real comp_sum2[6] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; - real energy_sum = 0.0; - real *energy_density = (real *) curfield; - - N = mdata->fft_output_size; - last_dim = mdata->last_dim; - last_dim_stored = - mdata->last_dim_size / (sizeof(scalar_complex)/sizeof(scalar)); - nx = mdata->nx; nz = mdata->nz; local_y_start = mdata->local_y_start; - - for (i = 0; i < N; ++i) { - scalar_complex field[3]; - real - comp_sqr0,comp_sqr1,comp_sqr2,comp_sqr3,comp_sqr4,comp_sqr5; - - /* energy is either |curfield|^2 / mu or |curfield|^2 / epsilon, - depending upon whether it is B or D. */ - if (curfield_type == 'd') - assign_symmatrix_vector(field, mdata->eps_inv[i], curfield+3*i); - else if (curfield_type == 'b' && mdata->mu_inv != NULL) - assign_symmatrix_vector(field, mdata->mu_inv[i], curfield+3*i); - else { - field[0] = curfield[3*i]; - field[1] = curfield[3*i+1]; - field[2] = curfield[3*i+2]; - } - - comp_sum2[0] += comp_sqr0 = field[0].re * curfield[3*i].re; - comp_sum2[1] += comp_sqr1 = field[0].im * curfield[3*i].im; - comp_sum2[2] += comp_sqr2 = field[1].re * curfield[3*i+1].re; - comp_sum2[3] += comp_sqr3 = field[1].im * curfield[3*i+1].im; - comp_sum2[4] += comp_sqr4 = field[2].re * curfield[3*i+2].re; - comp_sum2[5] += comp_sqr5 = field[2].im * curfield[3*i+2].im; - - /* Note: here, we write to energy_density[i]; this is - safe, even though energy_density is aliased to curfield, - since energy_density[i] is guaranteed to come at or before - curfield[i] (which we are now done with). */ - - energy_sum += energy_density[i] = - comp_sqr0+comp_sqr1+comp_sqr2+comp_sqr3+comp_sqr4+comp_sqr5; +double compute_field_energy_internal(real comp_sum[6]) { + int i, N, last_dim, last_dim_stored, nx, nz, local_y_start; + real comp_sum2[6] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + real energy_sum = 0.0; + real *energy_density = (real *)curfield; + + N = mdata->fft_output_size; + last_dim = mdata->last_dim; + last_dim_stored = mdata->last_dim_size / (sizeof(scalar_complex) / sizeof(scalar)); + nx = mdata->nx; + nz = mdata->nz; + local_y_start = mdata->local_y_start; + + for (i = 0; i < N; ++i) { + scalar_complex field[3]; + real comp_sqr0, comp_sqr1, comp_sqr2, comp_sqr3, comp_sqr4, comp_sqr5; + + /* energy is either |curfield|^2 / mu or |curfield|^2 / epsilon, + depending upon whether it is B or D. */ + if (curfield_type == 'd') + assign_symmatrix_vector(field, mdata->eps_inv[i], curfield + 3 * i); + else if (curfield_type == 'b' && mdata->mu_inv != NULL) + assign_symmatrix_vector(field, mdata->mu_inv[i], curfield + 3 * i); + else { + field[0] = curfield[3 * i]; + field[1] = curfield[3 * i + 1]; + field[2] = curfield[3 * i + 2]; + } + + comp_sum2[0] += comp_sqr0 = field[0].re * curfield[3 * i].re; + comp_sum2[1] += comp_sqr1 = field[0].im * curfield[3 * i].im; + comp_sum2[2] += comp_sqr2 = field[1].re * curfield[3 * i + 1].re; + comp_sum2[3] += comp_sqr3 = field[1].im * curfield[3 * i + 1].im; + comp_sum2[4] += comp_sqr4 = field[2].re * curfield[3 * i + 2].re; + comp_sum2[5] += comp_sqr5 = field[2].im * curfield[3 * i + 2].im; + + /* Note: here, we write to energy_density[i]; this is + safe, even though energy_density is aliased to curfield, + since energy_density[i] is guaranteed to come at or before + curfield[i] (which we are now done with). */ + + energy_sum += energy_density[i] = + comp_sqr0 + comp_sqr1 + comp_sqr2 + comp_sqr3 + comp_sqr4 + comp_sqr5; #ifndef SCALAR_COMPLEX - /* most points need to be counted twice, by rfftw output symmetry: */ - { - int last_index; -# ifdef HAVE_MPI - if (nz == 1) /* 2d calculation: 1st dim. is truncated one */ - last_index = i / nx + local_y_start; - else - last_index = i % last_dim_stored; -# else - last_index = i % last_dim_stored; -# endif - if (last_index != 0 && 2*last_index != last_dim) { - energy_sum += energy_density[i]; - comp_sum2[0] += comp_sqr0; - comp_sum2[1] += comp_sqr1; - comp_sum2[2] += comp_sqr2; - comp_sum2[3] += comp_sqr3; - comp_sum2[4] += comp_sqr4; - comp_sum2[5] += comp_sqr5; - } - } + /* most points need to be counted twice, by rfftw output symmetry: */ + { + int last_index; +#ifdef HAVE_MPI + if (nz == 1) /* 2d calculation: 1st dim. is truncated one */ + last_index = i / nx + local_y_start; + else + last_index = i % last_dim_stored; +#else + last_index = i % last_dim_stored; #endif - } + if (last_index != 0 && 2 * last_index != last_dim) { + energy_sum += energy_density[i]; + comp_sum2[0] += comp_sqr0; + comp_sum2[1] += comp_sqr1; + comp_sum2[2] += comp_sqr2; + comp_sum2[3] += comp_sqr3; + comp_sum2[4] += comp_sqr4; + comp_sum2[5] += comp_sqr5; + } + } +#endif + } - mpi_allreduce_1(&energy_sum, real, SCALAR_MPI_TYPE, - MPI_SUM, mpb_comm); - mpi_allreduce(comp_sum2, comp_sum, 6, real, SCALAR_MPI_TYPE, - MPI_SUM, mpb_comm); + mpi_allreduce_1(&energy_sum, real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm); + mpi_allreduce(comp_sum2, comp_sum, 6, real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm); - /* remember that we now have energy density; denoted by capital D/H */ - curfield_type = toupper(curfield_type); + /* remember that we now have energy density; denoted by capital D/H */ + curfield_type = toupper(curfield_type); - return energy_sum; + return energy_sum; } /* Replace curfield (either d or h) with the scalar energy density function, normalized to one. While we're at it, compute some statistics about the relative strength of different field components. Also return the integral of the energy density, which should be unity. */ -number_list compute_field_energy(void) -{ - int i; - real energy_sum, comp_sum[6]; - number_list retval = { 0, 0 }; - - if (!curfield || !strchr("dhb", curfield_type)) { - mpi_one_fprintf(stderr, "The D or H field must be loaded first.\n"); - return retval; - } - else if (curfield_type == 'h' && mdata->mu_inv != NULL) { - mpi_one_fprintf(stderr, "B, not H, must be loaded if we have mu.\n"); - return retval; - } - - energy_sum = compute_field_energy_internal(comp_sum); - - mpi_one_printf("%c-energy-components:, %d, %d", - curfield_type, kpoint_index, curfield_band); - for (i = 0; i < 6; ++i) { - comp_sum[i] /= (energy_sum == 0 ? 1 : energy_sum); - if (i % 2 == 1) - mpi_one_printf(", %g", comp_sum[i] + comp_sum[i-1]); - } - mpi_one_printf("\n"); - - /* The return value is a list of 7 items: the total energy, - followed by the 6 elements of the comp_sum array (the fraction - of the energy in the real/imag. parts of each field component). */ - - retval.num_items = 7; - CHK_MALLOC(retval.items, number, retval.num_items); - - retval.items[0] = energy_sum * Vol / H.N; - - for (i = 0; i < 6; ++i) - retval.items[i+1] = comp_sum[i]; - - return retval; +number_list compute_field_energy(void) { + int i; + real energy_sum, comp_sum[6]; + number_list retval = {0, 0}; + + if (!curfield || !strchr("dhb", curfield_type)) { + mpi_one_fprintf(stderr, "The D or H field must be loaded first.\n"); + return retval; + } + else if (curfield_type == 'h' && mdata->mu_inv != NULL) { + mpi_one_fprintf(stderr, "B, not H, must be loaded if we have mu.\n"); + return retval; + } + + energy_sum = compute_field_energy_internal(comp_sum); + + mpi_one_printf("%c-energy-components:, %d, %d", curfield_type, kpoint_index, curfield_band); + for (i = 0; i < 6; ++i) { + comp_sum[i] /= (energy_sum == 0 ? 1 : energy_sum); + if (i % 2 == 1) mpi_one_printf(", %g", comp_sum[i] + comp_sum[i - 1]); + } + mpi_one_printf("\n"); + + /* The return value is a list of 7 items: the total energy, + followed by the 6 elements of the comp_sum array (the fraction + of the energy in the real/imag. parts of each field component). */ + + retval.num_items = 7; + CHK_MALLOC(retval.items, number, retval.num_items); + + retval.items[0] = energy_sum * Vol / H.N; + + for (i = 0; i < 6; ++i) + retval.items[i + 1] = comp_sum[i]; + + return retval; } /* compute |F|^2 for the current field, whether or not this is an energy density */ -void compute_field_squared(void) -{ - real comp_sum[6]; /* unused */ +void compute_field_squared(void) { + real comp_sum[6]; /* unused */ - if (!curfield || !strchr("dhbecv", curfield_type)) { - mpi_one_fprintf(stderr, "A vector field must be loaded first.\n"); - } + if (!curfield || !strchr("dhbecv", curfield_type)) { + mpi_one_fprintf(stderr, "A vector field must be loaded first.\n"); + } - curfield_type = 'c'; /* force it to just square the field */ - compute_field_energy_internal(comp_sum); - curfield_type = 'R'; /* generic real scalar field */ + curfield_type = 'c'; /* force it to just square the field */ + compute_field_energy_internal(comp_sum); + curfield_type = 'R'; /* generic real scalar field */ } /**************************************************************************/ @@ -367,53 +343,50 @@ void compute_field_squared(void) /* replace the current field with its scalar divergence; only works for Bloch fields */ -void compute_field_divergence(void) -{ - int i, j, N; - scalar *field = (scalar *) curfield; - scalar *field2 = mdata->fft_data == mdata->fft_data2 ? field : (field == mdata->fft_data ? mdata->fft_data2 : mdata->fft_data); - real scale; - - if (!curfield || !strchr("dhbec", curfield_type)) { - mpi_one_fprintf(stderr, "A Bloch-periodic field must be loaded.\n"); - return; - } - - /* convert back to Fourier space */ - maxwell_compute_fft(-1, mdata, field, field2, 3, 3, 1); - - /* compute (k+G) dot field */ - for (i = 0; i < mdata->other_dims; ++i) - for (j = 0; j < mdata->last_dim; ++j) { - int ij = i * mdata->last_dim_size + j; - k_data cur_k = mdata->k_plus_G[ij]; - /* k+G = |k+G| (m x n) */ - real kx = cur_k.kmag * (cur_k.my*cur_k.nz-cur_k.mz*cur_k.ny); - real ky = cur_k.kmag * (cur_k.mz*cur_k.nx-cur_k.mx*cur_k.nz); - real kz = cur_k.kmag * (cur_k.mx*cur_k.ny-cur_k.my*cur_k.nz); - ASSIGN_SCALAR(field2[ij], - SCALAR_RE(field2[3*ij+0]) * kx + - SCALAR_RE(field2[3*ij+1]) * ky + - SCALAR_RE(field2[3*ij+2]) * kz, - SCALAR_IM(field2[3*ij+0]) * kx + - SCALAR_IM(field2[3*ij+1]) * ky + - SCALAR_IM(field2[3*ij+2]) * kz); - } - - /* convert scalar field back to position space */ - maxwell_compute_fft(+1, mdata, field2, field, 1, 1, 1); - - /* multiply by i (from divergence) and normalization (from FFT) - and 2*pi (from k+G) */ - scale = TWOPI / H.N; - N = mdata->fft_output_size; - for (i = 0; i < N; ++i) { - CASSIGN_SCALAR(curfield[i], - -CSCALAR_IM(curfield[i]) * scale, - CSCALAR_RE(curfield[i]) * scale); - } - - curfield_type = 'C'; /* complex (Bloch) scalar field */ +void compute_field_divergence(void) { + int i, j, N; + scalar *field = (scalar *)curfield; + scalar *field2 = mdata->fft_data == mdata->fft_data2 + ? field + : (field == mdata->fft_data ? mdata->fft_data2 : mdata->fft_data); + real scale; + + if (!curfield || !strchr("dhbec", curfield_type)) { + mpi_one_fprintf(stderr, "A Bloch-periodic field must be loaded.\n"); + return; + } + + /* convert back to Fourier space */ + maxwell_compute_fft(-1, mdata, field, field2, 3, 3, 1); + + /* compute (k+G) dot field */ + for (i = 0; i < mdata->other_dims; ++i) + for (j = 0; j < mdata->last_dim; ++j) { + int ij = i * mdata->last_dim_size + j; + k_data cur_k = mdata->k_plus_G[ij]; + /* k+G = |k+G| (m x n) */ + real kx = cur_k.kmag * (cur_k.my * cur_k.nz - cur_k.mz * cur_k.ny); + real ky = cur_k.kmag * (cur_k.mz * cur_k.nx - cur_k.mx * cur_k.nz); + real kz = cur_k.kmag * (cur_k.mx * cur_k.ny - cur_k.my * cur_k.nz); + ASSIGN_SCALAR(field2[ij], + SCALAR_RE(field2[3 * ij + 0]) * kx + SCALAR_RE(field2[3 * ij + 1]) * ky + + SCALAR_RE(field2[3 * ij + 2]) * kz, + SCALAR_IM(field2[3 * ij + 0]) * kx + SCALAR_IM(field2[3 * ij + 1]) * ky + + SCALAR_IM(field2[3 * ij + 2]) * kz); + } + + /* convert scalar field back to position space */ + maxwell_compute_fft(+1, mdata, field2, field, 1, 1, 1); + + /* multiply by i (from divergence) and normalization (from FFT) + and 2*pi (from k+G) */ + scale = TWOPI / H.N; + N = mdata->fft_output_size; + for (i = 0; i < N; ++i) { + CASSIGN_SCALAR(curfield[i], -CSCALAR_IM(curfield[i]) * scale, CSCALAR_RE(curfield[i]) * scale); + } + + curfield_type = 'C'; /* complex (Bloch) scalar field */ } /**************************************************************************/ @@ -436,110 +409,110 @@ void compute_field_divergence(void) is already fixed, to within a sign, by the choice to make the Fourier transform purely real. So, in that case we simply pick a sign, in a manner similar to (2) and (3) above. */ -void fix_field_phase(void) -{ - int i, N; - real sq_sum2[2] = {0,0}, sq_sum[2], maxabs = 0.0; - int maxabs_index = 0, maxabs_sign = 1; - double theta; - scalar phase; - - if (!curfield || !strchr("dhbecv", curfield_type)) { - mpi_one_fprintf(stderr, "The D/H/E field must be loaded first.\n"); - return; - } - N = mdata->fft_output_size * 3; +void fix_field_phase(void) { + int i, N; + real sq_sum2[2] = {0, 0}, sq_sum[2], maxabs = 0.0; + int maxabs_index = 0, maxabs_sign = 1; + double theta; + scalar phase; + + if (!curfield || !strchr("dhbecv", curfield_type)) { + mpi_one_fprintf(stderr, "The D/H/E field must be loaded first.\n"); + return; + } + N = mdata->fft_output_size * 3; #ifdef SCALAR_COMPLEX - /* Compute the phase that maximizes the sum of the squares of - the real parts of the components. Equivalently, maximize - the real part of the sum of the squares. */ - for (i = 0; i < N; ++i) { - real a,b; - a = curfield[i].re; b = curfield[i].im; - sq_sum2[0] += a*a - b*b; - sq_sum2[1] += 2*a*b; - } - mpi_allreduce(sq_sum2, sq_sum, 2, real, SCALAR_MPI_TYPE, - MPI_SUM, mpb_comm); - /* compute the phase = exp(i*theta) maximizing the real part of - the sum of the squares. i.e., maximize: - cos(2*theta)*sq_sum[0] - sin(2*theta)*sq_sum[1] */ - theta = 0.5 * atan2(-sq_sum[1], sq_sum[0]); - phase.re = cos(theta); - phase.im = sin(theta); -#else /* ! SCALAR_COMPLEX */ - phase = 1; + /* Compute the phase that maximizes the sum of the squares of + the real parts of the components. Equivalently, maximize + the real part of the sum of the squares. */ + for (i = 0; i < N; ++i) { + real a, b; + a = curfield[i].re; + b = curfield[i].im; + sq_sum2[0] += a * a - b * b; + sq_sum2[1] += 2 * a * b; + } + mpi_allreduce(sq_sum2, sq_sum, 2, real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm); + /* compute the phase = exp(i*theta) maximizing the real part of + the sum of the squares. i.e., maximize: + cos(2*theta)*sq_sum[0] - sin(2*theta)*sq_sum[1] */ + theta = 0.5 * atan2(-sq_sum[1], sq_sum[0]); + phase.re = cos(theta); + phase.im = sin(theta); +#else /* ! SCALAR_COMPLEX */ + phase = 1; #endif /* ! SCALAR_COMPLEX */ - /* Next, fix the overall sign. We do this by first computing the - maximum |real part| of the jmax component (after multiplying - by phase), and then finding the last spatial index at which - |real part| is at least half of this value. The sign is then - chosen to make the real part positive at that point. + /* Next, fix the overall sign. We do this by first computing the + maximum |real part| of the jmax component (after multiplying + by phase), and then finding the last spatial index at which + |real part| is at least half of this value. The sign is then + chosen to make the real part positive at that point. - (Note that we can't just make the point of maximum |real part| - positive, as that would be ambiguous in the common case of an - oscillating field within the unit cell.) + (Note that we can't just make the point of maximum |real part| + positive, as that would be ambiguous in the common case of an + oscillating field within the unit cell.) - In the case of inversion symmetry (!SCALAR_COMPLEX), we work with - (real part - imag part) instead of (real part), to insure that we - have something that is nonzero somewhere. */ + In the case of inversion symmetry (!SCALAR_COMPLEX), we work with + (real part - imag part) instead of (real part), to insure that we + have something that is nonzero somewhere. */ - for (i = 0; i < N; ++i) { + for (i = 0; i < N; ++i) { #ifdef SCALAR_COMPLEX - real r = fabs(curfield[i].re * phase.re - curfield[i].im * phase.im); + real r = fabs(curfield[i].re * phase.re - curfield[i].im * phase.im); #else - real r = fabs(curfield[i].re - curfield[i].im); + real r = fabs(curfield[i].re - curfield[i].im); #endif - if (r > maxabs) - maxabs = r; - } - mpi_allreduce_1(&maxabs, real, SCALAR_MPI_TYPE, - MPI_MAX, mpb_comm); - for (i = N - 1; i >= 0; --i) { + if (r > maxabs) maxabs = r; + } + mpi_allreduce_1(&maxabs, real, SCALAR_MPI_TYPE, MPI_MAX, mpb_comm); + for (i = N - 1; i >= 0; --i) { #ifdef SCALAR_COMPLEX - real r = curfield[i].re * phase.re - curfield[i].im * phase.im; + real r = curfield[i].re * phase.re - curfield[i].im * phase.im; #else - real r = curfield[i].re - curfield[i].im; + real r = curfield[i].re - curfield[i].im; #endif - if (fabs(r) >= 0.5 * maxabs) { - maxabs_index = i; - maxabs_sign = r < 0 ? -1 : 1; - break; - } - } - if (i >= 0) /* convert index to global index in distributed array: */ - maxabs_index += mdata->local_y_start * mdata->nx * mdata->nz; - { - /* compute maximum index and corresponding sign over all the - processors, using the MPI_MAXLOC reduction operation: */ - struct twoint_struct {int i; int s;} x; - x.i = maxabs_index; x.s = maxabs_sign; - mpi_allreduce_1(&x, struct twoint_struct, MPI_2INT, - MPI_MAXLOC, mpb_comm); - maxabs_index = x.i; maxabs_sign = x.s; - } - ASSIGN_SCALAR(phase, - SCALAR_RE(phase)*maxabs_sign, SCALAR_IM(phase)*maxabs_sign); - - mpi_one_printf("Fixing %c-field (band %d) phase by %g + %gi; " - "max ampl. = %g\n", curfield_type, curfield_band, - SCALAR_RE(phase), SCALAR_IM(phase), maxabs); - - /* Now, multiply everything by this phase, *including* the - stored "raw" eigenvector in H, so that any future fields - that we compute will have a consistent phase: */ - for (i = 0; i < N; ++i) { - real a,b; - a = curfield[i].re; b = curfield[i].im; - curfield[i].re = a*SCALAR_RE(phase) - b*SCALAR_IM(phase); - curfield[i].im = a*SCALAR_IM(phase) + b*SCALAR_RE(phase); - } - for (i = 0; i < H.n; ++i) { - ASSIGN_MULT(H.data[i*H.p + curfield_band - 1], - H.data[i*H.p + curfield_band - 1], phase); - } + if (fabs(r) >= 0.5 * maxabs) { + maxabs_index = i; + maxabs_sign = r < 0 ? -1 : 1; + break; + } + } + if (i >= 0) /* convert index to global index in distributed array: */ + maxabs_index += mdata->local_y_start * mdata->nx * mdata->nz; + { + /* compute maximum index and corresponding sign over all the + processors, using the MPI_MAXLOC reduction operation: */ + struct twoint_struct { + int i; + int s; + } x; + x.i = maxabs_index; + x.s = maxabs_sign; + mpi_allreduce_1(&x, struct twoint_struct, MPI_2INT, MPI_MAXLOC, mpb_comm); + maxabs_index = x.i; + maxabs_sign = x.s; + } + ASSIGN_SCALAR(phase, SCALAR_RE(phase) * maxabs_sign, SCALAR_IM(phase) * maxabs_sign); + + mpi_one_printf("Fixing %c-field (band %d) phase by %g + %gi; " + "max ampl. = %g\n", + curfield_type, curfield_band, SCALAR_RE(phase), SCALAR_IM(phase), maxabs); + + /* Now, multiply everything by this phase, *including* the + stored "raw" eigenvector in H, so that any future fields + that we compute will have a consistent phase: */ + for (i = 0; i < N; ++i) { + real a, b; + a = curfield[i].re; + b = curfield[i].im; + curfield[i].re = a * SCALAR_RE(phase) - b * SCALAR_IM(phase); + curfield[i].im = a * SCALAR_IM(phase) + b * SCALAR_RE(phase); + } + for (i = 0; i < H.n; ++i) { + ASSIGN_MULT(H.data[i * H.p + curfield_band - 1], H.data[i * H.p + curfield_band - 1], phase); + } } /**************************************************************************/ @@ -549,366 +522,348 @@ void fix_field_phase(void) /* get_val returns process-specific output for HAVE_MPI: if the "point" (ix, iy, iz; stride) is on the local process, the value of data at that point is returned returned; otherwise - (i.e. point is not on local process) 0.0 is returned: calls to get_val should therefore + (i.e. point is not on local process) 0.0 is returned: calls to get_val should therefore be followed by sum-reduction via mpi_allreduce_1(..) in the caller (as in interp_val) */ -static real get_val(int ix, int iy, int iz, - int nx, int ny, int nz, int last_dim_size, - real *data, int stride, int conjugate) -{ +static real get_val(int ix, int iy, int iz, int nx, int ny, int nz, int last_dim_size, real *data, + int stride, int conjugate) { #ifndef SCALAR_COMPLEX - { - int nlast = last_dim_size / 2; - if ((nz > 1 ? iz : (ny > 1 ? iy : ix)) >= nlast) { - ix = ix ? nx - ix : ix; - iy = iy ? ny - iy : iy; - iz = iz ? nz - iz : iz; - conjugate = conjugate ? 1 : 0; - } - else - conjugate = 0; - if (nz > 1) nz = nlast; else if (ny > 1) ny = nlast; else nx = nlast; - } + { + int nlast = last_dim_size / 2; + if ((nz > 1 ? iz : (ny > 1 ? iy : ix)) >= nlast) { + ix = ix ? nx - ix : ix; + iy = iy ? ny - iy : iy; + iz = iz ? nz - iz : iz; + conjugate = conjugate ? 1 : 0; + } + else + conjugate = 0; + if (nz > 1) + nz = nlast; + else if (ny > 1) + ny = nlast; + else + nx = nlast; + } #else - conjugate = 0; + conjugate = 0; #endif #ifdef HAVE_MPI - /* due to real-space xy=>yx transposition in MPI configuration, we need to - do a little extra work here; see details e.g. in XYZ_LOOP macro */ - int local_ny = mdata->local_ny; /* dim of local process over y-indices */ - int local_y_start = mdata->local_y_start; - int local_iy = iy - local_y_start; - real val = 0; /* reduce local processes over this variable later */ - - /* check if local_iy is in the current process' data block */ - if (local_iy >= 0 && local_iy < local_ny) { - val = data[(((local_iy * nx) + ix) * nz + iz) * stride]; /* note transposition in x and y indices */ - } + /* due to real-space xy=>yx transposition in MPI configuration, we need to + do a little extra work here; see details e.g. in XYZ_LOOP macro */ + int local_ny = mdata->local_ny; /* dim of local process over y-indices */ + int local_y_start = mdata->local_y_start; + int local_iy = iy - local_y_start; + real val = 0; /* reduce local processes over this variable later */ + + /* check if local_iy is in the current process' data block */ + if (local_iy >= 0 && local_iy < local_ny) { + val = data[(((local_iy * nx) + ix) * nz + iz) * + stride]; /* note transposition in x and y indices */ + } #else /* no MPI */ - real val = data[(((ix * ny) + iy) * nz + iz) * stride]; + real val = data[(((ix * ny) + iy) * nz + iz) * stride]; #endif -if (conjugate) - return -val; -else - return val; + if (conjugate) + return -val; + else + return val; } -static real interp_val(vector3 p, int nx, int ny, int nz, int last_dim_size, - real *data, int stride, int conjugate) -{ - double ipart; - real rx, ry, rz, dx, dy, dz, v; - int x, y, z, x2, y2, z2; - - rx = modf(p.x/geometry_lattice.size.x + 0.5, &ipart); if (rx < 0) rx += 1; - ry = modf(p.y/geometry_lattice.size.y + 0.5, &ipart); if (ry < 0) ry += 1; - rz = modf(p.z/geometry_lattice.size.z + 0.5, &ipart); if (rz < 0) rz += 1; - - /* get the point corresponding to r in the grid: */ - x = rx * nx; - y = ry * ny; - z = rz * nz; - - /* get the difference between (x,y,z) and the actual point */ - dx = rx * nx - x; - dy = ry * ny - y; - dz = rz * nz - z; - - /* get the other closest point in the grid, with periodic boundaries: */ - x2 = (nx + (dx >= 0.0 ? x + 1 : x - 1)) % nx; - y2 = (ny + (dy >= 0.0 ? y + 1 : y - 1)) % ny; - z2 = (nz + (dz >= 0.0 ? z + 1 : z - 1)) % nz; - - /* take abs(d{xyz}) to get weights for {xyz} and {xyz}2: */ - dx = fabs(dx); - dy = fabs(dy); - dz = fabs(dz); - -#define D(x,y,z) (get_val(x,y,z,nx,ny,nz,last_dim_size, data,stride,conjugate)) - - v = (((D(x,y,z) * (1.0-dx) + D(x2,y,z) * dx) * (1.0-dy) + - (D(x,y2,z) * (1.0-dx) + D(x2,y2,z) * dx) * dy ) * (1.0-dz) + - ((D(x,y,z2) * (1.0-dx) + D(x2,y,z2) * dx) * (1.0-dy) + - (D(x,y2,z2) * (1.0-dx) + D(x2,y2,z2) * dx) * dy ) * dz); - - mpi_allreduce_1(&v, real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm); - return v; +static real interp_val(vector3 p, int nx, int ny, int nz, int last_dim_size, real *data, int stride, + int conjugate) { + double ipart; + real rx, ry, rz, dx, dy, dz, v; + int x, y, z, x2, y2, z2; + + rx = modf(p.x / geometry_lattice.size.x + 0.5, &ipart); + if (rx < 0) rx += 1; + ry = modf(p.y / geometry_lattice.size.y + 0.5, &ipart); + if (ry < 0) ry += 1; + rz = modf(p.z / geometry_lattice.size.z + 0.5, &ipart); + if (rz < 0) rz += 1; + + /* get the point corresponding to r in the grid: */ + x = rx * nx; + y = ry * ny; + z = rz * nz; + + /* get the difference between (x,y,z) and the actual point */ + dx = rx * nx - x; + dy = ry * ny - y; + dz = rz * nz - z; + + /* get the other closest point in the grid, with periodic boundaries: */ + x2 = (nx + (dx >= 0.0 ? x + 1 : x - 1)) % nx; + y2 = (ny + (dy >= 0.0 ? y + 1 : y - 1)) % ny; + z2 = (nz + (dz >= 0.0 ? z + 1 : z - 1)) % nz; + + /* take abs(d{xyz}) to get weights for {xyz} and {xyz}2: */ + dx = fabs(dx); + dy = fabs(dy); + dz = fabs(dz); + +#define D(x, y, z) (get_val(x, y, z, nx, ny, nz, last_dim_size, data, stride, conjugate)) + + v = (((D(x, y, z) * (1.0 - dx) + D(x2, y, z) * dx) * (1.0 - dy) + + (D(x, y2, z) * (1.0 - dx) + D(x2, y2, z) * dx) * dy) * + (1.0 - dz) + + ((D(x, y, z2) * (1.0 - dx) + D(x2, y, z2) * dx) * (1.0 - dy) + + (D(x, y2, z2) * (1.0 - dx) + D(x2, y2, z2) * dx) * dy) * + dz); + + mpi_allreduce_1(&v, real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm); + return v; #undef D } -static scalar_complex interp_cval(vector3 p, - int nx, int ny, int nz, int last_dim_size, - real *data, int stride) -{ - scalar_complex cval; - cval.re = interp_val(p, nx,ny,nz,last_dim_size, data, stride, 0); - cval.im = interp_val(p, nx,ny,nz,last_dim_size,data + 1, stride, 1); - return cval; +static scalar_complex interp_cval(vector3 p, int nx, int ny, int nz, int last_dim_size, real *data, + int stride) { + scalar_complex cval; + cval.re = interp_val(p, nx, ny, nz, last_dim_size, data, stride, 0); + cval.im = interp_val(p, nx, ny, nz, last_dim_size, data + 1, stride, 1); + return cval; } -#define f_interp_val(p,f,data,stride,conj) interp_val(p,f->nx,f->ny,f->nz,f->last_dim_size,data,stride,conj) -#define f_interp_cval(p,f,data,stride) interp_cval(p,f->nx,f->ny,f->nz,f->last_dim_size,data,stride) +#define f_interp_val(p, f, data, stride, conj) \ + interp_val(p, f->nx, f->ny, f->nz, f->last_dim_size, data, stride, conj) +#define f_interp_cval(p, f, data, stride) \ + interp_cval(p, f->nx, f->ny, f->nz, f->last_dim_size, data, stride) -static symmetric_matrix interp_eps_inv(vector3 p) -{ - int stride = sizeof(symmetric_matrix) / sizeof(real); - symmetric_matrix eps_inv; +static symmetric_matrix interp_eps_inv(vector3 p) { + int stride = sizeof(symmetric_matrix) / sizeof(real); + symmetric_matrix eps_inv; - eps_inv.m00 = f_interp_val(p, mdata, &mdata->eps_inv->m00, stride, 0); - eps_inv.m11 = f_interp_val(p, mdata, &mdata->eps_inv->m11, stride, 0); - eps_inv.m22 = f_interp_val(p, mdata, &mdata->eps_inv->m22, stride, 0); + eps_inv.m00 = f_interp_val(p, mdata, &mdata->eps_inv->m00, stride, 0); + eps_inv.m11 = f_interp_val(p, mdata, &mdata->eps_inv->m11, stride, 0); + eps_inv.m22 = f_interp_val(p, mdata, &mdata->eps_inv->m22, stride, 0); #ifdef WITH_HERMITIAN_EPSILON - eps_inv.m01 = f_interp_cval(p, mdata, &mdata->eps_inv->m01.re, stride); - eps_inv.m02 = f_interp_cval(p, mdata, &mdata->eps_inv->m02.re, stride); - eps_inv.m12 = f_interp_cval(p, mdata, &mdata->eps_inv->m12.re, stride); + eps_inv.m01 = f_interp_cval(p, mdata, &mdata->eps_inv->m01.re, stride); + eps_inv.m02 = f_interp_cval(p, mdata, &mdata->eps_inv->m02.re, stride); + eps_inv.m12 = f_interp_cval(p, mdata, &mdata->eps_inv->m12.re, stride); #else - eps_inv.m01 = f_interp_val(p, mdata, &mdata->eps_inv->m01, stride, 0); - eps_inv.m02 = f_interp_val(p, mdata, &mdata->eps_inv->m02, stride, 0); - eps_inv.m12 = f_interp_val(p, mdata, &mdata->eps_inv->m12, stride, 0); + eps_inv.m01 = f_interp_val(p, mdata, &mdata->eps_inv->m01, stride, 0); + eps_inv.m02 = f_interp_val(p, mdata, &mdata->eps_inv->m02, stride, 0); + eps_inv.m12 = f_interp_val(p, mdata, &mdata->eps_inv->m12, stride, 0); #endif - return eps_inv; + return eps_inv; } -number get_epsilon_point(vector3 p) -{ - symmetric_matrix eps_inv; - eps_inv = interp_eps_inv(p); - return mean_medium_from_matrix(&eps_inv); +number get_epsilon_point(vector3 p) { + symmetric_matrix eps_inv; + eps_inv = interp_eps_inv(p); + return mean_medium_from_matrix(&eps_inv); } -cmatrix3x3 get_epsilon_inverse_tensor_point(vector3 p) -{ - symmetric_matrix eps_inv; - eps_inv = interp_eps_inv(p); +cmatrix3x3 get_epsilon_inverse_tensor_point(vector3 p) { + symmetric_matrix eps_inv; + eps_inv = interp_eps_inv(p); #ifdef WITH_HERMITIAN_EPSILON - return make_hermitian_cmatrix3x3(eps_inv.m00,eps_inv.m11,eps_inv.m22, - cscalar2cnumber(eps_inv.m01), - cscalar2cnumber(eps_inv.m02), - cscalar2cnumber(eps_inv.m12)); + return make_hermitian_cmatrix3x3(eps_inv.m00, eps_inv.m11, eps_inv.m22, + cscalar2cnumber(eps_inv.m01), cscalar2cnumber(eps_inv.m02), + cscalar2cnumber(eps_inv.m12)); #else - return make_hermitian_cmatrix3x3(eps_inv.m00,eps_inv.m11,eps_inv.m22, - make_cnumber(eps_inv.m01,0), - make_cnumber(eps_inv.m02,0), - make_cnumber(eps_inv.m12,0)); + return make_hermitian_cmatrix3x3(eps_inv.m00, eps_inv.m11, eps_inv.m22, + make_cnumber(eps_inv.m01, 0), make_cnumber(eps_inv.m02, 0), + make_cnumber(eps_inv.m12, 0)); #endif } -number get_energy_point(vector3 p) -{ - CHECK(curfield && strchr("DHBR", curfield_type), - "compute-field-energy must be called before get-energy-point"); - return f_interp_val(p, mdata, (real *) curfield, 1, 0); +number get_energy_point(vector3 p) { + CHECK(curfield && strchr("DHBR", curfield_type), + "compute-field-energy must be called before get-energy-point"); + return f_interp_val(p, mdata, (real *)curfield, 1, 0); } -cvector3 get_bloch_field_point(vector3 p) -{ - scalar_complex field[3]; - cvector3 F; - - CHECK(curfield && strchr("dhbecv", curfield_type), - "field must be must be loaded before get-*field*-point"); - field[0] = f_interp_cval(p, mdata, &curfield[0].re, 6); - field[1] = f_interp_cval(p, mdata, &curfield[1].re, 6); - field[2] = f_interp_cval(p, mdata, &curfield[2].re, 6); - F.x = cscalar2cnumber(field[0]); - F.y = cscalar2cnumber(field[1]); - F.z = cscalar2cnumber(field[2]); - return F; +cvector3 get_bloch_field_point(vector3 p) { + scalar_complex field[3]; + cvector3 F; + + CHECK(curfield && strchr("dhbecv", curfield_type), + "field must be must be loaded before get-*field*-point"); + field[0] = f_interp_cval(p, mdata, &curfield[0].re, 6); + field[1] = f_interp_cval(p, mdata, &curfield[1].re, 6); + field[2] = f_interp_cval(p, mdata, &curfield[2].re, 6); + F.x = cscalar2cnumber(field[0]); + F.y = cscalar2cnumber(field[1]); + F.z = cscalar2cnumber(field[2]); + return F; } -cvector3 get_field_point(vector3 p) -{ - scalar_complex field[3], phase; - cvector3 F; - - CHECK(curfield && strchr("dhbecv", curfield_type), - "field must be must be loaded before get-*field*-point"); - field[0] = f_interp_cval(p, mdata, &curfield[0].re, 6); - field[1] = f_interp_cval(p, mdata, &curfield[1].re, 6); - field[2] = f_interp_cval(p, mdata, &curfield[2].re, 6); - - if (curfield_type != 'v') { - double phase_phi = TWOPI * - (cur_kvector.x * (p.x/geometry_lattice.size.x) + - cur_kvector.y * (p.y/geometry_lattice.size.y) + - cur_kvector.z * (p.z/geometry_lattice.size.z)); - CASSIGN_SCALAR(phase, cos(phase_phi), sin(phase_phi)); - CASSIGN_MULT(field[0], field[0], phase); - CASSIGN_MULT(field[1], field[1], phase); - CASSIGN_MULT(field[2], field[2], phase); - } - - F.x = cscalar2cnumber(field[0]); - F.y = cscalar2cnumber(field[1]); - F.z = cscalar2cnumber(field[2]); - return F; +cvector3 get_field_point(vector3 p) { + scalar_complex field[3], phase; + cvector3 F; + + CHECK(curfield && strchr("dhbecv", curfield_type), + "field must be must be loaded before get-*field*-point"); + field[0] = f_interp_cval(p, mdata, &curfield[0].re, 6); + field[1] = f_interp_cval(p, mdata, &curfield[1].re, 6); + field[2] = f_interp_cval(p, mdata, &curfield[2].re, 6); + + if (curfield_type != 'v') { + double phase_phi = TWOPI * (cur_kvector.x * (p.x / geometry_lattice.size.x) + + cur_kvector.y * (p.y / geometry_lattice.size.y) + + cur_kvector.z * (p.z / geometry_lattice.size.z)); + CASSIGN_SCALAR(phase, cos(phase_phi), sin(phase_phi)); + CASSIGN_MULT(field[0], field[0], phase); + CASSIGN_MULT(field[1], field[1], phase); + CASSIGN_MULT(field[2], field[2], phase); + } + + F.x = cscalar2cnumber(field[0]); + F.y = cscalar2cnumber(field[1]); + F.z = cscalar2cnumber(field[2]); + return F; } -cnumber get_bloch_cscalar_point(vector3 p) -{ - CHECK(curfield && strchr("C", curfield_type), - "cscalar must be must be loaded before get-*cscalar*-point"); +cnumber get_bloch_cscalar_point(vector3 p) { + CHECK(curfield && strchr("C", curfield_type), + "cscalar must be must be loaded before get-*cscalar*-point"); - return cscalar2cnumber(f_interp_cval(p, mdata, &curfield[0].re, 2)); + return cscalar2cnumber(f_interp_cval(p, mdata, &curfield[0].re, 2)); } -cnumber get_cscalar_point(vector3 p) -{ - scalar_complex s; +cnumber get_cscalar_point(vector3 p) { + scalar_complex s; - CHECK(curfield && strchr("C", curfield_type), - "cscalar must be must be loaded before get-*cscalar*-point"); + CHECK(curfield && strchr("C", curfield_type), + "cscalar must be must be loaded before get-*cscalar*-point"); - s = f_interp_cval(p, mdata, &curfield[0].re, 2); + s = f_interp_cval(p, mdata, &curfield[0].re, 2); - if (curfield_type == 'C') { - scalar_complex phase; - double phase_phi = TWOPI * - (cur_kvector.x * (p.x/geometry_lattice.size.x) + - cur_kvector.y * (p.y/geometry_lattice.size.y) + - cur_kvector.z * (p.z/geometry_lattice.size.z)); - CASSIGN_SCALAR(phase, cos(phase_phi), sin(phase_phi)); - CASSIGN_MULT(s, s, phase); - } + if (curfield_type == 'C') { + scalar_complex phase; + double phase_phi = TWOPI * (cur_kvector.x * (p.x / geometry_lattice.size.x) + + cur_kvector.y * (p.y / geometry_lattice.size.y) + + cur_kvector.z * (p.z / geometry_lattice.size.z)); + CASSIGN_SCALAR(phase, cos(phase_phi), sin(phase_phi)); + CASSIGN_MULT(s, s, phase); + } - return cscalar2cnumber(s); + return cscalar2cnumber(s); } -number rscalar_field_get_point(SCM fo, vector3 p) -{ - field_smob *f = assert_field_smob(fo); - CHECK(f->type == RSCALAR_FIELD_SMOB, - "invalid argument to rscalar-field-get-point"); - return f_interp_val(p, f, f->f.rs, 1, 0); +number rscalar_field_get_point(SCM fo, vector3 p) { + field_smob *f = assert_field_smob(fo); + CHECK(f->type == RSCALAR_FIELD_SMOB, "invalid argument to rscalar-field-get-point"); + return f_interp_val(p, f, f->f.rs, 1, 0); } -cvector3 cvector_field_get_point_bloch(SCM fo, vector3 p) -{ - scalar_complex field[3]; - cvector3 F; - field_smob *f = assert_field_smob(fo); - CHECK(f->type == CVECTOR_FIELD_SMOB, - "invalid argument to cvector-field-get-point"); - field[0] = f_interp_cval(p, f, &f->f.cv[0].re, 6); - field[1] = f_interp_cval(p, f, &f->f.cv[1].re, 6); - field[2] = f_interp_cval(p, f, &f->f.cv[2].re, 6); - F.x = cscalar2cnumber(field[0]); - F.y = cscalar2cnumber(field[1]); - F.z = cscalar2cnumber(field[2]); - return F; +cvector3 cvector_field_get_point_bloch(SCM fo, vector3 p) { + scalar_complex field[3]; + cvector3 F; + field_smob *f = assert_field_smob(fo); + CHECK(f->type == CVECTOR_FIELD_SMOB, "invalid argument to cvector-field-get-point"); + field[0] = f_interp_cval(p, f, &f->f.cv[0].re, 6); + field[1] = f_interp_cval(p, f, &f->f.cv[1].re, 6); + field[2] = f_interp_cval(p, f, &f->f.cv[2].re, 6); + F.x = cscalar2cnumber(field[0]); + F.y = cscalar2cnumber(field[1]); + F.z = cscalar2cnumber(field[2]); + return F; } -cvector3 cvector_field_get_point(SCM fo, vector3 p) -{ - scalar_complex field[3]; - cvector3 F; - field_smob *f = assert_field_smob(fo); - CHECK(f->type == CVECTOR_FIELD_SMOB, - "invalid argument to cvector-field-get-point"); - - field[0] = f_interp_cval(p, f, &f->f.cv[0].re, 6); - field[1] = f_interp_cval(p, f, &f->f.cv[1].re, 6); - field[2] = f_interp_cval(p, f, &f->f.cv[2].re, 6); - - if (f->type_char != 'v') { /* v fields have no kvector */ - scalar_complex phase; - double phase_phi = TWOPI * - (cur_kvector.x * (p.x/geometry_lattice.size.x) + - cur_kvector.y * (p.y/geometry_lattice.size.y) + - cur_kvector.z * (p.z/geometry_lattice.size.z)); - CASSIGN_SCALAR(phase, cos(phase_phi), sin(phase_phi)); - CASSIGN_MULT(field[0], field[0], phase); - CASSIGN_MULT(field[1], field[1], phase); - CASSIGN_MULT(field[2], field[2], phase); - } - - F.x = cscalar2cnumber(field[0]); - F.y = cscalar2cnumber(field[1]); - F.z = cscalar2cnumber(field[2]); - return F; +cvector3 cvector_field_get_point(SCM fo, vector3 p) { + scalar_complex field[3]; + cvector3 F; + field_smob *f = assert_field_smob(fo); + CHECK(f->type == CVECTOR_FIELD_SMOB, "invalid argument to cvector-field-get-point"); + + field[0] = f_interp_cval(p, f, &f->f.cv[0].re, 6); + field[1] = f_interp_cval(p, f, &f->f.cv[1].re, 6); + field[2] = f_interp_cval(p, f, &f->f.cv[2].re, 6); + + if (f->type_char != 'v') { /* v fields have no kvector */ + scalar_complex phase; + double phase_phi = TWOPI * (cur_kvector.x * (p.x / geometry_lattice.size.x) + + cur_kvector.y * (p.y / geometry_lattice.size.y) + + cur_kvector.z * (p.z / geometry_lattice.size.z)); + CASSIGN_SCALAR(phase, cos(phase_phi), sin(phase_phi)); + CASSIGN_MULT(field[0], field[0], phase); + CASSIGN_MULT(field[1], field[1], phase); + CASSIGN_MULT(field[2], field[2], phase); + } + + F.x = cscalar2cnumber(field[0]); + F.y = cscalar2cnumber(field[1]); + F.z = cscalar2cnumber(field[2]); + return F; } -cnumber cscalar_field_get_point_bloch(SCM fo, vector3 p) -{ - field_smob *f = assert_field_smob(fo); - CHECK(f->type == CSCALAR_FIELD_SMOB, - "invalid argument to cscalar-field-get-point-bloch"); - return cscalar2cnumber(f_interp_cval(p, f, &f->f.cv[0].re, 2)); +cnumber cscalar_field_get_point_bloch(SCM fo, vector3 p) { + field_smob *f = assert_field_smob(fo); + CHECK(f->type == CSCALAR_FIELD_SMOB, "invalid argument to cscalar-field-get-point-bloch"); + return cscalar2cnumber(f_interp_cval(p, f, &f->f.cv[0].re, 2)); } -cnumber cscalar_field_get_point(SCM fo, vector3 p) -{ - scalar_complex s; - field_smob *f = assert_field_smob(fo); - CHECK(f->type == CSCALAR_FIELD_SMOB, - "invalid argument to cscalar-field-get-point"); - - s = f_interp_cval(p, f, &f->f.cv[0].re, 2); - - if (f->type_char == 'C') { /* have kvector */ - scalar_complex phase; - double phase_phi = TWOPI * - (cur_kvector.x * (p.x/geometry_lattice.size.x) + - cur_kvector.y * (p.y/geometry_lattice.size.y) + - cur_kvector.z * (p.z/geometry_lattice.size.z)); - CASSIGN_SCALAR(phase, cos(phase_phi), sin(phase_phi)); - CASSIGN_MULT(s, s, phase); - } - - return cscalar2cnumber(s); +cnumber cscalar_field_get_point(SCM fo, vector3 p) { + scalar_complex s; + field_smob *f = assert_field_smob(fo); + CHECK(f->type == CSCALAR_FIELD_SMOB, "invalid argument to cscalar-field-get-point"); + + s = f_interp_cval(p, f, &f->f.cv[0].re, 2); + + if (f->type_char == 'C') { /* have kvector */ + scalar_complex phase; + double phase_phi = TWOPI * (cur_kvector.x * (p.x / geometry_lattice.size.x) + + cur_kvector.y * (p.y / geometry_lattice.size.y) + + cur_kvector.z * (p.z / geometry_lattice.size.z)); + CASSIGN_SCALAR(phase, cos(phase_phi), sin(phase_phi)); + CASSIGN_MULT(s, s, phase); + } + + return cscalar2cnumber(s); } /**************************************************************************/ /* compute the fraction of the field energy that is located in the given range of dielectric constants: */ -number compute_energy_in_dielectric(number eps_low, number eps_high) -{ - int N, i, last_dim, last_dim_stored, nx, nz, local_y_start; - real *energy = (real *) curfield; - real epsilon, energy_sum = 0.0; - - if (!curfield || !strchr("DHBR", curfield_type)) { - mpi_one_fprintf(stderr, "The D or H energy density must be loaded first.\n"); - return 0.0; - } - - N = mdata->fft_output_size; - last_dim = mdata->last_dim; - last_dim_stored = - mdata->last_dim_size / (sizeof(scalar_complex)/sizeof(scalar)); - nx = mdata->nx; nz = mdata->nz; local_y_start = mdata->local_y_start; - - for (i = 0; i < N; ++i) { - epsilon = mean_medium_from_matrix(mdata->eps_inv +i); - if (epsilon >= eps_low && epsilon <= eps_high) { - energy_sum += energy[i]; +number compute_energy_in_dielectric(number eps_low, number eps_high) { + int N, i, last_dim, last_dim_stored, nx, nz, local_y_start; + real *energy = (real *)curfield; + real epsilon, energy_sum = 0.0; + + if (!curfield || !strchr("DHBR", curfield_type)) { + mpi_one_fprintf(stderr, "The D or H energy density must be loaded first.\n"); + return 0.0; + } + + N = mdata->fft_output_size; + last_dim = mdata->last_dim; + last_dim_stored = mdata->last_dim_size / (sizeof(scalar_complex) / sizeof(scalar)); + nx = mdata->nx; + nz = mdata->nz; + local_y_start = mdata->local_y_start; + + for (i = 0; i < N; ++i) { + epsilon = mean_medium_from_matrix(mdata->eps_inv + i); + if (epsilon >= eps_low && epsilon <= eps_high) { + energy_sum += energy[i]; #ifndef SCALAR_COMPLEX - /* most points are counted twice, by rfftw output symmetry: */ - { - int last_index; -# ifdef HAVE_MPI - if (nz == 1) /* 2d: 1st dim. is truncated one */ - last_index = i / nx + local_y_start; - else - last_index = i % last_dim_stored; -# else - last_index = i % last_dim_stored; -# endif - if (last_index != 0 && 2*last_index != last_dim) - energy_sum += energy[i]; - } + /* most points are counted twice, by rfftw output symmetry: */ + { + int last_index; +#ifdef HAVE_MPI + if (nz == 1) /* 2d: 1st dim. is truncated one */ + last_index = i / nx + local_y_start; + else + last_index = i % last_dim_stored; +#else + last_index = i % last_dim_stored; +#endif + if (last_index != 0 && 2 * last_index != last_dim) energy_sum += energy[i]; + } #endif - } - } - mpi_allreduce_1(&energy_sum, real, SCALAR_MPI_TYPE, - MPI_SUM, mpb_comm); - energy_sum *= Vol / H.N; - return energy_sum; + } + } + mpi_allreduce_1(&energy_sum, real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm); + energy_sum *= Vol / H.N; + return energy_sum; } /**************************************************************************/ @@ -917,383 +872,331 @@ number compute_energy_in_dielectric(number eps_low, number eps_high) append a parity specifier (if any) (e.g. ".te"), returning a new string, which should be deallocated with free(). fname or prefix may be NULL, in which case they are treated as the empty string. */ -static char *fix_fname(const char *fname, const char *prefix, - maxwell_data *d, int parity_suffix) -{ - char *s; - CHK_MALLOC(s, char, - (fname ? strlen(fname) : 0) + - (prefix ? strlen(prefix) : 0) + 20); - strcpy(s, prefix ? prefix : ""); - strcat(s, fname ? fname : ""); - if (parity_suffix && d->parity != NO_PARITY) { - /* assumes parity suffix is less than 20 characters; - currently it is less than 12 */ - strcat(s, "."); - strcat(s, parity_string(d)); - } - return s; +static char *fix_fname(const char *fname, const char *prefix, maxwell_data *d, int parity_suffix) { + char *s; + CHK_MALLOC(s, char, (fname ? strlen(fname) : 0) + (prefix ? strlen(prefix) : 0) + 20); + strcpy(s, prefix ? prefix : ""); + strcat(s, fname ? fname : ""); + if (parity_suffix && d->parity != NO_PARITY) { + /* assumes parity suffix is less than 20 characters; + currently it is less than 12 */ + strcat(s, "."); + strcat(s, parity_string(d)); + } + return s; } -static void output_scalarfield(real *vals, - const int dims[3], - const int local_dims[3], - const int start[3], - matrixio_id file_id, - const char *dataname, - int last_dim_index, - int last_dim_start, int last_dim_size, - int first_dim_start, int first_dim_size, - int write_start0_special) -{ - matrixio_id data_id = {-1, 1}; +static void output_scalarfield(real *vals, const int dims[3], const int local_dims[3], + const int start[3], matrixio_id file_id, const char *dataname, + int last_dim_index, int last_dim_start, int last_dim_size, + int first_dim_start, int first_dim_size, int write_start0_special) { + matrixio_id data_id = {-1, 1}; - fieldio_write_real_vals(vals, 3, dims, - local_dims, start, file_id, 0, - dataname, &data_id); + fieldio_write_real_vals(vals, 3, dims, local_dims, start, file_id, 0, dataname, &data_id); #ifndef SCALAR_COMPLEX - { - int start_new[3], local_dims_new[3]; - - start_new[0] = start[0]; - start_new[1] = start[1]; - start_new[2] = start[2]; - local_dims_new[0] = local_dims[0]; - local_dims_new[1] = local_dims[1]; - local_dims_new[2] = local_dims[2]; - - maxwell_scalarfield_otherhalf(mdata, vals); - start_new[last_dim_index] = last_dim_start; - local_dims_new[last_dim_index] = last_dim_size; - start_new[0] = first_dim_start; - local_dims_new[0] = first_dim_size; - if (write_start0_special) { - /* The conjugated array half may be discontiguous. - First, write the part not containing start_new[0], and - then write the start_new[0] slab. */ - fieldio_write_real_vals(vals + - local_dims_new[1] * local_dims_new[2], - 3, dims, local_dims_new, start_new, - file_id, 1, dataname, &data_id); - local_dims_new[0] = 1; - start_new[0] = 0; - fieldio_write_real_vals(vals, 3, dims, - local_dims_new, start_new, - file_id, 1, dataname, &data_id); - } - else { - fieldio_write_real_vals(vals, 3, dims, - local_dims_new, start_new, - file_id, 1, dataname, &data_id); - } - } + { + int start_new[3], local_dims_new[3]; + + start_new[0] = start[0]; + start_new[1] = start[1]; + start_new[2] = start[2]; + local_dims_new[0] = local_dims[0]; + local_dims_new[1] = local_dims[1]; + local_dims_new[2] = local_dims[2]; + + maxwell_scalarfield_otherhalf(mdata, vals); + start_new[last_dim_index] = last_dim_start; + local_dims_new[last_dim_index] = last_dim_size; + start_new[0] = first_dim_start; + local_dims_new[0] = first_dim_size; + if (write_start0_special) { + /* The conjugated array half may be discontiguous. + First, write the part not containing start_new[0], and + then write the start_new[0] slab. */ + fieldio_write_real_vals(vals + local_dims_new[1] * local_dims_new[2], 3, dims, local_dims_new, + start_new, file_id, 1, dataname, &data_id); + local_dims_new[0] = 1; + start_new[0] = 0; + fieldio_write_real_vals(vals, 3, dims, local_dims_new, start_new, file_id, 1, dataname, + &data_id); + } + else { + fieldio_write_real_vals(vals, 3, dims, local_dims_new, start_new, file_id, 1, dataname, + &data_id); + } + } #endif - if (data_id.id >= 0) - matrixio_close_dataset(data_id); + if (data_id.id >= 0) matrixio_close_dataset(data_id); } /* given the field in curfield, store it to HDF (or whatever) using the matrixio (fieldio) routines. Allow the component to be specified (which_component 0/1/2 = x/y/z, -1 = all) for vector fields. Also allow the user to specify a prefix string for the filename. */ -void output_field_to_file(integer which_component, string filename_prefix) -{ - char fname[100], *fname2, description[100]; - int dims[3], local_dims[3], start[3] = {0,0,0}; - matrixio_id file_id = {-1,1}; - int attr_dims[2] = {3, 3}; - real output_k[3]; /* kvector in reciprocal lattice basis */ - real output_R[3][3]; - - /* where to put "otherhalf" block of output, only used for real scalars */ - int last_dim_index = 0; - int last_dim_start = 0, last_dim_size = 0; - int first_dim_start = 0, first_dim_size = 0; - int write_start0_special = 0; - - if (!curfield) { - mpi_one_fprintf(stderr, - "fields, energy dens., or epsilon must be loaded first.\n"); - return; - } +void output_field_to_file(integer which_component, string filename_prefix) { + char fname[100], *fname2, description[100]; + int dims[3], local_dims[3], start[3] = {0, 0, 0}; + matrixio_id file_id = {-1, 1}; + int attr_dims[2] = {3, 3}; + real output_k[3]; /* kvector in reciprocal lattice basis */ + real output_R[3][3]; + + /* where to put "otherhalf" block of output, only used for real scalars */ + int last_dim_index = 0; + int last_dim_start = 0, last_dim_size = 0; + int first_dim_start = 0, first_dim_size = 0; + int write_start0_special = 0; + + if (!curfield) { + mpi_one_fprintf(stderr, "fields, energy dens., or epsilon must be loaded first.\n"); + return; + } #ifdef HAVE_MPI - /* The first two dimensions (x and y) of the position-space fields - are transposed when we use MPI, so we need to transpose everything. */ - dims[0] = mdata->ny; - local_dims[1] = dims[1] = mdata->nx; - local_dims[2] = dims[2] = mdata->nz; - local_dims[0] = mdata->local_ny; - start[0] = mdata->local_y_start; -# ifndef SCALAR_COMPLEX - /* Ugh, hairy. See also maxwell_vectorfield_otherhalf. */ - if (dims[2] == 1) { - last_dim_index = 0; - first_dim_size = local_dims[0]; - first_dim_start = dims[0] - (start[0] + local_dims[0] - 1); - - if (start[0] == 0) - --first_dim_size; /* DC frequency is not in other half */ - if (start[0] + local_dims[0] == mdata->last_dim_size / 2 && - dims[0] % 2 == 0) { - --first_dim_size; /* Nyquist frequency is not in other half */ - ++first_dim_start; - } - - last_dim_start = first_dim_start; - last_dim_size = first_dim_size; - } - else { - last_dim_index = 2; - local_dims[last_dim_index] = mdata->last_dim_size / 2; - if (start[0] == 0) { - first_dim_size = local_dims[0] - 1; - first_dim_start = dims[0] - first_dim_size; - write_start0_special = 1; - } - else { - first_dim_start = dims[0] - (start[0] + local_dims[0] - 1); - first_dim_size = local_dims[0]; - } - last_dim_start = local_dims[last_dim_index]; - last_dim_size = dims[last_dim_index] - local_dims[last_dim_index]; - } -# endif /* ! SCALAR_COMPLEX */ - output_k[0] = R[1][0]*mdata->current_k[0] + R[1][1]*mdata->current_k[1] - + R[1][2]*mdata->current_k[2]; - output_k[1] = R[0][0]*mdata->current_k[0] + R[0][1]*mdata->current_k[1] - + R[0][2]*mdata->current_k[2]; - output_k[2] = R[2][0]*mdata->current_k[0] + R[2][1]*mdata->current_k[1] - + R[2][2]*mdata->current_k[2]; - output_R[0][0]=R[1][0]; output_R[0][1]=R[1][1]; output_R[0][2]=R[1][2]; - output_R[1][0]=R[0][0]; output_R[1][1]=R[0][1]; output_R[1][2]=R[0][2]; - output_R[2][0]=R[2][0]; output_R[2][1]=R[2][1]; output_R[2][2]=R[2][2]; + /* The first two dimensions (x and y) of the position-space fields + are transposed when we use MPI, so we need to transpose everything. */ + dims[0] = mdata->ny; + local_dims[1] = dims[1] = mdata->nx; + local_dims[2] = dims[2] = mdata->nz; + local_dims[0] = mdata->local_ny; + start[0] = mdata->local_y_start; +#ifndef SCALAR_COMPLEX + /* Ugh, hairy. See also maxwell_vectorfield_otherhalf. */ + if (dims[2] == 1) { + last_dim_index = 0; + first_dim_size = local_dims[0]; + first_dim_start = dims[0] - (start[0] + local_dims[0] - 1); + + if (start[0] == 0) --first_dim_size; /* DC frequency is not in other half */ + if (start[0] + local_dims[0] == mdata->last_dim_size / 2 && dims[0] % 2 == 0) { + --first_dim_size; /* Nyquist frequency is not in other half */ + ++first_dim_start; + } + + last_dim_start = first_dim_start; + last_dim_size = first_dim_size; + } + else { + last_dim_index = 2; + local_dims[last_dim_index] = mdata->last_dim_size / 2; + if (start[0] == 0) { + first_dim_size = local_dims[0] - 1; + first_dim_start = dims[0] - first_dim_size; + write_start0_special = 1; + } + else { + first_dim_start = dims[0] - (start[0] + local_dims[0] - 1); + first_dim_size = local_dims[0]; + } + last_dim_start = local_dims[last_dim_index]; + last_dim_size = dims[last_dim_index] - local_dims[last_dim_index]; + } +#endif /* ! SCALAR_COMPLEX */ + output_k[0] = + R[1][0] * mdata->current_k[0] + R[1][1] * mdata->current_k[1] + R[1][2] * mdata->current_k[2]; + output_k[1] = + R[0][0] * mdata->current_k[0] + R[0][1] * mdata->current_k[1] + R[0][2] * mdata->current_k[2]; + output_k[2] = + R[2][0] * mdata->current_k[0] + R[2][1] * mdata->current_k[1] + R[2][2] * mdata->current_k[2]; + output_R[0][0] = R[1][0]; + output_R[0][1] = R[1][1]; + output_R[0][2] = R[1][2]; + output_R[1][0] = R[0][0]; + output_R[1][1] = R[0][1]; + output_R[1][2] = R[0][2]; + output_R[2][0] = R[2][0]; + output_R[2][1] = R[2][1]; + output_R[2][2] = R[2][2]; #else /* ! HAVE_MPI */ - dims[0] = mdata->nx; - local_dims[1] = dims[1] = mdata->ny; - local_dims[2] = dims[2] = mdata->nz; - local_dims[0] = mdata->local_nx; -# ifndef SCALAR_COMPLEX - last_dim_index = dims[2] == 1 ? (dims[1] == 1 ? 0 : 1) : 2; - local_dims[last_dim_index] = mdata->last_dim_size / 2; - last_dim_start = local_dims[last_dim_index]; - last_dim_size = dims[last_dim_index] - local_dims[last_dim_index]; - first_dim_start = last_dim_index ? 0 : last_dim_start; - first_dim_size = last_dim_index ? local_dims[0] : last_dim_size; -# endif - start[0] = mdata->local_x_start; - output_k[0] = R[0][0]*mdata->current_k[0] + R[0][1]*mdata->current_k[1] - + R[0][2]*mdata->current_k[2]; - output_k[1] = R[1][0]*mdata->current_k[0] + R[1][1]*mdata->current_k[1] - + R[1][2]*mdata->current_k[2]; - output_k[2] = R[2][0]*mdata->current_k[0] + R[2][1]*mdata->current_k[1] - + R[2][2]*mdata->current_k[2]; - output_R[0][0]=R[0][0]; output_R[0][1]=R[0][1]; output_R[0][2]=R[0][2]; - output_R[1][0]=R[1][0]; output_R[1][1]=R[1][1]; output_R[1][2]=R[1][2]; - output_R[2][0]=R[2][0]; output_R[2][1]=R[2][1]; output_R[2][2]=R[2][2]; + dims[0] = mdata->nx; + local_dims[1] = dims[1] = mdata->ny; + local_dims[2] = dims[2] = mdata->nz; + local_dims[0] = mdata->local_nx; +#ifndef SCALAR_COMPLEX + last_dim_index = dims[2] == 1 ? (dims[1] == 1 ? 0 : 1) : 2; + local_dims[last_dim_index] = mdata->last_dim_size / 2; + last_dim_start = local_dims[last_dim_index]; + last_dim_size = dims[last_dim_index] - local_dims[last_dim_index]; + first_dim_start = last_dim_index ? 0 : last_dim_start; + first_dim_size = last_dim_index ? local_dims[0] : last_dim_size; +#endif + start[0] = mdata->local_x_start; + output_k[0] = + R[0][0] * mdata->current_k[0] + R[0][1] * mdata->current_k[1] + R[0][2] * mdata->current_k[2]; + output_k[1] = + R[1][0] * mdata->current_k[0] + R[1][1] * mdata->current_k[1] + R[1][2] * mdata->current_k[2]; + output_k[2] = + R[2][0] * mdata->current_k[0] + R[2][1] * mdata->current_k[1] + R[2][2] * mdata->current_k[2]; + output_R[0][0] = R[0][0]; + output_R[0][1] = R[0][1]; + output_R[0][2] = R[0][2]; + output_R[1][0] = R[1][0]; + output_R[1][1] = R[1][1]; + output_R[1][2] = R[1][2]; + output_R[2][0] = R[2][0]; + output_R[2][1] = R[2][1]; + output_R[2][2] = R[2][2]; #endif /* ! HAVE_MPI */ - if (strchr("Rv", curfield_type)) /* generic scalar/vector field */ - output_k[0] = output_k[1] = output_k[2] = 0.0; /* don't know k */ - - if (strchr("dhbecv", curfield_type)) { /* outputting vector field */ - matrixio_id data_id[6] = {{-1,1},{-1,1},{-1,1},{-1,1},{-1,1},{-1,1}}; - int i; - - sprintf(fname, "%c.k%02d.b%02d", - curfield_type, kpoint_index, curfield_band); - if (which_component >= 0) { - char comp_str[] = ".x"; - comp_str[1] = 'x' + which_component; - strcat(fname, comp_str); - } - sprintf(description, "%c field, kpoint %d, band %d, freq=%g", - curfield_type, kpoint_index, curfield_band, - freqs.items[curfield_band - 1]); - fname2 = fix_fname(fname, filename_prefix, mdata, 1); - mpi_one_printf("Outputting fields to %s...\n", fname2); - file_id = matrixio_create(fname2); - free(fname2); - fieldio_write_complex_field(curfield, 3, dims, local_dims, start, - which_component, 3, output_k, - file_id, 0, data_id); + if (strchr("Rv", curfield_type)) /* generic scalar/vector field */ + output_k[0] = output_k[1] = output_k[2] = 0.0; /* don't know k */ + + if (strchr("dhbecv", curfield_type)) { /* outputting vector field */ + matrixio_id data_id[6] = {{-1, 1}, {-1, 1}, {-1, 1}, {-1, 1}, {-1, 1}, {-1, 1}}; + int i; + + sprintf(fname, "%c.k%02d.b%02d", curfield_type, kpoint_index, curfield_band); + if (which_component >= 0) { + char comp_str[] = ".x"; + comp_str[1] = 'x' + which_component; + strcat(fname, comp_str); + } + sprintf(description, "%c field, kpoint %d, band %d, freq=%g", curfield_type, kpoint_index, + curfield_band, freqs.items[curfield_band - 1]); + fname2 = fix_fname(fname, filename_prefix, mdata, 1); + mpi_one_printf("Outputting fields to %s...\n", fname2); + file_id = matrixio_create(fname2); + free(fname2); + fieldio_write_complex_field(curfield, 3, dims, local_dims, start, which_component, 3, output_k, + file_id, 0, data_id); #ifndef SCALAR_COMPLEX - /* Here's where it gets hairy. */ - maxwell_vectorfield_otherhalf(mdata, curfield, - output_k[0], output_k[1], output_k[2]); - start[last_dim_index] = last_dim_start; - local_dims[last_dim_index] = last_dim_size; - start[0] = first_dim_start; - local_dims[0] = first_dim_size; - if (write_start0_special) { - /* The conjugated array half may be discontiguous. - First, write the part not containing start[0], and - then write the start[0] slab. */ - fieldio_write_complex_field(curfield + - 3 * local_dims[1] * local_dims[2], - 3, dims, local_dims, start, - which_component, 3, NULL, - file_id, 1, data_id); - local_dims[0] = 1; - start[0] = 0; - fieldio_write_complex_field(curfield, 3, dims,local_dims,start, - which_component, 3, NULL, - file_id, 1, data_id); - } - else { - fieldio_write_complex_field(curfield, 3, dims,local_dims,start, - which_component, 3, NULL, - file_id, 1, data_id); - } + /* Here's where it gets hairy. */ + maxwell_vectorfield_otherhalf(mdata, curfield, output_k[0], output_k[1], output_k[2]); + start[last_dim_index] = last_dim_start; + local_dims[last_dim_index] = last_dim_size; + start[0] = first_dim_start; + local_dims[0] = first_dim_size; + if (write_start0_special) { + /* The conjugated array half may be discontiguous. + First, write the part not containing start[0], and + then write the start[0] slab. */ + fieldio_write_complex_field(curfield + 3 * local_dims[1] * local_dims[2], 3, dims, local_dims, + start, which_component, 3, NULL, file_id, 1, data_id); + local_dims[0] = 1; + start[0] = 0; + fieldio_write_complex_field(curfield, 3, dims, local_dims, start, which_component, 3, NULL, + file_id, 1, data_id); + } + else { + fieldio_write_complex_field(curfield, 3, dims, local_dims, start, which_component, 3, NULL, + file_id, 1, data_id); + } #endif - for (i = 0; i < 6; ++i) - if (data_id[i].id >= 0) - matrixio_close_dataset(data_id[i]); - matrixio_write_data_attr(file_id, "Bloch wavevector", - output_k, 1, attr_dims); - } - else if (strchr("C", curfield_type)) { /* outputting cmplx scalar field */ - matrixio_id data_id[2] = {{-1,1},{-1,1}}; - int i; - - sprintf(fname, "%c.k%02d.b%02d", - curfield_type, kpoint_index, curfield_band); - sprintf(description, "%c field, kpoint %d, band %d, freq=%g", - curfield_type, kpoint_index, curfield_band, - freqs.items[curfield_band - 1]); - fname2 = fix_fname(fname, filename_prefix, mdata, 1); - mpi_one_printf("Outputting complex scalar field to %s...\n", fname2); - file_id = matrixio_create(fname2); - free(fname2); - fieldio_write_complex_field(curfield, 3, dims, local_dims, start, - which_component, 1, output_k, - file_id, 0, data_id); + for (i = 0; i < 6; ++i) + if (data_id[i].id >= 0) matrixio_close_dataset(data_id[i]); + matrixio_write_data_attr(file_id, "Bloch wavevector", output_k, 1, attr_dims); + } + else if (strchr("C", curfield_type)) { /* outputting cmplx scalar field */ + matrixio_id data_id[2] = {{-1, 1}, {-1, 1}}; + int i; + + sprintf(fname, "%c.k%02d.b%02d", curfield_type, kpoint_index, curfield_band); + sprintf(description, "%c field, kpoint %d, band %d, freq=%g", curfield_type, kpoint_index, + curfield_band, freqs.items[curfield_band - 1]); + fname2 = fix_fname(fname, filename_prefix, mdata, 1); + mpi_one_printf("Outputting complex scalar field to %s...\n", fname2); + file_id = matrixio_create(fname2); + free(fname2); + fieldio_write_complex_field(curfield, 3, dims, local_dims, start, which_component, 1, output_k, + file_id, 0, data_id); #ifndef SCALAR_COMPLEX - /* Here's where it gets hairy. */ - maxwell_cscalarfield_otherhalf(mdata, curfield, - output_k[0], output_k[1], output_k[2]); - start[last_dim_index] = last_dim_start; - local_dims[last_dim_index] = last_dim_size; - start[0] = first_dim_start; - local_dims[0] = first_dim_size; - if (write_start0_special) { - /* The conjugated array half may be discontiguous. - First, write the part not containing start[0], and - then write the start[0] slab. */ - fieldio_write_complex_field(curfield + - local_dims[1] * local_dims[2], - 3, dims, local_dims, start, - which_component, 1, NULL, - file_id, 1, data_id); - local_dims[0] = 1; - start[0] = 0; - fieldio_write_complex_field(curfield, 3, dims,local_dims,start, - which_component, 1, NULL, - file_id, 1, data_id); - } - else { - fieldio_write_complex_field(curfield, 3, dims,local_dims,start, - which_component, 1, NULL, - file_id, 1, data_id); - } + /* Here's where it gets hairy. */ + maxwell_cscalarfield_otherhalf(mdata, curfield, output_k[0], output_k[1], output_k[2]); + start[last_dim_index] = last_dim_start; + local_dims[last_dim_index] = last_dim_size; + start[0] = first_dim_start; + local_dims[0] = first_dim_size; + if (write_start0_special) { + /* The conjugated array half may be discontiguous. + First, write the part not containing start[0], and + then write the start[0] slab. */ + fieldio_write_complex_field(curfield + local_dims[1] * local_dims[2], 3, dims, local_dims, + start, which_component, 1, NULL, file_id, 1, data_id); + local_dims[0] = 1; + start[0] = 0; + fieldio_write_complex_field(curfield, 3, dims, local_dims, start, which_component, 1, NULL, + file_id, 1, data_id); + } + else { + fieldio_write_complex_field(curfield, 3, dims, local_dims, start, which_component, 1, NULL, + file_id, 1, data_id); + } #endif - for (i = 0; i < 2; ++i) - if (data_id[i].id >= 0) - matrixio_close_dataset(data_id[i]); - matrixio_write_data_attr(file_id, "Bloch wavevector", - output_k, 1, attr_dims); - } - else if (strchr("DHBnmR", curfield_type)) { /* scalar field */ - if (curfield_type == 'n') { - sprintf(fname, "epsilon"); - sprintf(description, "dielectric function, epsilon"); - } - else if (curfield_type == 'm') { - sprintf(fname, "mu"); - sprintf(description, "permeability mu"); - } - else { - sprintf(fname, "%cpwr.k%02d.b%02d", - tolower(curfield_type), kpoint_index, curfield_band); - sprintf(description, - "%c field energy density, kpoint %d, band %d, freq=%g", - curfield_type, kpoint_index, curfield_band, - freqs.items[curfield_band - 1]); - } - fname2 = fix_fname(fname, filename_prefix, mdata, - /* no parity suffix for epsilon: */ - curfield_type != 'n' && curfield_type != 'm'); - mpi_one_printf("Outputting %s...\n", fname2); - file_id = matrixio_create(fname2); - free(fname2); - - output_scalarfield((real *) curfield, dims, - local_dims, start, file_id, "data", - last_dim_index, last_dim_start, last_dim_size, - first_dim_start, first_dim_size, - write_start0_special); - - if (curfield_type == 'n') { - int c1, c2, inv; - char dataname[100]; - - for (inv = 0; inv < 2; ++inv) - for (c1 = 0; c1 < 3; ++c1) - for (c2 = c1; c2 < 3; ++c2) { - get_epsilon_tensor(c1,c2, 0, inv); - sprintf(dataname, "%s.%c%c", - inv ? "epsilon_inverse" : "epsilon", - c1 + 'x', c2 + 'x'); - output_scalarfield((real *) curfield, dims, - local_dims, start, - file_id, dataname, - last_dim_index, - last_dim_start, last_dim_size, - first_dim_start, - first_dim_size, - write_start0_special); + for (i = 0; i < 2; ++i) + if (data_id[i].id >= 0) matrixio_close_dataset(data_id[i]); + matrixio_write_data_attr(file_id, "Bloch wavevector", output_k, 1, attr_dims); + } + else if (strchr("DHBnmR", curfield_type)) { /* scalar field */ + if (curfield_type == 'n') { + sprintf(fname, "epsilon"); + sprintf(description, "dielectric function, epsilon"); + } + else if (curfield_type == 'm') { + sprintf(fname, "mu"); + sprintf(description, "permeability mu"); + } + else { + sprintf(fname, "%cpwr.k%02d.b%02d", tolower(curfield_type), kpoint_index, curfield_band); + sprintf(description, "%c field energy density, kpoint %d, band %d, freq=%g", curfield_type, + kpoint_index, curfield_band, freqs.items[curfield_band - 1]); + } + fname2 = fix_fname(fname, filename_prefix, mdata, + /* no parity suffix for epsilon: */ + curfield_type != 'n' && curfield_type != 'm'); + mpi_one_printf("Outputting %s...\n", fname2); + file_id = matrixio_create(fname2); + free(fname2); + + output_scalarfield((real *)curfield, dims, local_dims, start, file_id, "data", last_dim_index, + last_dim_start, last_dim_size, first_dim_start, first_dim_size, + write_start0_special); + + if (curfield_type == 'n') { + int c1, c2, inv; + char dataname[100]; + + for (inv = 0; inv < 2; ++inv) + for (c1 = 0; c1 < 3; ++c1) + for (c2 = c1; c2 < 3; ++c2) { + get_epsilon_tensor(c1, c2, 0, inv); + sprintf(dataname, "%s.%c%c", inv ? "epsilon_inverse" : "epsilon", c1 + 'x', c2 + 'x'); + output_scalarfield((real *)curfield, dims, local_dims, start, file_id, dataname, + last_dim_index, last_dim_start, last_dim_size, first_dim_start, + first_dim_size, write_start0_special); #if defined(WITH_HERMITIAN_EPSILON) - if (c1 != c2) { - get_epsilon_tensor(c1,c2, 1, inv); - strcat(dataname, ".i"); + if (c1 != c2) { + get_epsilon_tensor(c1, c2, 1, inv); + strcat(dataname, ".i"); #ifndef SCALAR_COMPLEX /* scalarfield_otherhalf isn't right */ - strcat(dataname, ".screwy"); + strcat(dataname, ".screwy"); #endif - output_scalarfield((real *) curfield, dims, - local_dims, start, - file_id, dataname, - last_dim_index, - last_dim_start, - last_dim_size, - first_dim_start, - first_dim_size, - write_start0_special); - } + output_scalarfield((real *)curfield, dims, local_dims, start, file_id, dataname, + last_dim_index, last_dim_start, last_dim_size, first_dim_start, + first_dim_size, write_start0_special); + } #endif - } - } - - } - else - mpi_one_fprintf(stderr, "unknown field type!\n"); - - if (file_id.id >= 0) { - matrixio_write_data_attr(file_id, "lattice vectors", - &output_R[0][0], 2, attr_dims); - matrixio_write_string_attr(file_id, "description", description); - - matrixio_close(file_id); - } - - /* We have destroyed curfield (by multiplying it by phases, - and/or reorganizing in the case of real-amplitude fields). */ - curfield_reset(); + } + } + } + else + mpi_one_fprintf(stderr, "unknown field type!\n"); + + if (file_id.id >= 0) { + matrixio_write_data_attr(file_id, "lattice vectors", &output_R[0][0], 2, attr_dims); + matrixio_write_string_attr(file_id, "description", description); + + matrixio_close(file_id); + } + + /* We have destroyed curfield (by multiplying it by phases, + and/or reorganizing in the case of real-amplitude fields). */ + curfield_reset(); } /**************************************************************************/ @@ -1302,236 +1205,224 @@ void output_field_to_file(integer which_component, string filename_prefix) that resides inside the given list of geometric objects. Later objects in the list have precedence, just like the ordinary geometry list. */ -number compute_energy_in_object_list(geometric_object_list objects) -{ - int i, j, k, n1, n2, n3, n_other, n_last, rank, last_dim; +number compute_energy_in_object_list(geometric_object_list objects) { + int i, j, k, n1, n2, n3, n_other, n_last, rank, last_dim; #ifdef HAVE_MPI - int local_n2, local_y_start, local_n3; + int local_n2, local_y_start, local_n3; #endif - real s1, s2, s3, c1, c2, c3; - real *energy = (real *) curfield; - real energy_sum = 0; - - if (!curfield || !strchr("DHBR", curfield_type)) { - mpi_one_fprintf(stderr, "The D or H energy density must be loaded first.\n"); - return 0.0; - } - - geom_fix_objects0(objects); - - n1 = mdata->nx; n2 = mdata->ny; n3 = mdata->nz; - n_other = mdata->other_dims; - n_last = mdata->last_dim_size / (sizeof(scalar_complex)/sizeof(scalar)); - last_dim = mdata->last_dim; - rank = (n3 == 1) ? (n2 == 1 ? 1 : 2) : 3; - - s1 = geometry_lattice.size.x / n1; - s2 = geometry_lattice.size.y / n2; - s3 = geometry_lattice.size.z / n3; - c1 = n1 <= 1 ? 0 : geometry_lattice.size.x * 0.5; - c2 = n2 <= 1 ? 0 : geometry_lattice.size.y * 0.5; - c3 = n3 <= 1 ? 0 : geometry_lattice.size.z * 0.5; - - LOOP_XYZ(mdata) { - vector3 p; - int n; - p.x = i1 * s1 - c1; p.y = i2 * s2 - c2; p.z = i3 * s3 - c3; - for (n = objects.num_items - 1; n >= 0; --n) - if (point_in_periodic_fixed_objectp(p, objects.items[n])) { - if (objects.items[n].material.which_subclass - == MATERIAL_TYPE_SELF) - break; /* treat as a "nothing" object */ - energy_sum += energy[xyz_index]; + real s1, s2, s3, c1, c2, c3; + real *energy = (real *)curfield; + real energy_sum = 0; + + if (!curfield || !strchr("DHBR", curfield_type)) { + mpi_one_fprintf(stderr, "The D or H energy density must be loaded first.\n"); + return 0.0; + } + + geom_fix_objects0(objects); + + n1 = mdata->nx; + n2 = mdata->ny; + n3 = mdata->nz; + n_other = mdata->other_dims; + n_last = mdata->last_dim_size / (sizeof(scalar_complex) / sizeof(scalar)); + last_dim = mdata->last_dim; + rank = (n3 == 1) ? (n2 == 1 ? 1 : 2) : 3; + + s1 = geometry_lattice.size.x / n1; + s2 = geometry_lattice.size.y / n2; + s3 = geometry_lattice.size.z / n3; + c1 = n1 <= 1 ? 0 : geometry_lattice.size.x * 0.5; + c2 = n2 <= 1 ? 0 : geometry_lattice.size.y * 0.5; + c3 = n3 <= 1 ? 0 : geometry_lattice.size.z * 0.5; + + LOOP_XYZ(mdata) { + vector3 p; + int n; + p.x = i1 * s1 - c1; + p.y = i2 * s2 - c2; + p.z = i3 * s3 - c3; + for (n = objects.num_items - 1; n >= 0; --n) + if (point_in_periodic_fixed_objectp(p, objects.items[n])) { + if (objects.items[n].material.which_subclass == MATERIAL_TYPE_SELF) + break; /* treat as a "nothing" object */ + energy_sum += energy[xyz_index]; #ifndef SCALAR_COMPLEX - { - int last_index; -# ifdef HAVE_MPI - if (n3 == 1) - last_index = j + local_y_start; - else - last_index = k; -# else - last_index = j; -# endif - if (last_index != 0 && 2*last_index != last_dim) - energy_sum += energy[xyz_index]; - } + { + int last_index; +#ifdef HAVE_MPI + if (n3 == 1) + last_index = j + local_y_start; + else + last_index = k; +#else + last_index = j; #endif - break; - } - }}} - - mpi_allreduce_1(&energy_sum, real, SCALAR_MPI_TYPE, - MPI_SUM, mpb_comm); - energy_sum *= Vol / H.N; - return energy_sum; + if (last_index != 0 && 2 * last_index != last_dim) energy_sum += energy[xyz_index]; + } +#endif + break; + } + } +} +} + +mpi_allreduce_1(&energy_sum, real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm); +energy_sum *= Vol / H.N; +return energy_sum; } /**************************************************************************/ /* Compute the integral of f(energy/field, epsilon, r) over the cell. */ -cnumber compute_field_integral(function f) -{ - int i, j, k, n1, n2, n3, n_other, n_last, rank, last_dim; +cnumber compute_field_integral(function f) { + int i, j, k, n1, n2, n3, n_other, n_last, rank, last_dim; #ifdef HAVE_MPI - int local_n2, local_y_start, local_n3; + int local_n2, local_y_start, local_n3; #endif - real s1, s2, s3, c1, c2, c3; - int integrate_energy; - real *energy = (real *) curfield; - cnumber integral = {0,0}; - vector3 kvector = {0,0,0}; - - if (!curfield || !strchr("dhbeDHBRcv", curfield_type)) { - mpi_one_fprintf(stderr, "The D or H energy/field must be loaded first.\n"); - return integral; - } - if (curfield_type != 'v') - kvector = cur_kvector; - - integrate_energy = strchr("DHBR", curfield_type) != NULL; - - n1 = mdata->nx; n2 = mdata->ny; n3 = mdata->nz; - n_other = mdata->other_dims; - n_last = mdata->last_dim_size / (sizeof(scalar_complex)/sizeof(scalar)); - last_dim = mdata->last_dim; - rank = (n3 == 1) ? (n2 == 1 ? 1 : 2) : 3; - - s1 = geometry_lattice.size.x / n1; - s2 = geometry_lattice.size.y / n2; - s3 = geometry_lattice.size.z / n3; - c1 = n1 <= 1 ? 0 : geometry_lattice.size.x * 0.5; - c2 = n2 <= 1 ? 0 : geometry_lattice.size.y * 0.5; - c3 = n3 <= 1 ? 0 : geometry_lattice.size.z * 0.5; - - LOOP_XYZ(mdata) { - real epsilon; - vector3 p; - - epsilon = mean_medium_from_matrix(mdata->eps_inv + xyz_index); - - p.x = i1 * s1 - c1; p.y = i2 * s2 - c2; p.z = i3 * s3 - c3; - if (integrate_energy) { - integral.re += - ctl_convert_number_to_c( - gh_call3(f, - ctl_convert_number_to_scm(energy[xyz_index]), - ctl_convert_number_to_scm(epsilon), - ctl_convert_vector3_to_scm(p))); - } - else { - cvector3 F; - double phase_phi; - scalar_complex phase; - cnumber integrand; - - phase_phi = TWOPI * - (kvector.x * (p.x/geometry_lattice.size.x) + - kvector.y * (p.y/geometry_lattice.size.y) + - kvector.z * (p.z/geometry_lattice.size.z)); - CASSIGN_SCALAR(phase, cos(phase_phi), sin(phase_phi)); - CASSIGN_MULT_RE(F.x.re, curfield[3*xyz_index+0], phase); - CASSIGN_MULT_IM(F.x.im, curfield[3*xyz_index+0], phase); - CASSIGN_MULT_RE(F.y.re, curfield[3*xyz_index+1], phase); - CASSIGN_MULT_IM(F.y.im, curfield[3*xyz_index+1], phase); - CASSIGN_MULT_RE(F.z.re, curfield[3*xyz_index+2], phase); - CASSIGN_MULT_IM(F.z.im, curfield[3*xyz_index+2], phase); - integrand = - ctl_convert_cnumber_to_c( - gh_call3(f, - ctl_convert_cvector3_to_scm(F), - ctl_convert_number_to_scm(epsilon), - ctl_convert_vector3_to_scm(p))); - integral.re += integrand.re; - integral.im += integrand.im; - } + real s1, s2, s3, c1, c2, c3; + int integrate_energy; + real *energy = (real *)curfield; + cnumber integral = {0, 0}; + vector3 kvector = {0, 0, 0}; + + if (!curfield || !strchr("dhbeDHBRcv", curfield_type)) { + mpi_one_fprintf(stderr, "The D or H energy/field must be loaded first.\n"); + return integral; + } + if (curfield_type != 'v') kvector = cur_kvector; + + integrate_energy = strchr("DHBR", curfield_type) != NULL; + + n1 = mdata->nx; + n2 = mdata->ny; + n3 = mdata->nz; + n_other = mdata->other_dims; + n_last = mdata->last_dim_size / (sizeof(scalar_complex) / sizeof(scalar)); + last_dim = mdata->last_dim; + rank = (n3 == 1) ? (n2 == 1 ? 1 : 2) : 3; + + s1 = geometry_lattice.size.x / n1; + s2 = geometry_lattice.size.y / n2; + s3 = geometry_lattice.size.z / n3; + c1 = n1 <= 1 ? 0 : geometry_lattice.size.x * 0.5; + c2 = n2 <= 1 ? 0 : geometry_lattice.size.y * 0.5; + c3 = n3 <= 1 ? 0 : geometry_lattice.size.z * 0.5; + + LOOP_XYZ(mdata) { + real epsilon; + vector3 p; + + epsilon = mean_medium_from_matrix(mdata->eps_inv + xyz_index); + + p.x = i1 * s1 - c1; + p.y = i2 * s2 - c2; + p.z = i3 * s3 - c3; + if (integrate_energy) { + integral.re += ctl_convert_number_to_c( + gh_call3(f, ctl_convert_number_to_scm(energy[xyz_index]), + ctl_convert_number_to_scm(epsilon), ctl_convert_vector3_to_scm(p))); + } + else { + cvector3 F; + double phase_phi; + scalar_complex phase; + cnumber integrand; + + phase_phi = TWOPI * (kvector.x * (p.x / geometry_lattice.size.x) + + kvector.y * (p.y / geometry_lattice.size.y) + + kvector.z * (p.z / geometry_lattice.size.z)); + CASSIGN_SCALAR(phase, cos(phase_phi), sin(phase_phi)); + CASSIGN_MULT_RE(F.x.re, curfield[3 * xyz_index + 0], phase); + CASSIGN_MULT_IM(F.x.im, curfield[3 * xyz_index + 0], phase); + CASSIGN_MULT_RE(F.y.re, curfield[3 * xyz_index + 1], phase); + CASSIGN_MULT_IM(F.y.im, curfield[3 * xyz_index + 1], phase); + CASSIGN_MULT_RE(F.z.re, curfield[3 * xyz_index + 2], phase); + CASSIGN_MULT_IM(F.z.im, curfield[3 * xyz_index + 2], phase); + integrand = ctl_convert_cnumber_to_c(gh_call3(f, ctl_convert_cvector3_to_scm(F), + ctl_convert_number_to_scm(epsilon), + ctl_convert_vector3_to_scm(p))); + integral.re += integrand.re; + integral.im += integrand.im; + } #ifndef SCALAR_COMPLEX - { - int last_index; -# ifdef HAVE_MPI - if (n3 == 1) - last_index = j + local_y_start; - else - last_index = k; -# else - last_index = j; -# endif - - if (last_index != 0 && 2*last_index != last_dim) { - int i1c, i2c, i3c; - i1c = i1 ? (n1 - i1) : 0; - i2c = i2 ? (n2 - i2) : 0; - i3c = i3 ? (n3 - i3) : 0; - p.x = i1c * s1 - c1; - p.y = i2c * s2 - c2; - p.z = i3c * s3 - c3; - if (integrate_energy) - integral.re += - ctl_convert_number_to_c( - gh_call3(f, - ctl_convert_number_to_scm(energy[xyz_index]), - ctl_convert_number_to_scm(epsilon), - ctl_convert_vector3_to_scm(p))); - else { - cvector3 F; - double phase_phi; - scalar_complex phase, Fx, Fy, Fz; - cnumber integrand; - - Fx = curfield[3*xyz_index+0]; - Fy = curfield[3*xyz_index+1]; - Fz = curfield[3*xyz_index+2]; - Fx.im= -Fx.im; Fy.im= -Fy.im; Fz.im= -Fz.im; - - phase_phi = TWOPI * - (kvector.x - * (p.x/geometry_lattice.size.x) + - kvector.y - * (p.y/geometry_lattice.size.y) + - kvector.z - * (p.z/geometry_lattice.size.z)); - CASSIGN_SCALAR(phase, - cos(phase_phi), sin(phase_phi)); - CASSIGN_MULT_RE(F.x.re, Fx, phase); - CASSIGN_MULT_IM(F.x.im, Fx, phase); - CASSIGN_MULT_RE(F.y.re, Fy, phase); - CASSIGN_MULT_IM(F.y.im, Fy, phase); - CASSIGN_MULT_RE(F.z.re, Fz, phase); - CASSIGN_MULT_IM(F.z.im, Fz, phase); - - integrand = - ctl_convert_cnumber_to_c( - gh_call3(f, - ctl_convert_cvector3_to_scm(F), - ctl_convert_number_to_scm(epsilon), - ctl_convert_vector3_to_scm(p))); - integral.re += integrand.re; - integral.im += integrand.im; - } - } - } + { + int last_index; +#ifdef HAVE_MPI + if (n3 == 1) + last_index = j + local_y_start; + else + last_index = k; +#else + last_index = j; +#endif + + if (last_index != 0 && 2 * last_index != last_dim) { + int i1c, i2c, i3c; + i1c = i1 ? (n1 - i1) : 0; + i2c = i2 ? (n2 - i2) : 0; + i3c = i3 ? (n3 - i3) : 0; + p.x = i1c * s1 - c1; + p.y = i2c * s2 - c2; + p.z = i3c * s3 - c3; + if (integrate_energy) + integral.re += ctl_convert_number_to_c( + gh_call3(f, ctl_convert_number_to_scm(energy[xyz_index]), + ctl_convert_number_to_scm(epsilon), ctl_convert_vector3_to_scm(p))); + else { + cvector3 F; + double phase_phi; + scalar_complex phase, Fx, Fy, Fz; + cnumber integrand; + + Fx = curfield[3 * xyz_index + 0]; + Fy = curfield[3 * xyz_index + 1]; + Fz = curfield[3 * xyz_index + 2]; + Fx.im = -Fx.im; + Fy.im = -Fy.im; + Fz.im = -Fz.im; + + phase_phi = TWOPI * (kvector.x * (p.x / geometry_lattice.size.x) + + kvector.y * (p.y / geometry_lattice.size.y) + + kvector.z * (p.z / geometry_lattice.size.z)); + CASSIGN_SCALAR(phase, cos(phase_phi), sin(phase_phi)); + CASSIGN_MULT_RE(F.x.re, Fx, phase); + CASSIGN_MULT_IM(F.x.im, Fx, phase); + CASSIGN_MULT_RE(F.y.re, Fy, phase); + CASSIGN_MULT_IM(F.y.im, Fy, phase); + CASSIGN_MULT_RE(F.z.re, Fz, phase); + CASSIGN_MULT_IM(F.z.im, Fz, phase); + + integrand = ctl_convert_cnumber_to_c(gh_call3(f, ctl_convert_cvector3_to_scm(F), + ctl_convert_number_to_scm(epsilon), + ctl_convert_vector3_to_scm(p))); + integral.re += integrand.re; + integral.im += integrand.im; + } + } + } #endif - }}} - - integral.re *= Vol / H.N; - integral.im *= Vol / H.N; - { - cnumber integral_sum; - mpi_allreduce(&integral, &integral_sum, 2, number, - MPI_DOUBLE, MPI_SUM, mpb_comm); - return integral_sum; - } + } +} } -number compute_energy_integral(function f) +integral.re *= Vol / H.N; +integral.im *= Vol / H.N; { - if (!curfield || !strchr("DHBR", curfield_type)) { - mpi_one_fprintf(stderr, "The D or H energy density must be loaded first.\n"); - return 0.0; - } + cnumber integral_sum; + mpi_allreduce(&integral, &integral_sum, 2, number, MPI_DOUBLE, MPI_SUM, mpb_comm); + return integral_sum; +} +} + +number compute_energy_integral(function f) { + if (!curfield || !strchr("DHBR", curfield_type)) { + mpi_one_fprintf(stderr, "The D or H energy density must be loaded first.\n"); + return 0.0; + } - return cnumber_re(compute_field_integral(f)); + return cnumber_re(compute_field_integral(f)); } /**************************************************************************/ diff --git a/mpb/material_grid.c b/mpb/material_grid.c index 5ad25c24..edf4e67e 100644 --- a/mpb/material_grid.c +++ b/mpb/material_grid.c @@ -29,8 +29,8 @@ #include "mpb.h" #ifdef HAVE_SCM_IS_ARRAY -# undef SCM_ARRAYP -# define SCM_ARRAYP scm_is_array +#undef SCM_ARRAYP +#define SCM_ARRAYP scm_is_array #endif /**************************************************************************/ @@ -44,25 +44,22 @@ #ifdef HAVE_SCM_ARRAY_GET_HANDLE static scm_t_array_handle cur_material_grid_array_handle; #endif -static double *material_grid_array(const material_grid *g) -{ +static double *material_grid_array(const material_grid *g) { #ifdef HAVE_SCM_ARRAY_GET_HANDLE - scm_array_get_handle(g->matgrid, &cur_material_grid_array_handle); - return (double *) scm_array_handle_uniform_writable_elements( - &cur_material_grid_array_handle); + scm_array_get_handle(g->matgrid, &cur_material_grid_array_handle); + return (double *)scm_array_handle_uniform_writable_elements(&cur_material_grid_array_handle); #else - CHECK(SCM_ARRAYP(g->matgrid), "bug: matgrid is not an array"); - return (double *) SCM_CELL_WORD_1(SCM_ARRAY_V(g->matgrid)); + CHECK(SCM_ARRAYP(g->matgrid), "bug: matgrid is not an array"); + return (double *)SCM_CELL_WORD_1(SCM_ARRAY_V(g->matgrid)); #endif } -static void material_grid_array_release(const material_grid *g) -{ +static void material_grid_array_release(const material_grid *g) { #ifdef HAVE_SCM_ARRAY_GET_HANDLE - (void) g; - scm_array_handle_release(&cur_material_grid_array_handle); + (void)g; + scm_array_handle_release(&cur_material_grid_array_handle); #else - (void) g; + (void)g; #endif } @@ -72,143 +69,125 @@ static void material_grid_array_release(const material_grid *g) in libguile/unif.c, but the alternative is a lot of overhead given that we know for certain that the material grid is a uniform 3d array of g->size double-precision values. */ -real material_grid_val(vector3 p, const material_grid *g) -{ - real val; - CHECK(SCM_ARRAYP(g->matgrid), "bug: matgrid is not an array"); - val = linear_interpolate(p.x, p.y, p.z, material_grid_array(g), - g->size.x, g->size.y, g->size.z, 1); - material_grid_array_release(g); - return val; +real material_grid_val(vector3 p, const material_grid *g) { + real val; + CHECK(SCM_ARRAYP(g->matgrid), "bug: matgrid is not an array"); + val = + linear_interpolate(p.x, p.y, p.z, material_grid_array(g), g->size.x, g->size.y, g->size.z, 1); + material_grid_array_release(g); + return val; } /* Returns true if m is a material grid and has the same epsilon min/max as mg, and the same kind, so that they are merged when they overlap */ -static int compatible_matgrids(const material_grid *mg, - const material_type *m) -{ - return (m->which_subclass == MATERIAL_GRID && - m->subclass.material_grid_data->material_grid_kind - == mg->material_grid_kind && - m->subclass.material_grid_data->epsilon_min == mg->epsilon_min && - m->subclass.material_grid_data->epsilon_max == mg->epsilon_max); +static int compatible_matgrids(const material_grid *mg, const material_type *m) { + return (m->which_subclass == MATERIAL_GRID && + m->subclass.material_grid_data->material_grid_kind == mg->material_grid_kind && + m->subclass.material_grid_data->epsilon_min == mg->epsilon_min && + m->subclass.material_grid_data->epsilon_max == mg->epsilon_max); } static int matgrid_val_count = 0; /* cache for gradient calculation */ -double matgrid_val(vector3 p, geom_box_tree tp, int oi, - const material_grid *mg) -{ - double uprod = 1.0, umin = 1.0, usum = 0.0, u; - matgrid_val_count = 0; - CHECK(sizeof(real) == sizeof(double), "material grids require double precision"); - if (tp) { - do { - u = material_grid_val( - to_geom_box_coords(p, &tp->objects[oi]), - tp->objects[oi].o->material - .subclass.material_grid_data); - if (u < umin) umin = u; - uprod *= u; - usum += u; ++matgrid_val_count; - tp = geom_tree_search_next(p, tp, &oi); - } while (tp && - compatible_matgrids(mg, &tp->objects[oi].o->material)); - } - if (!tp && compatible_matgrids(mg, &default_material)) { - p.x = no_size_x ? 0 : p.x / geometry_lattice.size.x; - p.y = no_size_y ? 0 : p.y / geometry_lattice.size.y; - p.z = no_size_z ? 0 : p.z / geometry_lattice.size.z; - u = material_grid_val(p, - default_material.subclass.material_grid_data); - if (u < umin) umin = u; - uprod *= u; - usum += u; ++matgrid_val_count; - } - return (mg->material_grid_kind == U_MIN ? umin - : (mg->material_grid_kind == U_PROD ? uprod - : usum / matgrid_val_count)); +double matgrid_val(vector3 p, geom_box_tree tp, int oi, const material_grid *mg) { + double uprod = 1.0, umin = 1.0, usum = 0.0, u; + matgrid_val_count = 0; + CHECK(sizeof(real) == sizeof(double), "material grids require double precision"); + if (tp) { + do { + u = material_grid_val(to_geom_box_coords(p, &tp->objects[oi]), + tp->objects[oi].o->material.subclass.material_grid_data); + if (u < umin) umin = u; + uprod *= u; + usum += u; + ++matgrid_val_count; + tp = geom_tree_search_next(p, tp, &oi); + } while (tp && compatible_matgrids(mg, &tp->objects[oi].o->material)); + } + if (!tp && compatible_matgrids(mg, &default_material)) { + p.x = no_size_x ? 0 : p.x / geometry_lattice.size.x; + p.y = no_size_y ? 0 : p.y / geometry_lattice.size.y; + p.z = no_size_z ? 0 : p.z / geometry_lattice.size.z; + u = material_grid_val(p, default_material.subclass.material_grid_data); + if (u < umin) umin = u; + uprod *= u; + usum += u; + ++matgrid_val_count; + } + return (mg->material_grid_kind == U_MIN + ? umin + : (mg->material_grid_kind == U_PROD ? uprod : usum / matgrid_val_count)); } /**************************************************************************/ -material_grid *get_material_grids(geometric_object_list g, int *ngrids) -{ - int i, nalloc = 0; - material_grid *grids = 0; - *ngrids = 0; - for (i = 0; i < g.num_items; ++i) - if (g.items[i].material.which_subclass == MATERIAL_GRID) { - int j; - for (j = 0; j < *ngrids; ++j) - if (material_grid_equal(&grids[j], - g.items[i].material.subclass - .material_grid_data)) - break; - if (j < *ngrids) continue; - if (j >= nalloc) { - nalloc = nalloc * 2 + 1; - grids = realloc(grids, sizeof(material_grid) * nalloc); - } - grids[j] = *g.items[i].material.subclass.material_grid_data; - ++*ngrids; - } - if (default_material.which_subclass == MATERIAL_GRID) { - int j; - for (j = 0; j < *ngrids; ++j) - if (material_grid_equal(&grids[j], - default_material.subclass - .material_grid_data)) - break; - if (j == *ngrids) { - if (j >= nalloc) { - nalloc = nalloc * 2 + 1; - grids = realloc(grids, sizeof(material_grid) * nalloc); - } - grids[j] = *default_material.subclass.material_grid_data; - ++*ngrids; - } - } - return grids; +material_grid *get_material_grids(geometric_object_list g, int *ngrids) { + int i, nalloc = 0; + material_grid *grids = 0; + *ngrids = 0; + for (i = 0; i < g.num_items; ++i) + if (g.items[i].material.which_subclass == MATERIAL_GRID) { + int j; + for (j = 0; j < *ngrids; ++j) + if (material_grid_equal(&grids[j], g.items[i].material.subclass.material_grid_data)) break; + if (j < *ngrids) continue; + if (j >= nalloc) { + nalloc = nalloc * 2 + 1; + grids = realloc(grids, sizeof(material_grid) * nalloc); + } + grids[j] = *g.items[i].material.subclass.material_grid_data; + ++*ngrids; + } + if (default_material.which_subclass == MATERIAL_GRID) { + int j; + for (j = 0; j < *ngrids; ++j) + if (material_grid_equal(&grids[j], default_material.subclass.material_grid_data)) break; + if (j == *ngrids) { + if (j >= nalloc) { + nalloc = nalloc * 2 + 1; + grids = realloc(grids, sizeof(material_grid) * nalloc); + } + grids[j] = *default_material.subclass.material_grid_data; + ++*ngrids; + } + } + return grids; } -int material_grids_ntot(const material_grid *grids, int ngrids) -{ - int i, ntot = 0; - for (i = 0; i < ngrids; ++i) - ntot += grids[i].size.x * grids[i].size.y * grids[i].size.z; - return ntot; +int material_grids_ntot(const material_grid *grids, int ngrids) { + int i, ntot = 0; + for (i = 0; i < ngrids; ++i) + ntot += grids[i].size.x * grids[i].size.y * grids[i].size.z; + return ntot; } /* note that you also need to call reset_epsilon() if you actually want to change the dielectric function */ -void material_grids_set(const double *u, material_grid *grids, int ngrids) -{ - int i, j = 0; - CHECK(sizeof(real) == sizeof(double), "material grids require double precision"); - for (i = 0; i < ngrids; ++i) { - int ntot = grids[i].size.x * grids[i].size.y * grids[i].size.z; - double *a = material_grid_array(&grids[i]); - int k; - for (k = 0; k < ntot; ++k) - a[k] = u[j + k]; - material_grid_array_release(&grids[i]); - j += ntot; - } +void material_grids_set(const double *u, material_grid *grids, int ngrids) { + int i, j = 0; + CHECK(sizeof(real) == sizeof(double), "material grids require double precision"); + for (i = 0; i < ngrids; ++i) { + int ntot = grids[i].size.x * grids[i].size.y * grids[i].size.z; + double *a = material_grid_array(&grids[i]); + int k; + for (k = 0; k < ntot; ++k) + a[k] = u[j + k]; + material_grid_array_release(&grids[i]); + j += ntot; + } } -void material_grids_get(double *u, const material_grid *grids, int ngrids) -{ - int i, j = 0; - CHECK(sizeof(real) == sizeof(double), "material grids require double precision"); - for (i = 0; i < ngrids; ++i) { - int ntot = grids[i].size.x * grids[i].size.y * grids[i].size.z; - double *a = material_grid_array(&grids[i]); - int k; - for (k = 0; k < ntot; ++k) - u[j + k] = a[k]; - material_grid_array_release(&grids[i]); - j += ntot; - } +void material_grids_get(double *u, const material_grid *grids, int ngrids) { + int i, j = 0; + CHECK(sizeof(real) == sizeof(double), "material grids require double precision"); + for (i = 0; i < ngrids; ++i) { + int ntot = grids[i].size.x * grids[i].size.y * grids[i].size.z; + double *a = material_grid_array(&grids[i]); + int k; + for (k = 0; k < ntot; ++k) + u[j + k] = a[k]; + material_grid_array_release(&grids[i]); + j += ntot; + } } /**************************************************************************/ @@ -247,539 +226,530 @@ void material_grids_get(double *u, const material_grid *grids, int ngrids) function in epsilon_file.c) to data ... this has to be changed if linear_interpolate is changed!! ...also multiply by scaleby etc. for different gradient types */ -static void add_interpolate_weights(real rx, real ry, real rz, real *data, - int nx, int ny, int nz, int stride, - double scaleby, - const real *udata, - int ukind, double uval) -{ - int x, y, z, x2, y2, z2; - real dx, dy, dz, u; - - /* mirror boundary conditions for r just beyond the boundary */ - if (rx < 0.0) rx = -rx; else if (rx > 1.0) rx = 1.0 - rx; - if (ry < 0.0) ry = -ry; else if (ry > 1.0) ry = 1.0 - ry; - if (rz < 0.0) rz = -rz; else if (rz > 1.0) rz = 1.0 - rz; - - /* get the point corresponding to r in the epsilon array grid: */ - x = rx * nx; if (x == nx) --x; - y = ry * ny; if (y == ny) --y; - z = rz * nz; if (z == nz) --z; - - /* get the difference between (x,y,z) and the actual point - ... we shift by 0.5 to center the data points in the pixels */ - dx = rx * nx - x - 0.5; - dy = ry * ny - y - 0.5; - dz = rz * nz - z - 0.5; - - /* get the other closest point in the grid, with mirror boundaries: */ - x2 = (dx >= 0.0 ? x + 1 : x - 1); - if (x2 < 0) x2++; else if (x2 == nx) x2--; - y2 = (dy >= 0.0 ? y + 1 : y - 1); - if (y2 < 0) y2++; else if (y2 == ny) y2--; - z2 = (dz >= 0.0 ? z + 1 : z - 1); - if (z2 < 0) z2++; else if (z2 == nz) z2--; - - /* take abs(d{xyz}) to get weights for {xyz} and {xyz}2: */ - dx = fabs(dx); - dy = fabs(dy); - dz = fabs(dz); - - /* define a macro to give us data(x,y,z) on the grid, - in row-major order (the order used by HDF5): */ -#define D(x,y,z) (data[(((x)*ny + (y))*nz + (z)) * stride]) -#define U(x,y,z) (udata[(((x)*ny + (y))*nz + (z)) * stride]) - - u = (((U(x,y,z)*(1.0-dx) + U(x2,y,z)*dx) * (1.0-dy) + - (U(x,y2,z)*(1.0-dx) + U(x2,y2,z)*dx) * dy) * (1.0-dz) + - ((U(x,y,z2)*(1.0-dx) + U(x2,y,z2)*dx) * (1.0-dy) + - (U(x,y2,z2)*(1.0-dx) + U(x2,y2,z2)*dx) * dy) * dz); - - if (ukind == U_MIN && u != uval) return; - if (ukind == U_PROD) scaleby *= uval / u; - - D(x,y,z) += (1.0-dx) * (1.0-dy) * (1.0-dz) * scaleby; - D(x2,y,z) += dx * (1.0-dy) * (1.0-dz) * scaleby; - D(x,y2,z) += (1.0-dx) * dy * (1.0-dz) * scaleby; - D(x2,y2,z) += dx * dy * (1.0-dz) * scaleby; - D(x,y,z2) += (1.0-dx) * (1.0-dy) * dz * scaleby; - D(x2,y,z2) += dx * (1.0-dy) * dz * scaleby; - D(x,y2,z2) += (1.0-dx) * dy * dz * scaleby; - D(x2,y2,z2) += dx * dy * dz * scaleby; +static void add_interpolate_weights(real rx, real ry, real rz, real *data, int nx, int ny, int nz, + int stride, double scaleby, const real *udata, int ukind, + double uval) { + int x, y, z, x2, y2, z2; + real dx, dy, dz, u; + + /* mirror boundary conditions for r just beyond the boundary */ + if (rx < 0.0) + rx = -rx; + else if (rx > 1.0) + rx = 1.0 - rx; + if (ry < 0.0) + ry = -ry; + else if (ry > 1.0) + ry = 1.0 - ry; + if (rz < 0.0) + rz = -rz; + else if (rz > 1.0) + rz = 1.0 - rz; + + /* get the point corresponding to r in the epsilon array grid: */ + x = rx * nx; + if (x == nx) --x; + y = ry * ny; + if (y == ny) --y; + z = rz * nz; + if (z == nz) --z; + + /* get the difference between (x,y,z) and the actual point + ... we shift by 0.5 to center the data points in the pixels */ + dx = rx * nx - x - 0.5; + dy = ry * ny - y - 0.5; + dz = rz * nz - z - 0.5; + + /* get the other closest point in the grid, with mirror boundaries: */ + x2 = (dx >= 0.0 ? x + 1 : x - 1); + if (x2 < 0) + x2++; + else if (x2 == nx) + x2--; + y2 = (dy >= 0.0 ? y + 1 : y - 1); + if (y2 < 0) + y2++; + else if (y2 == ny) + y2--; + z2 = (dz >= 0.0 ? z + 1 : z - 1); + if (z2 < 0) + z2++; + else if (z2 == nz) + z2--; + + /* take abs(d{xyz}) to get weights for {xyz} and {xyz}2: */ + dx = fabs(dx); + dy = fabs(dy); + dz = fabs(dz); + + /* define a macro to give us data(x,y,z) on the grid, + in row-major order (the order used by HDF5): */ +#define D(x, y, z) (data[(((x)*ny + (y)) * nz + (z)) * stride]) +#define U(x, y, z) (udata[(((x)*ny + (y)) * nz + (z)) * stride]) + + u = (((U(x, y, z) * (1.0 - dx) + U(x2, y, z) * dx) * (1.0 - dy) + + (U(x, y2, z) * (1.0 - dx) + U(x2, y2, z) * dx) * dy) * + (1.0 - dz) + + ((U(x, y, z2) * (1.0 - dx) + U(x2, y, z2) * dx) * (1.0 - dy) + + (U(x, y2, z2) * (1.0 - dx) + U(x2, y2, z2) * dx) * dy) * + dz); + + if (ukind == U_MIN && u != uval) return; + if (ukind == U_PROD) scaleby *= uval / u; + + D(x, y, z) += (1.0 - dx) * (1.0 - dy) * (1.0 - dz) * scaleby; + D(x2, y, z) += dx * (1.0 - dy) * (1.0 - dz) * scaleby; + D(x, y2, z) += (1.0 - dx) * dy * (1.0 - dz) * scaleby; + D(x2, y2, z) += dx * dy * (1.0 - dz) * scaleby; + D(x, y, z2) += (1.0 - dx) * (1.0 - dy) * dz * scaleby; + D(x2, y, z2) += dx * (1.0 - dy) * dz * scaleby; + D(x, y2, z2) += (1.0 - dx) * dy * dz * scaleby; + D(x2, y2, z2) += dx * dy * dz * scaleby; #undef D } -static void material_grids_addgradient_point(double *v, - vector3 p, double scalegrad, - const material_grid *grids, - int ngrids) -{ - geom_box_tree tp; - int oi, i; - material_grid *mg; - double uval; - int kind; - - tp = geom_tree_search(p, geometry_tree, &oi); - if (tp && tp->objects[oi].o->material.which_subclass == MATERIAL_GRID) - mg = tp->objects[oi].o->material.subclass.material_grid_data; - else if (!tp && default_material.which_subclass == MATERIAL_GRID) - mg = default_material.subclass.material_grid_data; - else - return; /* no material grids at this point */ - - uval = matgrid_val(p, tp, oi, mg); - scalegrad *= (mg->epsilon_max - mg->epsilon_min); - if ((kind = mg->material_grid_kind) == U_SUM) - scalegrad /= matgrid_val_count; - - if (tp) { - do { - vector3 pb = to_geom_box_coords(p, &tp->objects[oi]); - vector3 sz = tp->objects[oi].o->material - .subclass.material_grid_data->size; - double *vcur = v, *ucur; - for (i = 0; i < ngrids; ++i) { - if (material_grid_equal(grids+i, - tp->objects[oi].o->material - .subclass.material_grid_data)) - break; - else - vcur += (int) (grids[i].size.x * grids[i].size.y - * grids[i].size.z); - } - CHECK(i < ngrids, "bug in material_grid_gradient_point"); - ucur = material_grid_array(grids+i); - add_interpolate_weights(pb.x, pb.y, pb.z, - vcur, sz.x, sz.y, sz.z, 1, scalegrad, - ucur, kind, uval); - material_grid_array_release(grids+i); - tp = geom_tree_search_next(p, tp, &oi); - } while (tp && - compatible_matgrids(mg, &tp->objects[oi].o->material)); - } - if (!tp && compatible_matgrids(mg, &default_material)) { - vector3 pb; - vector3 sz = default_material.subclass.material_grid_data->size; - double *vcur = v, *ucur; - for (i = 0; i < ngrids; ++i) { - if (material_grid_equal(grids+i, default_material - .subclass.material_grid_data)) - break; - else - vcur += (int) (grids[i].size.x * grids[i].size.y - * grids[i].size.z); - } - CHECK(i < ngrids, "bug in material_grid_gradient_point"); - pb.x = no_size_x ? 0 : p.x / geometry_lattice.size.x; - pb.y = no_size_y ? 0 : p.y / geometry_lattice.size.y; - pb.z = no_size_z ? 0 : p.z / geometry_lattice.size.z; - ucur = material_grid_array(grids+i); - add_interpolate_weights(pb.x, pb.y, pb.z, - vcur, sz.x, sz.y, sz.z, 1, scalegrad, - ucur, kind, uval); - material_grid_array_release(grids+i); - } +static void material_grids_addgradient_point(double *v, vector3 p, double scalegrad, + const material_grid *grids, int ngrids) { + geom_box_tree tp; + int oi, i; + material_grid *mg; + double uval; + int kind; + + tp = geom_tree_search(p, geometry_tree, &oi); + if (tp && tp->objects[oi].o->material.which_subclass == MATERIAL_GRID) + mg = tp->objects[oi].o->material.subclass.material_grid_data; + else if (!tp && default_material.which_subclass == MATERIAL_GRID) + mg = default_material.subclass.material_grid_data; + else + return; /* no material grids at this point */ + + uval = matgrid_val(p, tp, oi, mg); + scalegrad *= (mg->epsilon_max - mg->epsilon_min); + if ((kind = mg->material_grid_kind) == U_SUM) scalegrad /= matgrid_val_count; + + if (tp) { + do { + vector3 pb = to_geom_box_coords(p, &tp->objects[oi]); + vector3 sz = tp->objects[oi].o->material.subclass.material_grid_data->size; + double *vcur = v, *ucur; + for (i = 0; i < ngrids; ++i) { + if (material_grid_equal(grids + i, tp->objects[oi].o->material.subclass.material_grid_data)) + break; + else + vcur += (int)(grids[i].size.x * grids[i].size.y * grids[i].size.z); + } + CHECK(i < ngrids, "bug in material_grid_gradient_point"); + ucur = material_grid_array(grids + i); + add_interpolate_weights(pb.x, pb.y, pb.z, vcur, sz.x, sz.y, sz.z, 1, scalegrad, ucur, kind, + uval); + material_grid_array_release(grids + i); + tp = geom_tree_search_next(p, tp, &oi); + } while (tp && compatible_matgrids(mg, &tp->objects[oi].o->material)); + } + if (!tp && compatible_matgrids(mg, &default_material)) { + vector3 pb; + vector3 sz = default_material.subclass.material_grid_data->size; + double *vcur = v, *ucur; + for (i = 0; i < ngrids; ++i) { + if (material_grid_equal(grids + i, default_material.subclass.material_grid_data)) + break; + else + vcur += (int)(grids[i].size.x * grids[i].size.y * grids[i].size.z); + } + CHECK(i < ngrids, "bug in material_grid_gradient_point"); + pb.x = no_size_x ? 0 : p.x / geometry_lattice.size.x; + pb.y = no_size_y ? 0 : p.y / geometry_lattice.size.y; + pb.z = no_size_z ? 0 : p.z / geometry_lattice.size.z; + ucur = material_grid_array(grids + i); + add_interpolate_weights(pb.x, pb.y, pb.z, vcur, sz.x, sz.y, sz.z, 1, scalegrad, ucur, kind, + uval); + material_grid_array_release(grids + i); + } } -void material_grids_addgradient(double *v, - double scalegrad, int band, - const material_grid *grids, int ngrids) -{ - int i, j, k, n1, n2, n3, n_other, n_last, rank, last_dim; +void material_grids_addgradient(double *v, double scalegrad, int band, const material_grid *grids, + int ngrids) { + int i, j, k, n1, n2, n3, n_other, n_last, rank, last_dim; #ifdef HAVE_MPI - int local_n2, local_y_start, local_n3; + int local_n2, local_y_start, local_n3; #endif - real s1, s2, s3, c1, c2, c3; - real *Esqr; - - CHECK(band <= num_bands, "addgradient called for uncomputed band"); - if (band) { - scalegrad *= -freqs.items[band - 1]/2; - get_efield(band); - } - compute_field_squared(); - Esqr = (real *) curfield; - scalegrad *= Vol / H.N; - - n1 = mdata->nx; n2 = mdata->ny; n3 = mdata->nz; - n_other = mdata->other_dims; - n_last = mdata->last_dim_size / (sizeof(scalar_complex)/sizeof(scalar)); - last_dim = mdata->last_dim; - rank = (n3 == 1) ? (n2 == 1 ? 1 : 2) : 3; - - s1 = geometry_lattice.size.x / n1; - s2 = geometry_lattice.size.y / n2; - s3 = geometry_lattice.size.z / n3; - c1 = n1 <= 1 ? 0 : geometry_lattice.size.x * 0.5; - c2 = n2 <= 1 ? 0 : geometry_lattice.size.y * 0.5; - c3 = n3 <= 1 ? 0 : geometry_lattice.size.z * 0.5; - - LOOP_XYZ(mdata) { - vector3 p; - - p.x = i1 * s1 - c1; p.y = i2 * s2 - c2; p.z = i3 * s3 - c3; - - material_grids_addgradient_point( - v, p, Esqr[xyz_index]*scalegrad, grids,ngrids); + real s1, s2, s3, c1, c2, c3; + real *Esqr; + + CHECK(band <= num_bands, "addgradient called for uncomputed band"); + if (band) { + scalegrad *= -freqs.items[band - 1] / 2; + get_efield(band); + } + compute_field_squared(); + Esqr = (real *)curfield; + scalegrad *= Vol / H.N; + + n1 = mdata->nx; + n2 = mdata->ny; + n3 = mdata->nz; + n_other = mdata->other_dims; + n_last = mdata->last_dim_size / (sizeof(scalar_complex) / sizeof(scalar)); + last_dim = mdata->last_dim; + rank = (n3 == 1) ? (n2 == 1 ? 1 : 2) : 3; + + s1 = geometry_lattice.size.x / n1; + s2 = geometry_lattice.size.y / n2; + s3 = geometry_lattice.size.z / n3; + c1 = n1 <= 1 ? 0 : geometry_lattice.size.x * 0.5; + c2 = n2 <= 1 ? 0 : geometry_lattice.size.y * 0.5; + c3 = n3 <= 1 ? 0 : geometry_lattice.size.z * 0.5; + + LOOP_XYZ(mdata) { + vector3 p; + + p.x = i1 * s1 - c1; + p.y = i2 * s2 - c2; + p.z = i3 * s3 - c3; + + material_grids_addgradient_point(v, p, Esqr[xyz_index] * scalegrad, grids, ngrids); #ifndef SCALAR_COMPLEX - { - int last_index; -# ifdef HAVE_MPI - if (n3 == 1) - last_index = j + local_y_start; - else - last_index = k; -# else - last_index = j; -# endif - - if (last_index != 0 && 2*last_index != last_dim) { - int i1c, i2c, i3c; - i1c = i1 ? (n1 - i1) : 0; - i2c = i2 ? (n2 - i2) : 0; - i3c = i3 ? (n3 - i3) : 0; - p.x = i1c * s1 - c1; - p.y = i2c * s2 - c2; - p.z = i3c * s3 - c3; - - material_grids_addgradient_point( - v, p, Esqr[xyz_index]*scalegrad, grids,ngrids); - } - } -#endif /* !SCALAR_COMPLEX */ + { + int last_index; +#ifdef HAVE_MPI + if (n3 == 1) + last_index = j + local_y_start; + else + last_index = k; +#else + last_index = j; +#endif - }}} + if (last_index != 0 && 2 * last_index != last_dim) { + int i1c, i2c, i3c; + i1c = i1 ? (n1 - i1) : 0; + i2c = i2 ? (n2 - i2) : 0; + i3c = i3 ? (n3 - i3) : 0; + p.x = i1c * s1 - c1; + p.y = i2c * s2 - c2; + p.z = i3c * s3 - c3; + + material_grids_addgradient_point(v, p, Esqr[xyz_index] * scalegrad, grids, ngrids); + } + } +#endif /* !SCALAR_COMPLEX */ + } +} +} } /**************************************************************************/ /* some routines mainly for debugging */ -void print_material_grids_gradient(integer band) -{ - int ngrids; - material_grid *grids = get_material_grids(geometry, &ngrids); - int i, ntot = material_grids_ntot(grids, ngrids); - double *grad = (double *) malloc(sizeof(double) * ntot); - for (i = 0; i < ntot; ++i) grad[i] = 0; - material_grids_addgradient(grad, 1.0, band, grids, ngrids); - for (i = 0; i < ntot; ++i) - mpi_one_printf(", %g", grad[i]); - free(grad); - free(grids); +void print_material_grids_gradient(integer band) { + int ngrids; + material_grid *grids = get_material_grids(geometry, &ngrids); + int i, ntot = material_grids_ntot(grids, ngrids); + double *grad = (double *)malloc(sizeof(double) * ntot); + for (i = 0; i < ntot; ++i) + grad[i] = 0; + material_grids_addgradient(grad, 1.0, band, grids, ngrids); + for (i = 0; i < ntot; ++i) + mpi_one_printf(", %g", grad[i]); + free(grad); + free(grids); } -number material_grids_approx_gradient(vector3 kpoint, integer band, - integer iu, number du) -{ - int ngrids; - material_grid *grids = get_material_grids(geometry, &ngrids); - int i, ntot = material_grids_ntot(grids, ngrids); - double *u = (double *) malloc(sizeof(double) * ntot); - double f0, f1, dfdu; - solve_kpoint(kpoint); - f0 = freqs.items[band-1]; - for (i = 0; i < ntot; ++i) u[i] = 0; - material_grids_addgradient(u, 1.0, band, grids, ngrids); - dfdu = u[iu]; - material_grids_get(u, grids, ngrids); - u[iu] += du; - material_grids_set(u, grids, ngrids); - reset_epsilon(); - solve_kpoint(kpoint); - f1 = freqs.items[band-1]; - u[iu] -= du; - material_grids_set(u, grids, ngrids); - reset_epsilon(); - mpi_one_printf("approxgrad: ntot=%d, u[%d] = %g -> f_%d = %g, u += %g -> f_%d = %g; df/du = %g vs. analytic %g\n", ntot, iu, u[iu], band, f0, du, band, f1, (f1-f0)/du, dfdu); - free(u); - free(grids); - return (f1 - f0) / du; +number material_grids_approx_gradient(vector3 kpoint, integer band, integer iu, number du) { + int ngrids; + material_grid *grids = get_material_grids(geometry, &ngrids); + int i, ntot = material_grids_ntot(grids, ngrids); + double *u = (double *)malloc(sizeof(double) * ntot); + double f0, f1, dfdu; + solve_kpoint(kpoint); + f0 = freqs.items[band - 1]; + for (i = 0; i < ntot; ++i) + u[i] = 0; + material_grids_addgradient(u, 1.0, band, grids, ngrids); + dfdu = u[iu]; + material_grids_get(u, grids, ngrids); + u[iu] += du; + material_grids_set(u, grids, ngrids); + reset_epsilon(); + solve_kpoint(kpoint); + f1 = freqs.items[band - 1]; + u[iu] -= du; + material_grids_set(u, grids, ngrids); + reset_epsilon(); + mpi_one_printf("approxgrad: ntot=%d, u[%d] = %g -> f_%d = %g, u += %g -> f_%d = %g; df/du = %g " + "vs. analytic %g\n", + ntot, iu, u[iu], band, f0, du, band, f1, (f1 - f0) / du, dfdu); + free(u); + free(grids); + return (f1 - f0) / du; } -void print_material_grids_deps_du(void) -{ - int i, j, k, n1, n2, n3; - real s1, s2, s3, c1, c2, c3; - int ngrids; - material_grid *grids = get_material_grids(geometry, &ngrids); - int ntot = material_grids_ntot(grids, ngrids); - double *v = (double *) malloc(sizeof(double) * ntot); - - n1 = mdata->nx; n2 = mdata->ny; n3 = mdata->nz; - - s1 = geometry_lattice.size.x / n1; - s2 = geometry_lattice.size.y / n2; - s3 = geometry_lattice.size.z / n3; - c1 = n1 <= 1 ? 0 : geometry_lattice.size.x * 0.5; - c2 = n2 <= 1 ? 0 : geometry_lattice.size.y * 0.5; - c3 = n3 <= 1 ? 0 : geometry_lattice.size.z * 0.5; - - /* Here we have different loops over the coordinates, depending - upon whether we are using complex or real and serial or - parallel transforms. Each loop must define, in its body, - variables (i2,j2,k2) describing the coordinate of the current - point, and "index" describing the corresponding index in - the curfield array. - - This was all stolen from fields.c...it would be better - if we didn't have to cut and paste, sigh. */ - - for (i = 0; i < n1; ++i) - for (j = 0; j < n2; ++j) - for (k = 0; k < n3; ++k) - { - int index = ((i * n2 + j) * n3 + k); - - { - vector3 p; - geom_box_tree tp; - int oi, ig; - material_grid *mg; - double uval; - int kind; - double scalegrad; - - memset(v, 0, sizeof(double) * ntot); - - p.x = i * s1 - c1; p.y = j * s2 - c2; p.z = k * s3 - c3; - - tp = geom_tree_search(p, geometry_tree, &oi); - if (tp && tp->objects[oi].o->material.which_subclass == MATERIAL_GRID) - mg = tp->objects[oi].o->material.subclass.material_grid_data; - else if (!tp && default_material.which_subclass == MATERIAL_GRID) - mg = default_material.subclass.material_grid_data; - else - goto gotmyv; /* no material grids at this point */ - - uval = matgrid_val(p, tp, oi, mg); - scalegrad = (mg->epsilon_max - mg->epsilon_min); - if ((kind = mg->material_grid_kind) == U_SUM) - scalegrad /= matgrid_val_count; - - if (tp) { - do { - vector3 pb = to_geom_box_coords(p, &tp->objects[oi]); - vector3 sz = tp->objects[oi].o->material - .subclass.material_grid_data->size; - double *vcur = v, *ucur; - for (ig = 0; ig < ngrids; ++ig) { - if (material_grid_equal(grids+ig, - tp->objects[oi].o->material - .subclass.material_grid_data)) - break; - else - vcur += (int) (grids[ig].size.x * grids[ig].size.y - * grids[ig].size.z); - } - CHECK(ig < ngrids, "bug in material_grid_gradient_point"); - ucur = material_grid_array(grids+ig); - add_interpolate_weights(pb.x, pb.y, pb.z, - vcur, sz.x, sz.y, sz.z, 1, scalegrad, - ucur, kind, uval); - material_grid_array_release(grids+ig); - tp = geom_tree_search_next(p, tp, &oi); - } while (tp && - compatible_matgrids(mg, &tp->objects[oi].o->material)); - } - if (!tp && compatible_matgrids(mg, &default_material)) { - vector3 pb; - vector3 sz = default_material.subclass.material_grid_data->size; - double *vcur = v, *ucur; - for (ig = 0; ig < ngrids; ++ig) { - if (material_grid_equal(grids+ig, default_material - .subclass.material_grid_data)) - break; - else - vcur += (int) (grids[ig].size.x * grids[ig].size.y - * grids[ig].size.z); - } - CHECK(ig < ngrids, "bug in material_grid_gradient_point"); - pb.x = no_size_x ? 0 : p.x / geometry_lattice.size.x; - pb.y = no_size_y ? 0 : p.y / geometry_lattice.size.y; - pb.z = no_size_z ? 0 : p.z / geometry_lattice.size.z; - ucur = material_grid_array(grids+ig); - add_interpolate_weights(pb.x, pb.y, pb.z, - vcur, sz.x, sz.y, sz.z, 1, scalegrad, - ucur, kind, uval); - material_grid_array_release(grids+ig); - } - - gotmyv: - mpi_one_printf("depsdu:, %g, %d", - mean_medium_from_matrix(mdata->eps_inv + index), index); - for (ig = 0; ig < ntot; ++ig) - mpi_one_printf(", %g", v[ig]); - mpi_one_printf("\n"); - - - } - } - - free(v); +void print_material_grids_deps_du(void) { + int i, j, k, n1, n2, n3; + real s1, s2, s3, c1, c2, c3; + int ngrids; + material_grid *grids = get_material_grids(geometry, &ngrids); + int ntot = material_grids_ntot(grids, ngrids); + double *v = (double *)malloc(sizeof(double) * ntot); + + n1 = mdata->nx; + n2 = mdata->ny; + n3 = mdata->nz; + + s1 = geometry_lattice.size.x / n1; + s2 = geometry_lattice.size.y / n2; + s3 = geometry_lattice.size.z / n3; + c1 = n1 <= 1 ? 0 : geometry_lattice.size.x * 0.5; + c2 = n2 <= 1 ? 0 : geometry_lattice.size.y * 0.5; + c3 = n3 <= 1 ? 0 : geometry_lattice.size.z * 0.5; + + /* Here we have different loops over the coordinates, depending + upon whether we are using complex or real and serial or + parallel transforms. Each loop must define, in its body, + variables (i2,j2,k2) describing the coordinate of the current + point, and "index" describing the corresponding index in + the curfield array. + + This was all stolen from fields.c...it would be better + if we didn't have to cut and paste, sigh. */ + + for (i = 0; i < n1; ++i) + for (j = 0; j < n2; ++j) + for (k = 0; k < n3; ++k) { + int index = ((i * n2 + j) * n3 + k); + + { + vector3 p; + geom_box_tree tp; + int oi, ig; + material_grid *mg; + double uval; + int kind; + double scalegrad; + + memset(v, 0, sizeof(double) * ntot); + + p.x = i * s1 - c1; + p.y = j * s2 - c2; + p.z = k * s3 - c3; + + tp = geom_tree_search(p, geometry_tree, &oi); + if (tp && tp->objects[oi].o->material.which_subclass == MATERIAL_GRID) + mg = tp->objects[oi].o->material.subclass.material_grid_data; + else if (!tp && default_material.which_subclass == MATERIAL_GRID) + mg = default_material.subclass.material_grid_data; + else + goto gotmyv; /* no material grids at this point */ + + uval = matgrid_val(p, tp, oi, mg); + scalegrad = (mg->epsilon_max - mg->epsilon_min); + if ((kind = mg->material_grid_kind) == U_SUM) scalegrad /= matgrid_val_count; + + if (tp) { + do { + vector3 pb = to_geom_box_coords(p, &tp->objects[oi]); + vector3 sz = tp->objects[oi].o->material.subclass.material_grid_data->size; + double *vcur = v, *ucur; + for (ig = 0; ig < ngrids; ++ig) { + if (material_grid_equal(grids + ig, + tp->objects[oi].o->material.subclass.material_grid_data)) + break; + else + vcur += (int)(grids[ig].size.x * grids[ig].size.y * grids[ig].size.z); + } + CHECK(ig < ngrids, "bug in material_grid_gradient_point"); + ucur = material_grid_array(grids + ig); + add_interpolate_weights(pb.x, pb.y, pb.z, vcur, sz.x, sz.y, sz.z, 1, scalegrad, ucur, + kind, uval); + material_grid_array_release(grids + ig); + tp = geom_tree_search_next(p, tp, &oi); + } while (tp && compatible_matgrids(mg, &tp->objects[oi].o->material)); + } + if (!tp && compatible_matgrids(mg, &default_material)) { + vector3 pb; + vector3 sz = default_material.subclass.material_grid_data->size; + double *vcur = v, *ucur; + for (ig = 0; ig < ngrids; ++ig) { + if (material_grid_equal(grids + ig, default_material.subclass.material_grid_data)) + break; + else + vcur += (int)(grids[ig].size.x * grids[ig].size.y * grids[ig].size.z); + } + CHECK(ig < ngrids, "bug in material_grid_gradient_point"); + pb.x = no_size_x ? 0 : p.x / geometry_lattice.size.x; + pb.y = no_size_y ? 0 : p.y / geometry_lattice.size.y; + pb.z = no_size_z ? 0 : p.z / geometry_lattice.size.z; + ucur = material_grid_array(grids + ig); + add_interpolate_weights(pb.x, pb.y, pb.z, vcur, sz.x, sz.y, sz.z, 1, scalegrad, ucur, + kind, uval); + material_grid_array_release(grids + ig); + } + + gotmyv: + mpi_one_printf("depsdu:, %g, %d", mean_medium_from_matrix(mdata->eps_inv + index), index); + for (ig = 0; ig < ntot; ++ig) + mpi_one_printf(", %g", v[ig]); + mpi_one_printf("\n"); + } + } + + free(v); } -void print_material_grids_deps_du_numeric(double du) -{ - int i, j, k; - int n1 = mdata->nx, n2 = mdata->ny, n3 = mdata->nz; - int ngrids; - material_grid *grids = get_material_grids(geometry, &ngrids); - int ntot = material_grids_ntot(grids, ngrids); - double *u = (double *) malloc(sizeof(double) * ntot); - double *v = (double *) malloc(sizeof(double) * (n1*n2*n3) * ntot); - double *ep = (double *) malloc(sizeof(double) * (n1*n2*n3)); - double *foo; - int iu; - - material_grids_get(u, grids, ngrids); - reset_epsilon(); - - ep[0] = 1.234; - - for (i = 0; i < n1; ++i) - for (j = 0; j < n2; ++j) - for (k = 0; k < n3; ++k) - { - int index = ((i * n2 + j) * n3 + k); - ep[index] = mean_medium_from_matrix(mdata->eps_inv + index); - } - - for (iu = 0; iu < ntot; ++iu) { - u[iu] += du; - material_grids_set(u, grids, ngrids); - reset_epsilon(); - - for (i = 0; i < n1; ++i) - for (j = 0; j < n2; ++j) - for (k = 0; k < n3; ++k) - { - int index = ((i * n2 + j) * n3 + k); - double epn = - mean_medium_from_matrix(mdata->eps_inv + index); - v[index*ntot + iu] = (epn - ep[index]) / du; - } - u[iu] -= du; - } - - for (i = 0; i < n1; ++i) - for (j = 0; j < n2; ++j) - for (k = 0; k < n3; ++k) - { - int index = ((i * n2 + j) * n3 + k); - mpi_one_printf("depsduN:, %g, %d", ep[index], index); - for (iu = 0; iu < ntot; ++iu) - mpi_one_printf(", %g", v[index*ntot + iu]); - mpi_one_printf("\n"); - } - - material_grids_set(u, grids, ngrids); - reset_epsilon(); - - free(ep); - free(v); - free(u); +void print_material_grids_deps_du_numeric(double du) { + int i, j, k; + int n1 = mdata->nx, n2 = mdata->ny, n3 = mdata->nz; + int ngrids; + material_grid *grids = get_material_grids(geometry, &ngrids); + int ntot = material_grids_ntot(grids, ngrids); + double *u = (double *)malloc(sizeof(double) * ntot); + double *v = (double *)malloc(sizeof(double) * (n1 * n2 * n3) * ntot); + double *ep = (double *)malloc(sizeof(double) * (n1 * n2 * n3)); + double *foo; + int iu; + + material_grids_get(u, grids, ngrids); + reset_epsilon(); + + ep[0] = 1.234; + + for (i = 0; i < n1; ++i) + for (j = 0; j < n2; ++j) + for (k = 0; k < n3; ++k) { + int index = ((i * n2 + j) * n3 + k); + ep[index] = mean_medium_from_matrix(mdata->eps_inv + index); + } + + for (iu = 0; iu < ntot; ++iu) { + u[iu] += du; + material_grids_set(u, grids, ngrids); + reset_epsilon(); + + for (i = 0; i < n1; ++i) + for (j = 0; j < n2; ++j) + for (k = 0; k < n3; ++k) { + int index = ((i * n2 + j) * n3 + k); + double epn = mean_medium_from_matrix(mdata->eps_inv + index); + v[index * ntot + iu] = (epn - ep[index]) / du; + } + u[iu] -= du; + } + + for (i = 0; i < n1; ++i) + for (j = 0; j < n2; ++j) + for (k = 0; k < n3; ++k) { + int index = ((i * n2 + j) * n3 + k); + mpi_one_printf("depsduN:, %g, %d", ep[index], index); + for (iu = 0; iu < ntot; ++iu) + mpi_one_printf(", %g", v[index * ntot + iu]); + mpi_one_printf("\n"); + } + + material_grids_set(u, grids, ngrids); + reset_epsilon(); + + free(ep); + free(v); + free(u); } /**************************************************************************/ -static void synchronize_material_grid(material_grid *g) -{ - double *grid; - int n = ((int) g->size.x) * ((int) g->size.y) * ((int) g->size.z); - grid = material_grid_array(g); - MPI_Bcast(grid, n, MPI_DOUBLE, 0, mpb_comm); - material_grid_array_release(g); +static void synchronize_material_grid(material_grid *g) { + double *grid; + int n = ((int)g->size.x) * ((int)g->size.y) * ((int)g->size.z); + grid = material_grid_array(g); + MPI_Bcast(grid, n, MPI_DOUBLE, 0, mpb_comm); + material_grid_array_release(g); } -void randomize_material_gridB(material_grid g, number noise) -{ - if (mpi_is_master()) { - double *grid; - int i, n = ((int) g.size.x) * ((int) g.size.y) * ((int) g.size.z); - grid = material_grid_array(&g); - for (i = 0; i < n; ++i) { - double u = grid[i] + noise * (-1 + rand() * 2.0/RAND_MAX); - while (u < 0 || u > 1) { /* mirror boundary conditions */ - if (u > 1) u = 1 - u; - if (u < 0) u = -u; - } - grid[i] = u; - } - material_grid_array_release(&g); - } - synchronize_material_grid(&g); +void randomize_material_gridB(material_grid g, number noise) { + if (mpi_is_master()) { + double *grid; + int i, n = ((int)g.size.x) * ((int)g.size.y) * ((int)g.size.z); + grid = material_grid_array(&g); + for (i = 0; i < n; ++i) { + double u = grid[i] + noise * (-1 + rand() * 2.0 / RAND_MAX); + while (u < 0 || u > 1) { /* mirror boundary conditions */ + if (u > 1) u = 1 - u; + if (u < 0) u = -u; + } + grid[i] = u; + } + material_grid_array_release(&g); + } + synchronize_material_grid(&g); } /**************************************************************************/ -void save_material_grid(material_grid g, string filename) -{ - if (mpi_is_master()) { - matrixio_id file_id, data_id; - int dims[3], rank, start[3] = {0,0,0}; - double *grid; +void save_material_grid(material_grid g, string filename) { + if (mpi_is_master()) { + matrixio_id file_id, data_id; + int dims[3], rank, start[3] = {0, 0, 0}; + double *grid; - dims[0] = g.size.x; - dims[1] = g.size.y; - dims[2] = g.size.z; - rank = dims[2] == 1 ? (dims[1] == 1 ? 1 : 2) : 3; + dims[0] = g.size.x; + dims[1] = g.size.y; + dims[2] = g.size.z; + rank = dims[2] == 1 ? (dims[1] == 1 ? 1 : 2) : 3; - file_id = matrixio_create_serial(filename); - data_id = matrixio_create_dataset(file_id, "data", NULL, rank, dims); + file_id = matrixio_create_serial(filename); + data_id = matrixio_create_dataset(file_id, "data", NULL, rank, dims); - grid = material_grid_array(&g); - matrixio_write_real_data(data_id, dims, start, 1, grid); - material_grid_array_release(&g); + grid = material_grid_array(&g); + matrixio_write_real_data(data_id, dims, start, 1, grid); + material_grid_array_release(&g); - matrixio_close_dataset(data_id); - matrixio_close(file_id); - } + matrixio_close_dataset(data_id); + matrixio_close(file_id); + } } -void load_material_gridB(material_grid g, string filename, vector3 supercell) -{ - if (mpi_is_master()) { - matrixio_id file_id; - int dims[3] = {1,1,1}, rank = 3; - int nx, ny, nz, ix,iy,iz; - double *data, *grid; - double sx, sy, sz; - - file_id = matrixio_open_serial(filename, 1); - data = matrixio_read_real_data(file_id, "data", &rank,dims, 0,0,0,0); - CHECK(data, "couldn't find dataset in material grid file"); - matrixio_close(file_id); - - nx = g.size.x; - ny = g.size.y; - nz = g.size.z; - sx = supercell.x > 0 ? supercell.x : 1; - sy = supercell.y > 0 ? supercell.y : 1; - sz = supercell.z > 0 ? supercell.z : 1; - - grid = material_grid_array(&g); - for (ix = 0; ix < nx; ++ix) - for (iy = 0; iy < ny; ++iy) - for (iz = 0; iz < nz; ++iz) { - double dummy; - double x,y,z, val; - x = modf((ix + 0.5) * (sx / nx), &dummy); - y = modf((iy + 0.5) * (sy / ny), &dummy); - z = modf((iz + 0.5) * (sz / nz), &dummy); - val = linear_interpolate(x,y,z, data, - dims[0],dims[1],dims[2], 1); - if (val > 1) val = 1; - else if (val < 0) val = 0; - grid[(ix * ny + iy) * nz + iz] = val; - } - material_grid_array_release(&g); - - free(data); - } - synchronize_material_grid(&g); +void load_material_gridB(material_grid g, string filename, vector3 supercell) { + if (mpi_is_master()) { + matrixio_id file_id; + int dims[3] = {1, 1, 1}, rank = 3; + int nx, ny, nz, ix, iy, iz; + double *data, *grid; + double sx, sy, sz; + + file_id = matrixio_open_serial(filename, 1); + data = matrixio_read_real_data(file_id, "data", &rank, dims, 0, 0, 0, 0); + CHECK(data, "couldn't find dataset in material grid file"); + matrixio_close(file_id); + + nx = g.size.x; + ny = g.size.y; + nz = g.size.z; + sx = supercell.x > 0 ? supercell.x : 1; + sy = supercell.y > 0 ? supercell.y : 1; + sz = supercell.z > 0 ? supercell.z : 1; + + grid = material_grid_array(&g); + for (ix = 0; ix < nx; ++ix) + for (iy = 0; iy < ny; ++iy) + for (iz = 0; iz < nz; ++iz) { + double dummy; + double x, y, z, val; + x = modf((ix + 0.5) * (sx / nx), &dummy); + y = modf((iy + 0.5) * (sy / ny), &dummy); + z = modf((iz + 0.5) * (sz / nz), &dummy); + val = linear_interpolate(x, y, z, data, dims[0], dims[1], dims[2], 1); + if (val > 1) + val = 1; + else if (val < 0) + val = 0; + grid[(ix * ny + iy) * nz + iz] = val; + } + material_grid_array_release(&g); + + free(data); + } + synchronize_material_grid(&g); } /**************************************************************************/ @@ -793,252 +763,255 @@ void load_material_gridB(material_grid g, string filename, vector3 supercell) epsilon. */ #ifdef HAVE_NLOPT_H -# include +#include #endif typedef struct { - int eps_nx, eps_ny, eps_nz; - double *eps, *work; - material_grid *grids; - int ngrids; - int iter; + int eps_nx, eps_ny, eps_nz; + double *eps, *work; + material_grid *grids; + int ngrids; + int iter; } match_eps_data; -static double match_eps_func(int n, const double *u, double *grad, void *data) -{ - match_eps_data *d = (match_eps_data *) data; - double *eps = d->eps, *work = d->work; - int eps_nx = d->eps_nx, eps_ny = d->eps_ny, eps_nz = d->eps_nz; - material_grid *grids = d->grids; - int ngrids = d->ngrids; - double scaleby = 1.0 / H.N, val = 0; +static double match_eps_func(int n, const double *u, double *grad, void *data) { + match_eps_data *d = (match_eps_data *)data; + double *eps = d->eps, *work = d->work; + int eps_nx = d->eps_nx, eps_ny = d->eps_ny, eps_nz = d->eps_nz; + material_grid *grids = d->grids; + int ngrids = d->ngrids; + double scaleby = 1.0 / H.N, val = 0; - int i, j, k, n1, n2, n3, n_other, n_last, rank, last_dim; + int i, j, k, n1, n2, n3, n_other, n_last, rank, last_dim; #ifdef HAVE_MPI - int local_n2, local_y_start, local_n3; + int local_n2, local_y_start, local_n3; #endif - real s1, s2, s3, c1, c2, c3; - - material_grids_set(u, d->grids, d->ngrids); - reset_epsilon(); - if (grad) memset(work, 0, sizeof(double) * n); - d->iter++; - - n1 = mdata->nx; n2 = mdata->ny; n3 = mdata->nz; - n_other = mdata->other_dims; - n_last = mdata->last_dim_size / (sizeof(scalar_complex)/sizeof(scalar)); - last_dim = mdata->last_dim; - rank = (n3 == 1) ? (n2 == 1 ? 1 : 2) : 3; - - s1 = geometry_lattice.size.x / n1; - s2 = geometry_lattice.size.y / n2; - s3 = geometry_lattice.size.z / n3; - c1 = n1 <= 1 ? 0 : geometry_lattice.size.x * 0.5; - c2 = n2 <= 1 ? 0 : geometry_lattice.size.y * 0.5; - c3 = n3 <= 1 ? 0 : geometry_lattice.size.z * 0.5; - - /* Here we have different loops over the coordinates, depending - upon whether we are using complex or real and serial or - parallel transforms. Each loop must define, in its body, - variables (i2,j2,k2) describing the coordinate of the current - point, and "index" describing the corresponding index in - the curfield array. - - This was all stolen from fields.c...it would be better - if we didn't have to cut and paste, sigh. */ + real s1, s2, s3, c1, c2, c3; + + material_grids_set(u, d->grids, d->ngrids); + reset_epsilon(); + if (grad) memset(work, 0, sizeof(double) * n); + d->iter++; + + n1 = mdata->nx; + n2 = mdata->ny; + n3 = mdata->nz; + n_other = mdata->other_dims; + n_last = mdata->last_dim_size / (sizeof(scalar_complex) / sizeof(scalar)); + last_dim = mdata->last_dim; + rank = (n3 == 1) ? (n2 == 1 ? 1 : 2) : 3; + + s1 = geometry_lattice.size.x / n1; + s2 = geometry_lattice.size.y / n2; + s3 = geometry_lattice.size.z / n3; + c1 = n1 <= 1 ? 0 : geometry_lattice.size.x * 0.5; + c2 = n2 <= 1 ? 0 : geometry_lattice.size.y * 0.5; + c3 = n3 <= 1 ? 0 : geometry_lattice.size.z * 0.5; + + /* Here we have different loops over the coordinates, depending + upon whether we are using complex or real and serial or + parallel transforms. Each loop must define, in its body, + variables (i2,j2,k2) describing the coordinate of the current + point, and "index" describing the corresponding index in + the curfield array. + + This was all stolen from fields.c...it would be better + if we didn't have to cut and paste, sigh. */ #ifdef SCALAR_COMPLEX -# ifndef HAVE_MPI +#ifndef HAVE_MPI - for (i = 0; i < n1; ++i) - for (j = 0; j < n2; ++j) - for (k = 0; k < n3; ++k) - { - int i2 = i, j2 = j, k2 = k; - int index = ((i * n2 + j) * n3 + k); + for (i = 0; i < n1; ++i) + for (j = 0; j < n2; ++j) + for (k = 0; k < n3; ++k) { + int i2 = i, j2 = j, k2 = k; + int index = ((i * n2 + j) * n3 + k); -# else /* HAVE_MPI */ +#else /* HAVE_MPI */ - local_n2 = mdata->local_ny; - local_y_start = mdata->local_y_start; + local_n2 = mdata->local_ny; + local_y_start = mdata->local_y_start; - /* first two dimensions are transposed in MPI output: */ - for (j = 0; j < local_n2; ++j) - for (i = 0; i < n1; ++i) - for (k = 0; k < n3; ++k) - { - int i2 = i, j2 = j + local_y_start, k2 = k; - int index = ((j * n1 + i) * n3 + k); + /* first two dimensions are transposed in MPI output: */ + for (j = 0; j < local_n2; ++j) + for (i = 0; i < n1; ++i) + for (k = 0; k < n3; ++k) { + int i2 = i, j2 = j + local_y_start, k2 = k; + int index = ((j * n1 + i) * n3 + k); -# endif /* HAVE_MPI */ +#endif /* HAVE_MPI */ #else /* not SCALAR_COMPLEX */ -# ifndef HAVE_MPI - - for (i = 0; i < n_other; ++i) - for (j = 0; j < n_last; ++j) - { - int index = i * n_last + j; - int i2, j2, k2; - switch (rank) { - case 2: i2 = i; j2 = j; k2 = 0; break; - case 3: i2 = i / n2; j2 = i % n2; k2 = j; break; - default: i2 = j; j2 = k2 = 0; break; - } - -# else /* HAVE_MPI */ - - local_n2 = mdata->local_ny; - local_y_start = mdata->local_y_start; - - /* For a real->complex transform, the last dimension is cut in - half. For a 2d transform, this is taken into account in local_ny - already, but for a 3d transform we must compute the new n3: */ - if (n3 > 1) - local_n3 = mdata->last_dim_size / 2; - else - local_n3 = 1; - - /* first two dimensions are transposed in MPI output: */ - for (j = 0; j < local_n2; ++j) - for (i = 0; i < n1; ++i) - for (k = 0; k < local_n3; ++k) - { -# define i2 i - int j2 = j + local_y_start; -# define k2 k - int index = ((j * n1 + i) * local_n3 + k); - -# endif /* HAVE_MPI */ +#ifndef HAVE_MPI + + for (i = 0; i < n_other; ++i) + for (j = 0; j < n_last; ++j) { + int index = i * n_last + j; + int i2, j2, k2; + switch (rank) { + case 2: + i2 = i; + j2 = j; + k2 = 0; + break; + case 3: + i2 = i / n2; + j2 = i % n2; + k2 = j; + break; + default: + i2 = j; + j2 = k2 = 0; + break; + } + +#else /* HAVE_MPI */ + + local_n2 = mdata->local_ny; + local_y_start = mdata->local_y_start; + + /* For a real->complex transform, the last dimension is cut in + half. For a 2d transform, this is taken into account in local_ny + already, but for a 3d transform we must compute the new n3: */ + if (n3 > 1) + local_n3 = mdata->last_dim_size / 2; + else + local_n3 = 1; + + /* first two dimensions are transposed in MPI output: */ + for (j = 0; j < local_n2; ++j) + for (i = 0; i < n1; ++i) + for (k = 0; k < local_n3; ++k) { +#define i2 i + int j2 = j + local_y_start; +#define k2 k + int index = ((j * n1 + i) * local_n3 + k); + +#endif /* HAVE_MPI */ #endif /* not SCALAR_COMPLEX */ - { - real epsilon, eps0; - double scalegrad; - vector3 p; - - epsilon = mean_medium_from_matrix(mdata->eps_inv + index); - eps0 = linear_interpolate((i2 + 0.5) / n1, - (j2 + 0.5) / n2, - (k2 + 0.5) / n3, - eps, eps_nx, eps_ny, eps_nz, 1); - val += (epsilon - eps0) * (epsilon - eps0); - scalegrad = 2.0 * scaleby * (epsilon - eps0); - - if (grad) { - p.x = i2 * s1 - c1; p.y = j2 * s2 - c2; p.z = k2 * s3 - c3; - material_grids_addgradient_point(work, p, scalegrad, - grids, ngrids); - } + { + real epsilon, eps0; + double scalegrad; + vector3 p; + + epsilon = mean_medium_from_matrix(mdata->eps_inv + index); + eps0 = linear_interpolate((i2 + 0.5) / n1, (j2 + 0.5) / n2, (k2 + 0.5) / n3, eps, eps_nx, + eps_ny, eps_nz, 1); + val += (epsilon - eps0) * (epsilon - eps0); + scalegrad = 2.0 * scaleby * (epsilon - eps0); + + if (grad) { + p.x = i2 * s1 - c1; + p.y = j2 * s2 - c2; + p.z = k2 * s3 - c3; + material_grids_addgradient_point(work, p, scalegrad, grids, ngrids); + } #ifndef SCALAR_COMPLEX - { - int last_index; -# ifdef HAVE_MPI - if (n3 == 1) - last_index = j + local_y_start; - else - last_index = k; -# else - last_index = j; -# endif - - if (last_index != 0 && 2*last_index != last_dim) { - int i1c, i2c, i3c; - i1c = i2 ? (n1 - i2) : 0; - i2c = j2 ? (n2 - j2) : 0; - i3c = k2 ? (n3 - k2) : 0; - - eps0 = linear_interpolate((i1c + 0.5) / n1, - (i2c + 0.5) / n2, - (i3c + 0.5) / n3, eps, - eps_nx, eps_ny, eps_nz, 1); - val += (epsilon - eps0) * (epsilon - eps0); - - if (grad) { - p.x = i1c * s1 - c1; - p.y = i2c * s2 - c2; - p.z = i3c * s3 - c3; - - material_grids_addgradient_point(work, p, - scalegrad, - grids, ngrids); - } - } - } + { + int last_index; +#ifdef HAVE_MPI + if (n3 == 1) + last_index = j + local_y_start; + else + last_index = k; +#else + last_index = j; +#endif + + if (last_index != 0 && 2 * last_index != last_dim) { + int i1c, i2c, i3c; + i1c = i2 ? (n1 - i2) : 0; + i2c = j2 ? (n2 - j2) : 0; + i3c = k2 ? (n3 - k2) : 0; + + eps0 = linear_interpolate((i1c + 0.5) / n1, (i2c + 0.5) / n2, (i3c + 0.5) / n3, eps, + eps_nx, eps_ny, eps_nz, 1); + val += (epsilon - eps0) * (epsilon - eps0); + + if (grad) { + p.x = i1c * s1 - c1; + p.y = i2c * s2 - c2; + p.z = i3c * s3 - c3; + + material_grids_addgradient_point(work, p, scalegrad, grids, ngrids); + } + } + } #endif /* !SCALAR_COMPLEX */ - } - } - if (grad) /* gradient w.r.t. epsilon needs to be summed over processes */ - mpi_allreduce(work, grad, n, double, MPI_DOUBLE, - MPI_SUM, mpb_comm); - { - double valtmp = val * scaleby; - mpi_allreduce(&valtmp, &val, 1, double, MPI_DOUBLE, - MPI_SUM, mpb_comm); - } - mpi_one_printf("match-epsilon-file:, %d, %g\n", d->iter, sqrt(val)); - return val; + } + } + if (grad) /* gradient w.r.t. epsilon needs to be summed over processes */ + mpi_allreduce(work, grad, n, double, MPI_DOUBLE, MPI_SUM, mpb_comm); + { + double valtmp = val * scaleby; + mpi_allreduce(&valtmp, &val, 1, double, MPI_DOUBLE, MPI_SUM, mpb_comm); + } + mpi_one_printf("match-epsilon-file:, %d, %g\n", d->iter, sqrt(val)); + return val; } -void material_grids_match_epsilon_fileB(string filename, number eps_tol) -{ - int dims[3] = {1,1,1}, rank = 3; - matrixio_id file_id; - match_eps_data d; - int i, n, have_uprod; - double *u, *lb, *ub, *u_tol, func_min = 0; - - file_id = matrixio_open_serial(filename, 1); - d.eps = matrixio_read_real_data(file_id, NULL, &rank,dims, 0,0,0,0); - CHECK(d.eps, "couldn't find dataset in epsilon file"); - matrixio_close(file_id); - - d.eps_nx = dims[0]; - d.eps_ny = dims[1]; - d.eps_nz = dims[2]; - - d.grids = get_material_grids(geometry, &d.ngrids); - d.iter = 0; - - n = material_grids_ntot(d.grids, d.ngrids); - u = (double *) malloc(sizeof(double) * n * 5); - lb = u + n; ub = lb + n; u_tol = ub + n; d.work = u_tol + n; - - material_grids_get(u, d.grids, d.ngrids); - - for (i = 0; i < d.ngrids && d.grids[i].material_grid_kind != U_PROD; ++i); - have_uprod = i < d.ngrids; - for (i = 0; i < n; ++i) { - ub[i] = 1; - u_tol[i] = eps_tol; - lb[i] = have_uprod ? 1e-4 : 0; - if (u[i] < lb[i]) u[i] = lb[i]; - } +void material_grids_match_epsilon_fileB(string filename, number eps_tol) { + int dims[3] = {1, 1, 1}, rank = 3; + matrixio_id file_id; + match_eps_data d; + int i, n, have_uprod; + double *u, *lb, *ub, *u_tol, func_min = 0; + + file_id = matrixio_open_serial(filename, 1); + d.eps = matrixio_read_real_data(file_id, NULL, &rank, dims, 0, 0, 0, 0); + CHECK(d.eps, "couldn't find dataset in epsilon file"); + matrixio_close(file_id); + + d.eps_nx = dims[0]; + d.eps_ny = dims[1]; + d.eps_nz = dims[2]; + + d.grids = get_material_grids(geometry, &d.ngrids); + d.iter = 0; + + n = material_grids_ntot(d.grids, d.ngrids); + u = (double *)malloc(sizeof(double) * n * 5); + lb = u + n; + ub = lb + n; + u_tol = ub + n; + d.work = u_tol + n; + + material_grids_get(u, d.grids, d.ngrids); + + for (i = 0; i < d.ngrids && d.grids[i].material_grid_kind != U_PROD; ++i) + ; + have_uprod = i < d.ngrids; + for (i = 0; i < n; ++i) { + ub[i] = 1; + u_tol[i] = eps_tol; + lb[i] = have_uprod ? 1e-4 : 0; + if (u[i] < lb[i]) u[i] = lb[i]; + } #if defined(HAVE_NLOPT_H) && defined(HAVE_NLOPT) - { - nlopt_result res; - res = nlopt_minimize(NLOPT_LD_MMA, n, match_eps_func, &d, - lb, ub, u, &func_min, - -HUGE_VAL, 0,0, 0,u_tol, 0,0); - CHECK(res > 0, "failure of nlopt_minimize"); - } + { + nlopt_result res; + res = nlopt_minimize(NLOPT_LD_MMA, n, match_eps_func, &d, lb, ub, u, &func_min, -HUGE_VAL, 0, 0, + 0, u_tol, 0, 0); + CHECK(res > 0, "failure of nlopt_minimize"); + } #else - CHECK(0, "nlopt library is required for match-epsilon-file"); + CHECK(0, "nlopt library is required for match-epsilon-file"); #endif - material_grids_set(u, d.grids, d.ngrids); - reset_epsilon(); + material_grids_set(u, d.grids, d.ngrids); + reset_epsilon(); - mpi_one_printf("match-epsilon-file converged to %g after %d iterations\n", - sqrt(func_min), d.iter); + mpi_one_printf("match-epsilon-file converged to %g after %d iterations\n", sqrt(func_min), + d.iter); - free(u); - free(d.eps); + free(u); + free(d.eps); } /**************************************************************************/ diff --git a/mpb/material_grid_opt.c b/mpb/material_grid_opt.c index 1d8fcd0a..30689f4b 100644 --- a/mpb/material_grid_opt.c +++ b/mpb/material_grid_opt.c @@ -29,128 +29,118 @@ #include "mpb.h" #ifdef HAVE_NLOPT_H -# include +#include #endif /**************************************************************************/ /* minimizing the TE/TM difference in frequency */ typedef struct { - vector3 k; /* which k point */ - int b; /* which band */ - int ngrids; - material_grid *grids; - int iter; - struct maxwell_data *mdata1, *mdata2; - double *work; /* work array of length ntot */ + vector3 k; /* which k point */ + int b; /* which band */ + int ngrids; + material_grid *grids; + int iter; + struct maxwell_data *mdata1, *mdata2; + double *work; /* work array of length ntot */ } mindiff_func_data; -static double mindiff_func(int n, const double *u, double *grad, void *data) -{ - mindiff_func_data *d = (mindiff_func_data *) data; - double *work = d->work; - double gap, f1, f2; +static double mindiff_func(int n, const double *u, double *grad, void *data) { + mindiff_func_data *d = (mindiff_func_data *)data; + double *work = d->work; + double gap, f1, f2; - /* set the material grids, for use in the solver - and also for outputting in verbose mode */ - d->iter++; - material_grids_set(u, d->grids, d->ngrids); - reset_epsilon(); - if (grad) memset(work, 0, sizeof(double) * (n-2)); + /* set the material grids, for use in the solver + and also for outputting in verbose mode */ + d->iter++; + material_grids_set(u, d->grids, d->ngrids); + reset_epsilon(); + if (grad) memset(work, 0, sizeof(double) * (n - 2)); - set_maxwell_data_parity(mdata, EVEN_Z_PARITY); - randomize_fields(); - solve_kpoint(d->k); - gap = (f2 = freqs.items[d->b-1]); - if (grad) { - material_grids_addgradient(work, 1.0, d->b, - d->grids, d->ngrids); - } + set_maxwell_data_parity(mdata, EVEN_Z_PARITY); + randomize_fields(); + solve_kpoint(d->k); + gap = (f2 = freqs.items[d->b - 1]); + if (grad) { material_grids_addgradient(work, 1.0, d->b, d->grids, d->ngrids); } - set_maxwell_data_parity(mdata, ODD_Z_PARITY); - randomize_fields(); - solve_kpoint(d->k); - gap -= (f1 = freqs.items[d->b-1]); - if (grad) { - material_grids_addgradient(work, -1.0, d->b, - d->grids, d->ngrids); - } + set_maxwell_data_parity(mdata, ODD_Z_PARITY); + randomize_fields(); + solve_kpoint(d->k); + gap -= (f1 = freqs.items[d->b - 1]); + if (grad) { material_grids_addgradient(work, -1.0, d->b, d->grids, d->ngrids); } - if (grad) /* gradient w.r.t. epsilon needs to be summed over processes */ - mpi_allreduce(work, grad, n, double, MPI_DOUBLE, - MPI_SUM, mpb_comm); + if (grad) /* gradient w.r.t. epsilon needs to be summed over processes */ + mpi_allreduce(work, grad, n, double, MPI_DOUBLE, MPI_SUM, mpb_comm); - mpi_one_printf("material-grid-mindiff:, %d, %g, %g, %0.15g\n", - d->iter, f1, f2, gap); + mpi_one_printf("material-grid-mindiff:, %d, %g, %g, %0.15g\n", d->iter, f1, f2, gap); - return gap; + return gap; } -number material_grids_min_tetm_gap(vector3 kpoint, integer band, - number func_tol, number eps_tol, - integer maxeval, number maxtime) -{ - mindiff_func_data d; - int i, n; - double *u, *lb, *ub, *u_tol, func_min; - int have_uprod; - - CHECK(band <= num_bands, "invalid band number in material-grid-min-tetm-gap"); - d.k = kpoint; - d.b = band; - d.grids = get_material_grids(geometry, &d.ngrids); - d.iter = 0; - n = material_grids_ntot(d.grids, d.ngrids); - u = (double *) malloc(sizeof(double) * n * 5); - lb = u + n; ub = lb + n; u_tol = ub + n; d.work = u_tol + n; - - material_grids_get(u, d.grids, d.ngrids); - - for (i = 0; i < d.ngrids && d.grids[i].material_grid_kind != U_PROD; ++i); - have_uprod = i < d.ngrids; - for (i = 0; i < n; ++i) { - ub[i] = 1; - u_tol[i] = eps_tol; - /* bound u slightly about 0 for uprod grids, as when u=0 - the gradient is problematic (especially for multiple u's = 0 */ - lb[i] = have_uprod ? 1e-4 : 0; - if (u[i] < lb[i]) u[i] = lb[i]; - } +number material_grids_min_tetm_gap(vector3 kpoint, integer band, number func_tol, number eps_tol, + integer maxeval, number maxtime) { + mindiff_func_data d; + int i, n; + double *u, *lb, *ub, *u_tol, func_min; + int have_uprod; + + CHECK(band <= num_bands, "invalid band number in material-grid-min-tetm-gap"); + d.k = kpoint; + d.b = band; + d.grids = get_material_grids(geometry, &d.ngrids); + d.iter = 0; + n = material_grids_ntot(d.grids, d.ngrids); + u = (double *)malloc(sizeof(double) * n * 5); + lb = u + n; + ub = lb + n; + u_tol = ub + n; + d.work = u_tol + n; + + material_grids_get(u, d.grids, d.ngrids); + + for (i = 0; i < d.ngrids && d.grids[i].material_grid_kind != U_PROD; ++i) + ; + have_uprod = i < d.ngrids; + for (i = 0; i < n; ++i) { + ub[i] = 1; + u_tol[i] = eps_tol; + /* bound u slightly about 0 for uprod grids, as when u=0 + the gradient is problematic (especially for multiple u's = 0 */ + lb[i] = have_uprod ? 1e-4 : 0; + if (u[i] < lb[i]) u[i] = lb[i]; + } #if defined(HAVE_NLOPT_H) && defined(HAVE_NLOPT) - { - nlopt_result res; - res = nlopt_minimize_constrained( - NLOPT_LD_MMA, n, mindiff_func, &d, - 0, NULL, NULL, 0, - lb, ub, u, &func_min, - -HUGE_VAL, func_tol,0, 0,u_tol, maxeval,maxtime); - CHECK(res > 0, "failure of nlopt_minimize"); - } + { + nlopt_result res; + res = nlopt_minimize_constrained(NLOPT_LD_MMA, n, mindiff_func, &d, 0, NULL, NULL, 0, lb, ub, u, + &func_min, -HUGE_VAL, func_tol, 0, 0, u_tol, maxeval, maxtime); + CHECK(res > 0, "failure of nlopt_minimize"); + } #else - CHECK(0, "nlopt library is required for material-grid-maxgap"); + CHECK(0, "nlopt library is required for material-grid-maxgap"); #endif - func_min = mindiff_func(n, u, NULL, &d); + func_min = mindiff_func(n, u, NULL, &d); - free(u); - free(d.grids); + free(u); + free(d.grids); - return func_min; + return func_min; } /**************************************************************************/ /* optimization of band gaps as a function of the material grid */ typedef struct { - boolean do_min; - vector3_list ks; - int b1, b2; - int ngrids; - material_grid *grids; - int iter, unsolved; - double *f1s, *f2s; /* arrays of length ks.num_items for freqs */ - double *work; /* work array of length ntot */ + boolean do_min; + vector3_list ks; + int b1, b2; + int ngrids; + material_grid *grids; + int iter, unsolved; + double *f1s, *f2s; /* arrays of length ks.num_items for freqs */ + double *work; /* work array of length ntot */ } maxgap_func_data; /* the constraint is either an upper bound for band b1 @@ -158,228 +148,213 @@ typedef struct { typedef enum { BAND1_CONSTRAINT = 0, BAND2_CONSTRAINT = 1 } band_constraint_kind; typedef struct { - maxgap_func_data *d; - int ik; /* index of k point for this constraint (in d->ks) */ - band_constraint_kind kind; + maxgap_func_data *d; + int ik; /* index of k point for this constraint (in d->ks) */ + band_constraint_kind kind; } band_constraint_data; -static double band_constraint(int n, const double *u, double *grad, void *data) -{ - band_constraint_data *cdata = (band_constraint_data *) data; - maxgap_func_data *d = cdata->d; - int ik = cdata->ik; - int kind = cdata->kind; - double *work = d->work; - double val = 0; - - /* Strictly speaking, we should call material_grids_set here. However - we rely on an implementation detail of our MMA code: it always - evaluates the objective function before evaluating the constraints, - and hence we can set the material_grids once in the objective. */ - - /* We will typically have more than one band per k-point - (typically 2 bands), and we don't need to call solve_kpoint - more than once per band. Here we exploit the fact that our - MMA code always calls all the constraints at once (in - sequence); it never changes u in between one constraint & the next. */ - if (!vector3_equal(cur_kvector, d->ks.items[ik]) || d->unsolved) { - randomize_fields(); - solve_kpoint(d->ks.items[ik]); - } - d->unsolved = 0; - - if (grad) memset(work, 0, sizeof(double) * (n-2)); - if (kind == BAND1_CONSTRAINT) { - if (grad) { - material_grids_addgradient(work, 1.0, d->b1, - d->grids, d->ngrids); - grad[n-1] = -1; - grad[n-2] = 0; - } - val = (d->f1s[ik] = freqs.items[d->b1-1]) - u[n-1]; - } - else { - if (grad) { - material_grids_addgradient(work, -1.0, d->b2, - d->grids, d->ngrids); - grad[n-1] = 0; - grad[n-2] = 1; - } - val = u[n-2] - (d->f2s[ik] = freqs.items[d->b2-1]); - } - if (grad) /* gradient w.r.t. epsilon needs to be summed over processes */ - mpi_allreduce(work, grad, n-2, double, MPI_DOUBLE, - MPI_SUM, mpb_comm); - - return val; +static double band_constraint(int n, const double *u, double *grad, void *data) { + band_constraint_data *cdata = (band_constraint_data *)data; + maxgap_func_data *d = cdata->d; + int ik = cdata->ik; + int kind = cdata->kind; + double *work = d->work; + double val = 0; + + /* Strictly speaking, we should call material_grids_set here. However + we rely on an implementation detail of our MMA code: it always + evaluates the objective function before evaluating the constraints, + and hence we can set the material_grids once in the objective. */ + + /* We will typically have more than one band per k-point + (typically 2 bands), and we don't need to call solve_kpoint + more than once per band. Here we exploit the fact that our + MMA code always calls all the constraints at once (in + sequence); it never changes u in between one constraint & the next. */ + if (!vector3_equal(cur_kvector, d->ks.items[ik]) || d->unsolved) { + randomize_fields(); + solve_kpoint(d->ks.items[ik]); + } + d->unsolved = 0; + + if (grad) memset(work, 0, sizeof(double) * (n - 2)); + if (kind == BAND1_CONSTRAINT) { + if (grad) { + material_grids_addgradient(work, 1.0, d->b1, d->grids, d->ngrids); + grad[n - 1] = -1; + grad[n - 2] = 0; + } + val = (d->f1s[ik] = freqs.items[d->b1 - 1]) - u[n - 1]; + } + else { + if (grad) { + material_grids_addgradient(work, -1.0, d->b2, d->grids, d->ngrids); + grad[n - 1] = 0; + grad[n - 2] = 1; + } + val = u[n - 2] - (d->f2s[ik] = freqs.items[d->b2 - 1]); + } + if (grad) /* gradient w.r.t. epsilon needs to be summed over processes */ + mpi_allreduce(work, grad, n - 2, double, MPI_DOUBLE, MPI_SUM, mpb_comm); + + return val; } -static double maxgap_func(int n, const double *u, double *grad, void *data) -{ - maxgap_func_data *d = (maxgap_func_data *) data; - double gap, f1 = u[n-1], f2 = u[n-2]; - - /* set the material grids, for use in the constraint functions - and also for outputting in verbose mode */ - material_grids_set(u, d->grids, d->ngrids); - reset_epsilon(); - d->iter++; - d->unsolved = 1; - - gap = (f2 - f1) * 2.0 / (f1 + f2); - - if (grad) { - memset(grad, 0, sizeof(double) * (n-2)); - grad[n-1] = 2.0 * ((f1 + f2) - (f1 - f2)) / ((f1+f2)*(f1+f2)); - grad[n-2] = 2.0 * (-(f1 + f2) - (f1 - f2)) / ((f1+f2)*(f1+f2)); - if (d->do_min) { - grad[n-1] = -grad[n-1]; - grad[n-2] = -grad[n-2]; - } - } - - mpi_one_printf("material-grid-%sgap:, %d, %g, %g, %0.15g\n", - d->do_min ? "min" : "max", d->iter, f1, f2, gap); - - if (verbose) { - char prefix[256]; - get_epsilon(); - snprintf(prefix, 256, "%sgap-%04d-", - d->do_min ? "min" : "max", d->iter); - output_field_to_file(-1, prefix); - } - - return d->do_min ? gap : -gap; +static double maxgap_func(int n, const double *u, double *grad, void *data) { + maxgap_func_data *d = (maxgap_func_data *)data; + double gap, f1 = u[n - 1], f2 = u[n - 2]; + + /* set the material grids, for use in the constraint functions + and also for outputting in verbose mode */ + material_grids_set(u, d->grids, d->ngrids); + reset_epsilon(); + d->iter++; + d->unsolved = 1; + + gap = (f2 - f1) * 2.0 / (f1 + f2); + + if (grad) { + memset(grad, 0, sizeof(double) * (n - 2)); + grad[n - 1] = 2.0 * ((f1 + f2) - (f1 - f2)) / ((f1 + f2) * (f1 + f2)); + grad[n - 2] = 2.0 * (-(f1 + f2) - (f1 - f2)) / ((f1 + f2) * (f1 + f2)); + if (d->do_min) { + grad[n - 1] = -grad[n - 1]; + grad[n - 2] = -grad[n - 2]; + } + } + + mpi_one_printf("material-grid-%sgap:, %d, %g, %g, %0.15g\n", d->do_min ? "min" : "max", d->iter, + f1, f2, gap); + + if (verbose) { + char prefix[256]; + get_epsilon(); + snprintf(prefix, 256, "%sgap-%04d-", d->do_min ? "min" : "max", d->iter); + output_field_to_file(-1, prefix); + } + + return d->do_min ? gap : -gap; } -static number material_grids_maxmin_gap(boolean do_min, - vector3_list kpoints, - integer band1, integer band2, - number func_tol, number eps_tol, - integer maxeval, number maxtime) -{ - maxgap_func_data d; - int i, n; - double *u, *lb, *ub, *u_tol, func_min; - band_constraint_data *cdata; - int have_uprod; - - CHECK(band1>0 && band1 <= num_bands && band2>0 && band2 <= num_bands, - "invalid band numbers in material-grid-maxgap"); - d.ks = kpoints; - d.b1 = band1; d.b2 = band2; - d.grids = get_material_grids(geometry, &d.ngrids); - d.iter = 0; - d.unsolved = 1; - d.do_min = do_min; - d.f1s = (double *) malloc(sizeof(double) * kpoints.num_items*2); - d.f2s = d.f1s + kpoints.num_items; - - n = material_grids_ntot(d.grids, d.ngrids) + 2; - u = (double *) malloc(sizeof(double) * n * 5); - lb = u + n; ub = lb + n; u_tol = ub + n; d.work = u_tol + n; - - material_grids_get(u, d.grids, d.ngrids); - u[n-1] = 0; /* band1 max */ - u[n-2] = HUGE_VAL; /* band2 min */ - - cdata = (band_constraint_data*) malloc(sizeof(band_constraint_data) - * kpoints.num_items*2); - for (i = 0; i < kpoints.num_items; ++i) { - band_constraint_kind kind; - for (kind = BAND1_CONSTRAINT; kind <= BAND2_CONSTRAINT; ++kind) { - cdata[2*i + kind].d = &d; - cdata[2*i + kind].ik = i; - cdata[2*i + kind].kind = kind; - - /* compute initial band min/max */ - band_constraint(n, u, NULL, &cdata[2*i + kind]); - if (kind == BAND1_CONSTRAINT && d.f1s[i] > u[n-1]) - u[n-1] = d.f1s[i]; - else if (kind == BAND2_CONSTRAINT && d.f2s[i] < u[n-2]) - u[n-2] = d.f2s[i]; - } - } - u[n-1] *= 1.001; u[n-2] /= 1.001; /* ensure feasibility of initial u */ - - for (i = 0; i < d.ngrids && d.grids[i].material_grid_kind != U_PROD; ++i); - have_uprod = i < d.ngrids; - for (i = 0; i < n-2; ++i) { - ub[i] = 1; - u_tol[i] = eps_tol; - /* bound u slightly about 0 for uprod grids, as when u=0 - the gradient is problematic (especially for multiple u's = 0 */ - lb[i] = have_uprod ? 1e-4 : 0; - if (u[i] < lb[i]) u[i] = lb[i]; - } - u_tol[n-1] = u_tol[n-2] = 0; - lb[n-1] = lb[n-2] = 0; - ub[n-1] = ub[n-2] = HUGE_VAL; +static number material_grids_maxmin_gap(boolean do_min, vector3_list kpoints, integer band1, + integer band2, number func_tol, number eps_tol, + integer maxeval, number maxtime) { + maxgap_func_data d; + int i, n; + double *u, *lb, *ub, *u_tol, func_min; + band_constraint_data *cdata; + int have_uprod; + + CHECK(band1 > 0 && band1 <= num_bands && band2 > 0 && band2 <= num_bands, + "invalid band numbers in material-grid-maxgap"); + d.ks = kpoints; + d.b1 = band1; + d.b2 = band2; + d.grids = get_material_grids(geometry, &d.ngrids); + d.iter = 0; + d.unsolved = 1; + d.do_min = do_min; + d.f1s = (double *)malloc(sizeof(double) * kpoints.num_items * 2); + d.f2s = d.f1s + kpoints.num_items; + + n = material_grids_ntot(d.grids, d.ngrids) + 2; + u = (double *)malloc(sizeof(double) * n * 5); + lb = u + n; + ub = lb + n; + u_tol = ub + n; + d.work = u_tol + n; + + material_grids_get(u, d.grids, d.ngrids); + u[n - 1] = 0; /* band1 max */ + u[n - 2] = HUGE_VAL; /* band2 min */ + + cdata = (band_constraint_data *)malloc(sizeof(band_constraint_data) * kpoints.num_items * 2); + for (i = 0; i < kpoints.num_items; ++i) { + band_constraint_kind kind; + for (kind = BAND1_CONSTRAINT; kind <= BAND2_CONSTRAINT; ++kind) { + cdata[2 * i + kind].d = &d; + cdata[2 * i + kind].ik = i; + cdata[2 * i + kind].kind = kind; + + /* compute initial band min/max */ + band_constraint(n, u, NULL, &cdata[2 * i + kind]); + if (kind == BAND1_CONSTRAINT && d.f1s[i] > u[n - 1]) + u[n - 1] = d.f1s[i]; + else if (kind == BAND2_CONSTRAINT && d.f2s[i] < u[n - 2]) + u[n - 2] = d.f2s[i]; + } + } + u[n - 1] *= 1.001; + u[n - 2] /= 1.001; /* ensure feasibility of initial u */ + + for (i = 0; i < d.ngrids && d.grids[i].material_grid_kind != U_PROD; ++i) + ; + have_uprod = i < d.ngrids; + for (i = 0; i < n - 2; ++i) { + ub[i] = 1; + u_tol[i] = eps_tol; + /* bound u slightly about 0 for uprod grids, as when u=0 + the gradient is problematic (especially for multiple u's = 0 */ + lb[i] = have_uprod ? 1e-4 : 0; + if (u[i] < lb[i]) u[i] = lb[i]; + } + u_tol[n - 1] = u_tol[n - 2] = 0; + lb[n - 1] = lb[n - 2] = 0; + ub[n - 1] = ub[n - 2] = HUGE_VAL; #if defined(HAVE_NLOPT_H) && defined(HAVE_NLOPT) - { - nlopt_result res; - extern int mma_verbose; - mma_verbose = kpoints.num_items*2; - res = nlopt_minimize_constrained( - NLOPT_LD_MMA, n, maxgap_func, &d, - kpoints.num_items*2, band_constraint, - cdata, sizeof(band_constraint_data), - lb, ub, u, &func_min, - -HUGE_VAL, func_tol,0, 0,u_tol, maxeval,maxtime); - CHECK(res > 0, "failure of nlopt_minimize"); - } + { + nlopt_result res; + extern int mma_verbose; + mma_verbose = kpoints.num_items * 2; + res = + nlopt_minimize_constrained(NLOPT_LD_MMA, n, maxgap_func, &d, kpoints.num_items * 2, + band_constraint, cdata, sizeof(band_constraint_data), lb, ub, u, + &func_min, -HUGE_VAL, func_tol, 0, 0, u_tol, maxeval, maxtime); + CHECK(res > 0, "failure of nlopt_minimize"); + } #else - CHECK(0, "nlopt library is required for material-grid-maxgap"); + CHECK(0, "nlopt library is required for material-grid-maxgap"); #endif - maxgap_func(n, u, NULL, &d); - - /* recompute bands and get actual gap size */ - - u[n-1] = 0; /* band1 max */ - u[n-2] = HUGE_VAL; /* band2 min */ - for (i = 0; i < kpoints.num_items; ++i) { - band_constraint_kind kind; - for (kind = BAND1_CONSTRAINT; kind <= BAND2_CONSTRAINT; ++kind) { - band_constraint(n, u, NULL, &cdata[2*i + kind]); - if (kind == BAND1_CONSTRAINT && d.f1s[i] > u[n-1]) - u[n-1] = d.f1s[i]; - else if (kind == BAND2_CONSTRAINT && d.f2s[i] < u[n-2]) - u[n-2] = d.f2s[i]; - } - } - - func_min = (u[n-2] - u[n-1]) * 2.0 / (u[n-1] + u[n-2]); - mpi_one_printf("material-grid-%sgap:, %d, %g, %g, %0.15g\n", - d.do_min ? "min" : "max", d.iter+1, - u[n-1], u[n-2], func_min); - func_min = d.do_min ? func_min : -func_min; - - free(cdata); - free(u); - free(d.grids); - free(d.f1s); - - return(do_min ? func_min : -func_min); + maxgap_func(n, u, NULL, &d); + + /* recompute bands and get actual gap size */ + + u[n - 1] = 0; /* band1 max */ + u[n - 2] = HUGE_VAL; /* band2 min */ + for (i = 0; i < kpoints.num_items; ++i) { + band_constraint_kind kind; + for (kind = BAND1_CONSTRAINT; kind <= BAND2_CONSTRAINT; ++kind) { + band_constraint(n, u, NULL, &cdata[2 * i + kind]); + if (kind == BAND1_CONSTRAINT && d.f1s[i] > u[n - 1]) + u[n - 1] = d.f1s[i]; + else if (kind == BAND2_CONSTRAINT && d.f2s[i] < u[n - 2]) + u[n - 2] = d.f2s[i]; + } + } + + func_min = (u[n - 2] - u[n - 1]) * 2.0 / (u[n - 1] + u[n - 2]); + mpi_one_printf("material-grid-%sgap:, %d, %g, %g, %0.15g\n", d.do_min ? "min" : "max", d.iter + 1, + u[n - 1], u[n - 2], func_min); + func_min = d.do_min ? func_min : -func_min; + + free(cdata); + free(u); + free(d.grids); + free(d.f1s); + + return (do_min ? func_min : -func_min); } -number material_grids_maxgap(vector3_list kpoints, - integer band1, integer band2, - number func_tol, number eps_tol, - integer maxeval, number maxtime) -{ - return material_grids_maxmin_gap(0, kpoints, band1, band2, - func_tol, eps_tol, maxeval, maxtime); +number material_grids_maxgap(vector3_list kpoints, integer band1, integer band2, number func_tol, + number eps_tol, integer maxeval, number maxtime) { + return material_grids_maxmin_gap(0, kpoints, band1, band2, func_tol, eps_tol, maxeval, maxtime); } -number material_grids_mingap(vector3_list kpoints, - integer band1, integer band2, - number func_tol, number eps_tol, - integer maxeval, number maxtime) -{ - return material_grids_maxmin_gap(1, kpoints, band1, band2, - func_tol, eps_tol, maxeval, maxtime); +number material_grids_mingap(vector3_list kpoints, integer band1, integer band2, number func_tol, + number eps_tol, integer maxeval, number maxtime) { + return material_grids_maxmin_gap(1, kpoints, band1, band2, func_tol, eps_tol, maxeval, maxtime); } /**************************************************************************/ diff --git a/mpb/matrix-smob.c b/mpb/matrix-smob.c index fb4a2711..18696a1c 100644 --- a/mpb/matrix-smob.c +++ b/mpb/matrix-smob.c @@ -32,46 +32,44 @@ #ifndef HAVE_SCM_MAKE_SMOB_TYPE /* null mark function, for smobs containing no SCM objects */ -static SCM mark_null(SCM obj) { (void) obj; return SCM_BOOL_F; } +static SCM mark_null(SCM obj) { + (void)obj; + return SCM_BOOL_F; +} #endif /*************************************************************************/ long scm_tc16_smob_evectmatrix = 0; -static SCM evectmatrix_p(SCM obj) -{ - return ctl_convert_boolean_to_scm(EVECTMATRIX_P(obj)); -} +static SCM evectmatrix_p(SCM obj) { return ctl_convert_boolean_to_scm(EVECTMATRIX_P(obj)); } -static int print_evectmatrix(SCM obj, SCM port, scm_print_state *pstate) -{ - char buf[256]; - evectmatrix *pm = EVECTMATRIX(obj); - (void) pstate; /* unused argument */ +static int print_evectmatrix(SCM obj, SCM port, scm_print_state *pstate) { + char buf[256]; + evectmatrix *pm = EVECTMATRIX(obj); + (void)pstate; /* unused argument */ - scm_puts("#N, pm->c, pm->p); - scm_puts(buf, port); + scm_puts("#N, pm->c, pm->p); + scm_puts(buf, port); #ifdef SCALAR_COMPLEX - scm_puts(" complex", port); + scm_puts(" complex", port); #else - scm_puts(" real", port); + scm_puts(" real", port); #endif - if (pm->localN < pm->N) { - sprintf(buf, ", (%dx%d)x%d local", pm->localN, pm->c, pm->p); - scm_puts(buf, port); - } - scm_putc('>', port); - return 1; + if (pm->localN < pm->N) { + sprintf(buf, ", (%dx%d)x%d local", pm->localN, pm->c, pm->p); + scm_puts(buf, port); + } + scm_putc('>', port); + return 1; } -static size_t free_evectmatrix(SCM obj) -{ - evectmatrix *pm = EVECTMATRIX(obj); - destroy_evectmatrix(*pm); - free(pm); - return 0; +static size_t free_evectmatrix(SCM obj) { + evectmatrix *pm = EVECTMATRIX(obj); + destroy_evectmatrix(*pm); + free(pm); + return 0; } #define mark_evectmatrix mark_null @@ -80,39 +78,34 @@ static size_t free_evectmatrix(SCM obj) long scm_tc16_smob_sqmatrix = 0; -static SCM sqmatrix_p(SCM obj) -{ - return ctl_convert_boolean_to_scm(SQMATRIX_P(obj)); -} +static SCM sqmatrix_p(SCM obj) { return ctl_convert_boolean_to_scm(SQMATRIX_P(obj)); } -static int print_sqmatrix(SCM obj, SCM port, scm_print_state *pstate) -{ - char buf[256]; - sqmatrix *pm = SQMATRIX(obj); - (void) pstate; /* unused argument */ +static int print_sqmatrix(SCM obj, SCM port, scm_print_state *pstate) { + char buf[256]; + sqmatrix *pm = SQMATRIX(obj); + (void)pstate; /* unused argument */ - scm_puts("#p, pm->p); - scm_puts(buf, port); + scm_puts("#p, pm->p); + scm_puts(buf, port); #ifdef SCALAR_COMPLEX - scm_puts(" complex", port); + scm_puts(" complex", port); #else - scm_puts(" real", port); + scm_puts(" real", port); #endif - if (pm->alloc_p > pm->p) { - sprintf(buf, ", %dx%d alloc", pm->alloc_p, pm->alloc_p); - scm_puts(buf, port); - } - scm_putc('>', port); - return 1; + if (pm->alloc_p > pm->p) { + sprintf(buf, ", %dx%d alloc", pm->alloc_p, pm->alloc_p); + scm_puts(buf, port); + } + scm_putc('>', port); + return 1; } -static size_t free_sqmatrix(SCM obj) -{ - sqmatrix *pm = SQMATRIX(obj); - destroy_sqmatrix(*pm); - free(pm); - return 0; +static size_t free_sqmatrix(SCM obj) { + sqmatrix *pm = SQMATRIX(obj); + destroy_sqmatrix(*pm); + free(pm); + return 0; } #define mark_sqmatrix mark_null @@ -120,257 +113,233 @@ static size_t free_sqmatrix(SCM obj) /*************************************************************************/ /* return a Scheme object *copy* of m */ -SCM evectmatrix2scm(evectmatrix m) -{ - SCM obj; - evectmatrix *mp; - CHK_MALLOC(mp, evectmatrix, 1); - *mp = create_evectmatrix(m.N, m.c, m.p, m.localN, m.Nstart, m.allocN); - evectmatrix_copy(*mp, m); - NEWCELL_SMOB(obj, evectmatrix, mp); - return obj; +SCM evectmatrix2scm(evectmatrix m) { + SCM obj; + evectmatrix *mp; + CHK_MALLOC(mp, evectmatrix, 1); + *mp = create_evectmatrix(m.N, m.c, m.p, m.localN, m.Nstart, m.allocN); + evectmatrix_copy(*mp, m); + NEWCELL_SMOB(obj, evectmatrix, mp); + return obj; } /* return a Scheme object *copy* of the given columns of m */ -SCM evectmatrix_slice2scm(evectmatrix m, int p_start, int p) -{ - SCM obj; - evectmatrix *mp; - CHECK(p_start >= 0 && p_start + p <= m.p && p >= 0, - "invalid arguments in evectmatrix_slice2scm"); - CHK_MALLOC(mp, evectmatrix, 1); - *mp = create_evectmatrix(m.N, m.c, p, m.localN, m.Nstart, m.allocN); - evectmatrix_copy_slice(*mp, m, 0, p_start, p); - NEWCELL_SMOB(obj, evectmatrix, mp); - return obj; +SCM evectmatrix_slice2scm(evectmatrix m, int p_start, int p) { + SCM obj; + evectmatrix *mp; + CHECK(p_start >= 0 && p_start + p <= m.p && p >= 0, "invalid arguments in evectmatrix_slice2scm"); + CHK_MALLOC(mp, evectmatrix, 1); + *mp = create_evectmatrix(m.N, m.c, p, m.localN, m.Nstart, m.allocN); + evectmatrix_copy_slice(*mp, m, 0, p_start, p); + NEWCELL_SMOB(obj, evectmatrix, mp); + return obj; } -SCM sqmatrix2scm(sqmatrix m) -{ - SCM obj; - sqmatrix *mp; - CHK_MALLOC(mp, sqmatrix, 1); - *mp = create_sqmatrix(m.p); - sqmatrix_copy(*mp, m); - NEWCELL_SMOB(obj, sqmatrix, mp); - return obj; +SCM sqmatrix2scm(sqmatrix m) { + SCM obj; + sqmatrix *mp; + CHK_MALLOC(mp, sqmatrix, 1); + *mp = create_sqmatrix(m.p); + sqmatrix_copy(*mp, m); + NEWCELL_SMOB(obj, sqmatrix, mp); + return obj; } /*************************************************************************/ -void register_matrix_smobs(void) -{ +void register_matrix_smobs(void) { #ifdef HAVE_SCM_MAKE_SMOB_TYPE - scm_tc16_smob_evectmatrix = scm_make_smob_type("evectmatrix", 0); - scm_set_smob_free(scm_tc16_smob_evectmatrix, free_evectmatrix); - scm_set_smob_print(scm_tc16_smob_evectmatrix, print_evectmatrix); + scm_tc16_smob_evectmatrix = scm_make_smob_type("evectmatrix", 0); + scm_set_smob_free(scm_tc16_smob_evectmatrix, free_evectmatrix); + scm_set_smob_print(scm_tc16_smob_evectmatrix, print_evectmatrix); - scm_tc16_smob_sqmatrix = scm_make_smob_type("sqmatrix", 0); - scm_set_smob_free(scm_tc16_smob_sqmatrix, free_sqmatrix); - scm_set_smob_print(scm_tc16_smob_sqmatrix, print_sqmatrix); + scm_tc16_smob_sqmatrix = scm_make_smob_type("sqmatrix", 0); + scm_set_smob_free(scm_tc16_smob_sqmatrix, free_sqmatrix); + scm_set_smob_print(scm_tc16_smob_sqmatrix, print_sqmatrix); #else /* old way to register smobs */ - MAKE_SMOBFUNS(evectmatrix); - MAKE_SMOBFUNS(sqmatrix); - REGISTER_SMOBFUNS(evectmatrix); - REGISTER_SMOBFUNS(sqmatrix); + MAKE_SMOBFUNS(evectmatrix); + MAKE_SMOBFUNS(sqmatrix); + REGISTER_SMOBFUNS(evectmatrix); + REGISTER_SMOBFUNS(sqmatrix); #endif - gh_new_procedure("evectmatrix?", evectmatrix_p, 1, 0, 0); - gh_new_procedure("sqmatrix?", sqmatrix_p, 1, 0, 0); + gh_new_procedure("evectmatrix?", evectmatrix_p, 1, 0, 0); + gh_new_procedure("sqmatrix?", sqmatrix_p, 1, 0, 0); } /*************************************************************************/ -sqmatrix *assert_sqmatrix_smob(SCM mo) -{ - sqmatrix *m = SAFE_SQMATRIX(mo); - CHECK(m, "wrong type argument: expecting sqmatrix"); - return m; +sqmatrix *assert_sqmatrix_smob(SCM mo) { + sqmatrix *m = SAFE_SQMATRIX(mo); + CHECK(m, "wrong type argument: expecting sqmatrix"); + return m; } -evectmatrix *assert_evectmatrix_smob(SCM mo) -{ - evectmatrix *m = SAFE_EVECTMATRIX(mo); - CHECK(m, "wrong type argument: expecting evectmatrix"); - return m; +evectmatrix *assert_evectmatrix_smob(SCM mo) { + evectmatrix *m = SAFE_EVECTMATRIX(mo); + CHECK(m, "wrong type argument: expecting evectmatrix"); + return m; } /*************************************************************************/ -integer sqmatrix_size(SCM mo) -{ - sqmatrix *m = assert_sqmatrix_smob(mo); - return m->p; +integer sqmatrix_size(SCM mo) { + sqmatrix *m = assert_sqmatrix_smob(mo); + return m->p; } -cnumber sqmatrix_ref(SCM mo, integer i, integer j) -{ - sqmatrix *m = assert_sqmatrix_smob(mo); - cnumber c; - CHECK(m && i >= 0 && j >= 0 && i < m->p && j < m->p, - "invalid arguments to sqmatrix-ref"); - c.re = SCALAR_RE(m->data[i * m->p + j]); - c.im = SCALAR_IM(m->data[i * m->p + j]); - scm_remember_upto_here_1(mo); - return c; +cnumber sqmatrix_ref(SCM mo, integer i, integer j) { + sqmatrix *m = assert_sqmatrix_smob(mo); + cnumber c; + CHECK(m && i >= 0 && j >= 0 && i < m->p && j < m->p, "invalid arguments to sqmatrix-ref"); + c.re = SCALAR_RE(m->data[i * m->p + j]); + c.im = SCALAR_IM(m->data[i * m->p + j]); + scm_remember_upto_here_1(mo); + return c; } -SCM sqmatrix_mult(SCM Ao, SCM Bo) -{ - sqmatrix *A = assert_sqmatrix_smob(Ao); - sqmatrix *B = assert_sqmatrix_smob(Bo); - sqmatrix C; - SCM obj; - CHECK(A->p == B->p, "only equal-size matrices can be multiplied"); - C = create_sqmatrix(A->p); - sqmatrix_AeBC(C, *A, 0, *B, 0); - obj = sqmatrix2scm(C); - destroy_sqmatrix(C); - scm_remember_upto_here_2(Ao,Bo); - return obj; +SCM sqmatrix_mult(SCM Ao, SCM Bo) { + sqmatrix *A = assert_sqmatrix_smob(Ao); + sqmatrix *B = assert_sqmatrix_smob(Bo); + sqmatrix C; + SCM obj; + CHECK(A->p == B->p, "only equal-size matrices can be multiplied"); + C = create_sqmatrix(A->p); + sqmatrix_AeBC(C, *A, 0, *B, 0); + obj = sqmatrix2scm(C); + destroy_sqmatrix(C); + scm_remember_upto_here_2(Ao, Bo); + return obj; } -SCM sqmatrix_diagm(cnumber_list d) -{ - int i, p = d.num_items; - SCM obj; - sqmatrix D = create_sqmatrix(p); - for (i = 0; i < p*p; ++i) - ASSIGN_ZERO(D.data[i]); - for (i = 0; i < p; ++i) - ASSIGN_SCALAR(D.data[i*p+i], d.items[i].re, d.items[i].im); - obj = sqmatrix2scm(D); - destroy_sqmatrix(D); - return obj; +SCM sqmatrix_diagm(cnumber_list d) { + int i, p = d.num_items; + SCM obj; + sqmatrix D = create_sqmatrix(p); + for (i = 0; i < p * p; ++i) + ASSIGN_ZERO(D.data[i]); + for (i = 0; i < p; ++i) + ASSIGN_SCALAR(D.data[i * p + i], d.items[i].re, d.items[i].im); + obj = sqmatrix2scm(D); + destroy_sqmatrix(D); + return obj; } -cnumber_list sqmatrix_eigvals(SCM Ao) -{ - sqmatrix *A = assert_sqmatrix_smob(Ao); - cnumber_list eigvals; - eigvals.num_items = A->p; - CHK_MALLOC(eigvals.items, cnumber, A->p); - sqmatrix_eigenvalues(*A, (scalar_complex *) eigvals.items); - scm_remember_upto_here_1(Ao); - return eigvals; +cnumber_list sqmatrix_eigvals(SCM Ao) { + sqmatrix *A = assert_sqmatrix_smob(Ao); + cnumber_list eigvals; + eigvals.num_items = A->p; + CHK_MALLOC(eigvals.items, cnumber, A->p); + sqmatrix_eigenvalues(*A, (scalar_complex *)eigvals.items); + scm_remember_upto_here_1(Ao); + return eigvals; } /*************************************************************************/ -SCM get_eigenvectors(integer b_start, integer num_bands) -{ - CHECK(mdata, "init-params must be called before get-eigenvectors"); +SCM get_eigenvectors(integer b_start, integer num_bands) { + CHECK(mdata, "init-params must be called before get-eigenvectors"); - return evectmatrix_slice2scm(H, b_start - 1, num_bands); + return evectmatrix_slice2scm(H, b_start - 1, num_bands); } -void set_eigenvectors(SCM mo, integer b_start) -{ - evectmatrix *m = assert_evectmatrix_smob(mo); - CHECK(mdata, "init-params must be called before set-eigenvectors"); +void set_eigenvectors(SCM mo, integer b_start) { + evectmatrix *m = assert_evectmatrix_smob(mo); + CHECK(mdata, "init-params must be called before set-eigenvectors"); - evectmatrix_copy_slice(H, *m, b_start - 1, 0, m->p); - curfield_reset(); - scm_remember_upto_here_1(mo); + evectmatrix_copy_slice(H, *m, b_start - 1, 0, m->p); + curfield_reset(); + scm_remember_upto_here_1(mo); } -SCM dot_eigenvectors(SCM mo, integer b_start) -{ - evectmatrix *m = assert_evectmatrix_smob(mo); - sqmatrix U; - SCM obj; - int final_band = b_start-1 + m->p; - - CHECK(mdata, "init-params must be called before dot-eigenvectors"); - CHECK(final_band <= num_bands, "not enough bands in dot-eigenvectors"); - - U = create_sqmatrix(m->p); - if (mdata->mu_inv == NULL) { - sqmatrix S = create_sqmatrix(m->p); - evectmatrix_XtY_slice(U, *m, H, 0, b_start - 1, m->p, S); - destroy_sqmatrix(S); - } - else { - /* ...we have to do this in blocks of eigensolver_block_size since - the work matrix W[0] may not have enough space to do it at once. */ - int ib; - sqmatrix S1 = create_sqmatrix(m->p); - sqmatrix S2 = create_sqmatrix(m->p); - - for (ib = b_start-1; ib < final_band; ib += W[0].alloc_p) { - if (ib + W[0].alloc_p > final_band) { - maxwell_set_num_bands(mdata, final_band - ib); - evectmatrix_resize(&W[0], final_band - ib, 0); - } - maxwell_compute_H_from_B(mdata, H, W[0], - (scalar_complex *) mdata->fft_data, - ib, 0, W[0].p); - - evectmatrix_XtY_slice2(U, *m, W[0], 0, 0, m->p, W[0].p, - ib-(b_start-1), S1, S2); - } - - /* Reset scratch matrix sizes: */ - evectmatrix_resize(&W[0], W[0].alloc_p, 0); - maxwell_set_num_bands(mdata, W[0].alloc_p); - - destroy_sqmatrix(S2); - destroy_sqmatrix(S1); - } - obj = sqmatrix2scm(U); - destroy_sqmatrix(U); - scm_remember_upto_here_1(mo); - return obj; +SCM dot_eigenvectors(SCM mo, integer b_start) { + evectmatrix *m = assert_evectmatrix_smob(mo); + sqmatrix U; + SCM obj; + int final_band = b_start - 1 + m->p; + + CHECK(mdata, "init-params must be called before dot-eigenvectors"); + CHECK(final_band <= num_bands, "not enough bands in dot-eigenvectors"); + + U = create_sqmatrix(m->p); + if (mdata->mu_inv == NULL) { + sqmatrix S = create_sqmatrix(m->p); + evectmatrix_XtY_slice(U, *m, H, 0, b_start - 1, m->p, S); + destroy_sqmatrix(S); + } + else { + /* ...we have to do this in blocks of eigensolver_block_size since + the work matrix W[0] may not have enough space to do it at once. */ + int ib; + sqmatrix S1 = create_sqmatrix(m->p); + sqmatrix S2 = create_sqmatrix(m->p); + + for (ib = b_start - 1; ib < final_band; ib += W[0].alloc_p) { + if (ib + W[0].alloc_p > final_band) { + maxwell_set_num_bands(mdata, final_band - ib); + evectmatrix_resize(&W[0], final_band - ib, 0); + } + maxwell_compute_H_from_B(mdata, H, W[0], (scalar_complex *)mdata->fft_data, ib, 0, W[0].p); + + evectmatrix_XtY_slice2(U, *m, W[0], 0, 0, m->p, W[0].p, ib - (b_start - 1), S1, S2); + } + + /* Reset scratch matrix sizes: */ + evectmatrix_resize(&W[0], W[0].alloc_p, 0); + maxwell_set_num_bands(mdata, W[0].alloc_p); + + destroy_sqmatrix(S2); + destroy_sqmatrix(S1); + } + obj = sqmatrix2scm(U); + destroy_sqmatrix(U); + scm_remember_upto_here_1(mo); + return obj; } -void scale_eigenvector(integer b, cnumber scale) -{ - scalar s; +void scale_eigenvector(integer b, cnumber scale) { + scalar s; - CHECK(mdata, "init-params must be called before scale-eigenvector"); - CHECK(b > 0 && b <= H.p, "invalid band number in scale-eigenvector"); + CHECK(mdata, "init-params must be called before scale-eigenvector"); + CHECK(b > 0 && b <= H.p, "invalid band number in scale-eigenvector"); #ifndef SCALAR_COMPLEX - CHECK(fabs(cnumber_im(scale) * cnumber_re(scale)) < 1e-14, - "scale-eigenvector must be called with real argument in mpbi"); + CHECK(fabs(cnumber_im(scale) * cnumber_re(scale)) < 1e-14, + "scale-eigenvector must be called with real argument in mpbi"); #endif - ASSIGN_SCALAR(s, cnumber_re(scale), cnumber_im(scale)); - blasglue_scal(H.n, s, H.data + b-1, H.p); - curfield_reset(); + ASSIGN_SCALAR(s, cnumber_re(scale), cnumber_im(scale)); + blasglue_scal(H.n, s, H.data + b - 1, H.p); + curfield_reset(); } -void output_eigenvectors(SCM mo, char *filename) -{ - evectmatrix *m = assert_evectmatrix_smob(mo); - evectmatrixio_writeall_raw(filename, *m); - curfield_reset(); - scm_remember_upto_here_1(mo); +void output_eigenvectors(SCM mo, char *filename) { + evectmatrix *m = assert_evectmatrix_smob(mo); + evectmatrixio_writeall_raw(filename, *m); + curfield_reset(); + scm_remember_upto_here_1(mo); } -SCM input_eigenvectors(char *filename, integer num_bands) -{ - SCM mo = get_eigenvectors(1, num_bands); - { - evectmatrix *m = assert_evectmatrix_smob(mo); - evectmatrixio_readall_raw(filename, *m); - } - scm_remember_upto_here_1(mo); - return mo; +SCM input_eigenvectors(char *filename, integer num_bands) { + SCM mo = get_eigenvectors(1, num_bands); + { + evectmatrix *m = assert_evectmatrix_smob(mo); + evectmatrixio_readall_raw(filename, *m); + } + scm_remember_upto_here_1(mo); + return mo; } -void save_eigenvectors(char *filename) -{ - CHECK(mdata, "init-params must be called before save-eigenvectors"); - printf("Saving eigenvectors to \"%s\"...\n", filename); - evectmatrixio_writeall_raw(filename, H); +void save_eigenvectors(char *filename) { + CHECK(mdata, "init-params must be called before save-eigenvectors"); + printf("Saving eigenvectors to \"%s\"...\n", filename); + evectmatrixio_writeall_raw(filename, H); } -void load_eigenvectors(char *filename) -{ - CHECK(mdata, "init-params must be called before load-eigenvectors"); - printf("Loading eigenvectors from \"%s\"...\n", filename); - evectmatrixio_readall_raw(filename, H); - curfield_reset(); +void load_eigenvectors(char *filename) { + CHECK(mdata, "init-params must be called before load-eigenvectors"); + printf("Loading eigenvectors from \"%s\"...\n", filename); + evectmatrixio_readall_raw(filename, H); + curfield_reset(); } /*************************************************************************/ diff --git a/mpb/medium.c b/mpb/medium.c index 818553e2..7b506e08 100644 --- a/mpb/medium.c +++ b/mpb/medium.c @@ -29,42 +29,33 @@ int no_size_x = 0, no_size_y = 0, no_size_z = 0; const int negative_mu_okp = 0; /* mu must always be > 0 */ -geom_box_tree geometry_tree = NULL; /* recursive tree of geometry - objects for fast searching */ +geom_box_tree geometry_tree = NULL; /* recursive tree of geometry + objects for fast searching */ /**************************************************************************/ typedef struct { - maxwell_dielectric_function epsilon_file_func; - void *epsilon_file_func_data; - maxwell_dielectric_function mu_file_func; - void *mu_file_func_data; + maxwell_dielectric_function epsilon_file_func; + void *epsilon_file_func_data; + maxwell_dielectric_function mu_file_func; + void *mu_file_func_data; } medium_func_data; -static material_type make_medium(double epsilon, double mu) -{ - material_type m; - m.which_subclass = MEDIUM; - CHK_MALLOC(m.subclass.medium_data, medium, 1); - m.subclass.medium_data->epsilon = epsilon; - m.subclass.medium_data->mu = mu; - return m; +static material_type make_medium(double epsilon, double mu) { + material_type m; + m.which_subclass = MEDIUM; + CHK_MALLOC(m.subclass.medium_data, medium, 1); + m.subclass.medium_data->epsilon = epsilon; + m.subclass.medium_data->mu = mu; + return m; } -static material_type make_epsilon(double epsilon) -{ - return make_medium(epsilon, 1.0); -} +static material_type make_epsilon(double epsilon) { return make_medium(epsilon, 1.0); } -static material_type make_mu(double mu) -{ - return make_medium(1.0, mu); -} +static material_type make_mu(double mu) { return make_medium(1.0, mu); } -static int variable_material(int which_subclass) -{ - return (which_subclass == MATERIAL_GRID || - which_subclass == MATERIAL_FUNCTION); +static int variable_material(int which_subclass) { + return (which_subclass == MATERIAL_GRID || which_subclass == MATERIAL_FUNCTION); } /**************************************************************************/ @@ -77,156 +68,131 @@ static int variable_material(int which_subclass) /**************************************************************************/ -static int material_has_mu(material_type m) -{ - if (m.which_subclass == MEDIUM) - return m.subclass.medium_data->mu != 1; - if (m.which_subclass == MEDIUM_ANISOTROPIC) { - medium_anisotropic *a = m.subclass.medium_anisotropic_data; - return (a->mu_diag.x != 1 || - a->mu_diag.y != 1 || - a->mu_diag.z != 1 || - a->mu_offdiag.x.re != 0 || - a->mu_offdiag.y.re != 0 || - a->mu_offdiag.z.re != 0 || - a->mu_offdiag.x.im + a->mu_offdiag_imag.x != 0 || - a->mu_offdiag.y.im + a->mu_offdiag_imag.y != 0 || - a->mu_offdiag.z.im + a->mu_offdiag_imag.z != 0); - } - if (m.which_subclass == MATERIAL_GRID) - return (m.subclass.material_grid_data->mu_min != 1 || - m.subclass.material_grid_data->mu_max != 1); - return 0; +static int material_has_mu(material_type m) { + if (m.which_subclass == MEDIUM) return m.subclass.medium_data->mu != 1; + if (m.which_subclass == MEDIUM_ANISOTROPIC) { + medium_anisotropic *a = m.subclass.medium_anisotropic_data; + return (a->mu_diag.x != 1 || a->mu_diag.y != 1 || a->mu_diag.z != 1 || + a->mu_offdiag.x.re != 0 || a->mu_offdiag.y.re != 0 || a->mu_offdiag.z.re != 0 || + a->mu_offdiag.x.im + a->mu_offdiag_imag.x != 0 || + a->mu_offdiag.y.im + a->mu_offdiag_imag.y != 0 || + a->mu_offdiag.z.im + a->mu_offdiag_imag.z != 0); + } + if (m.which_subclass == MATERIAL_GRID) + return (m.subclass.material_grid_data->mu_min != 1 || + m.subclass.material_grid_data->mu_max != 1); + return 0; } /* return true if we could potentially have mu != 1 */ -static int has_mu(medium_func_data *d) -{ - int i; - if (d->mu_file_func || force_mup || - material_has_mu(default_material)) - return 1; - for (i = 0; i < geometry.num_items; ++i) - if (material_has_mu(geometry.items[i].material)) - return 1; - return 0; +static int has_mu(medium_func_data *d) { + int i; + if (d->mu_file_func || force_mup || material_has_mu(default_material)) return 1; + for (i = 0; i < geometry.num_items; ++i) + if (material_has_mu(geometry.items[i].material)) return 1; + return 0; } /**************************************************************************/ -void reset_epsilon(void) -{ - medium_func_data d; - int mesh[3]; - - mesh[0] = mesh_size; - mesh[1] = (dimensions > 1) ? mesh_size : 1; - mesh[2] = (dimensions > 2) ? mesh_size : 1; - - get_epsilon_file_func(epsilon_input_file, - &d.epsilon_file_func, &d.epsilon_file_func_data); - get_epsilon_file_func(mu_input_file, - &d.mu_file_func, &d.mu_file_func_data); - mpi_one_printf("Initializing epsilon function...\n"); - set_maxwell_dielectric(mdata, mesh, R, G, - epsilon_func, mean_epsilon_func, &d); - if (has_mu(&d)) { - mpi_one_printf("Initializing mu function...\n"); - set_maxwell_mu(mdata, mesh, R, G, - mu_func, mean_mu_func, &d); - } - destroy_epsilon_file_func_data(d.epsilon_file_func_data); - destroy_epsilon_file_func_data(d.mu_file_func_data); +void reset_epsilon(void) { + medium_func_data d; + int mesh[3]; + + mesh[0] = mesh_size; + mesh[1] = (dimensions > 1) ? mesh_size : 1; + mesh[2] = (dimensions > 2) ? mesh_size : 1; + + get_epsilon_file_func(epsilon_input_file, &d.epsilon_file_func, &d.epsilon_file_func_data); + get_epsilon_file_func(mu_input_file, &d.mu_file_func, &d.mu_file_func_data); + mpi_one_printf("Initializing epsilon function...\n"); + set_maxwell_dielectric(mdata, mesh, R, G, epsilon_func, mean_epsilon_func, &d); + if (has_mu(&d)) { + mpi_one_printf("Initializing mu function...\n"); + set_maxwell_mu(mdata, mesh, R, G, mu_func, mean_mu_func, &d); + } + destroy_epsilon_file_func_data(d.epsilon_file_func_data); + destroy_epsilon_file_func_data(d.mu_file_func_data); } /* Initialize the dielectric function of the global mdata structure, along with other geometry data. Should be called from init-params, or in general when global input vars have been loaded and mdata allocated. */ -void init_epsilon(void) -{ - int i; - int tree_depth, tree_nobjects; - number no_size; - - no_size = 2.0 / ctl_get_number("infinity"); - - mpi_one_printf("Mesh size is %d.\n", mesh_size); - - no_size_x = geometry_lattice.size.x <= no_size; - no_size_y = geometry_lattice.size.y <= no_size || dimensions < 2; - no_size_z = geometry_lattice.size.z <= no_size || dimensions < 3; - - Rm.c0 = vector3_scale(no_size_x ? 1 : geometry_lattice.size.x, - geometry_lattice.basis.c0); - Rm.c1 = vector3_scale(no_size_y ? 1 : geometry_lattice.size.y, - geometry_lattice.basis.c1); - Rm.c2 = vector3_scale(no_size_z ? 1 : geometry_lattice.size.z, - geometry_lattice.basis.c2); - mpi_one_printf("Lattice vectors:\n"); - mpi_one_printf(" (%g, %g, %g)\n", Rm.c0.x, Rm.c0.y, Rm.c0.z); - mpi_one_printf(" (%g, %g, %g)\n", Rm.c1.x, Rm.c1.y, Rm.c1.z); - mpi_one_printf(" (%g, %g, %g)\n", Rm.c2.x, Rm.c2.y, Rm.c2.z); - Vol = fabs(matrix3x3_determinant(Rm)); - mpi_one_printf("Cell volume = %g\n", Vol); - - Gm = matrix3x3_inverse(matrix3x3_transpose(Rm)); - mpi_one_printf("Reciprocal lattice vectors (/ 2 pi):\n"); - mpi_one_printf(" (%g, %g, %g)\n", Gm.c0.x, Gm.c0.y, Gm.c0.z); - mpi_one_printf(" (%g, %g, %g)\n", Gm.c1.x, Gm.c1.y, Gm.c1.z); - mpi_one_printf(" (%g, %g, %g)\n", Gm.c2.x, Gm.c2.y, Gm.c2.z); - - if (eigensolver_nwork > MAX_NWORK) { - mpi_one_printf("(Reducing nwork = %d to maximum: %d.)\n", - eigensolver_nwork, MAX_NWORK); - eigensolver_nwork = MAX_NWORK; - } - - matrix3x3_to_arr(R, Rm); - matrix3x3_to_arr(G, Gm); - - /* we must do this to correct for a non-orthogonal lattice basis: */ - geom_fix_objects(); - - mpi_one_printf("Geometric objects:\n"); - if (mpi_is_master()) - for (i = 0; i < geometry.num_items; ++i) { - display_geometric_object_info(5, geometry.items[i]); - - if (geometry.items[i].material.which_subclass == MEDIUM) - printf("%*sepsilon = %g, mu = %g\n", - 5 + 5, "", - geometry.items[i].material. - subclass.medium_data->epsilon, - geometry.items[i].material. - subclass.medium_data->mu); - } - - destroy_geom_box_tree(geometry_tree); /* destroy any tree from - previous runs */ - { - geom_box b0; - b0.low = vector3_plus(geometry_center, - vector3_scale(-0.5, geometry_lattice.size)); - b0.high = vector3_plus(geometry_center, - vector3_scale(0.5, geometry_lattice.size)); - /* pad tree boundaries to allow for sub-pixel averaging */ - b0.low.x -= geometry_lattice.size.x / mdata->nx; - b0.low.y -= geometry_lattice.size.y / mdata->ny; - b0.low.z -= geometry_lattice.size.z / mdata->nz; - b0.high.x += geometry_lattice.size.x / mdata->nx; - b0.high.y += geometry_lattice.size.y / mdata->ny; - b0.high.z += geometry_lattice.size.z / mdata->nz; - geometry_tree = create_geom_box_tree0(geometry, b0); - } - if (verbose && mpi_is_master()) { - printf("Geometry object bounding box tree:\n"); - display_geom_box_tree(5, geometry_tree); - } - geom_box_tree_stats(geometry_tree, &tree_depth, &tree_nobjects); - mpi_one_printf("Geometric object tree has depth %d and %d object nodes" - " (vs. %d actual objects)\n", - tree_depth, tree_nobjects, geometry.num_items); - - reset_epsilon(); +void init_epsilon(void) { + int i; + int tree_depth, tree_nobjects; + number no_size; + + no_size = 2.0 / ctl_get_number("infinity"); + + mpi_one_printf("Mesh size is %d.\n", mesh_size); + + no_size_x = geometry_lattice.size.x <= no_size; + no_size_y = geometry_lattice.size.y <= no_size || dimensions < 2; + no_size_z = geometry_lattice.size.z <= no_size || dimensions < 3; + + Rm.c0 = vector3_scale(no_size_x ? 1 : geometry_lattice.size.x, geometry_lattice.basis.c0); + Rm.c1 = vector3_scale(no_size_y ? 1 : geometry_lattice.size.y, geometry_lattice.basis.c1); + Rm.c2 = vector3_scale(no_size_z ? 1 : geometry_lattice.size.z, geometry_lattice.basis.c2); + mpi_one_printf("Lattice vectors:\n"); + mpi_one_printf(" (%g, %g, %g)\n", Rm.c0.x, Rm.c0.y, Rm.c0.z); + mpi_one_printf(" (%g, %g, %g)\n", Rm.c1.x, Rm.c1.y, Rm.c1.z); + mpi_one_printf(" (%g, %g, %g)\n", Rm.c2.x, Rm.c2.y, Rm.c2.z); + Vol = fabs(matrix3x3_determinant(Rm)); + mpi_one_printf("Cell volume = %g\n", Vol); + + Gm = matrix3x3_inverse(matrix3x3_transpose(Rm)); + mpi_one_printf("Reciprocal lattice vectors (/ 2 pi):\n"); + mpi_one_printf(" (%g, %g, %g)\n", Gm.c0.x, Gm.c0.y, Gm.c0.z); + mpi_one_printf(" (%g, %g, %g)\n", Gm.c1.x, Gm.c1.y, Gm.c1.z); + mpi_one_printf(" (%g, %g, %g)\n", Gm.c2.x, Gm.c2.y, Gm.c2.z); + + if (eigensolver_nwork > MAX_NWORK) { + mpi_one_printf("(Reducing nwork = %d to maximum: %d.)\n", eigensolver_nwork, MAX_NWORK); + eigensolver_nwork = MAX_NWORK; + } + + matrix3x3_to_arr(R, Rm); + matrix3x3_to_arr(G, Gm); + + /* we must do this to correct for a non-orthogonal lattice basis: */ + geom_fix_objects(); + + mpi_one_printf("Geometric objects:\n"); + if (mpi_is_master()) + for (i = 0; i < geometry.num_items; ++i) { + display_geometric_object_info(5, geometry.items[i]); + + if (geometry.items[i].material.which_subclass == MEDIUM) + printf("%*sepsilon = %g, mu = %g\n", 5 + 5, "", + geometry.items[i].material.subclass.medium_data->epsilon, + geometry.items[i].material.subclass.medium_data->mu); + } + + destroy_geom_box_tree(geometry_tree); /* destroy any tree from + previous runs */ + { + geom_box b0; + b0.low = vector3_plus(geometry_center, vector3_scale(-0.5, geometry_lattice.size)); + b0.high = vector3_plus(geometry_center, vector3_scale(0.5, geometry_lattice.size)); + /* pad tree boundaries to allow for sub-pixel averaging */ + b0.low.x -= geometry_lattice.size.x / mdata->nx; + b0.low.y -= geometry_lattice.size.y / mdata->ny; + b0.low.z -= geometry_lattice.size.z / mdata->nz; + b0.high.x += geometry_lattice.size.x / mdata->nx; + b0.high.y += geometry_lattice.size.y / mdata->ny; + b0.high.z += geometry_lattice.size.z / mdata->nz; + geometry_tree = create_geom_box_tree0(geometry, b0); + } + if (verbose && mpi_is_master()) { + printf("Geometry object bounding box tree:\n"); + display_geom_box_tree(5, geometry_tree); + } + geom_box_tree_stats(geometry_tree, &tree_depth, &tree_nobjects); + mpi_one_printf("Geometric object tree has depth %d and %d object nodes" + " (vs. %d actual objects)\n", + tree_depth, tree_nobjects, geometry.num_items); + + reset_epsilon(); } diff --git a/mpb/mpb.c b/mpb/mpb.c index d8fc9594..3e04fbbf 100644 --- a/mpb/mpb.c +++ b/mpb/mpb.c @@ -55,13 +55,13 @@ #include "field-smob.h" #if defined(DEBUG) && defined(HAVE_FEENABLEEXCEPT) -# ifndef _GNU_SOURCE -# define _GNU_SOURCE 1 -# endif -# include -# if !HAVE_DECL_FEENABLEEXCEPT -int feenableexcept (int EXCEPTS); -# endif +#ifndef _GNU_SOURCE +#define _GNU_SOURCE 1 +#endif +#include +#if !HAVE_DECL_FEENABLEEXCEPT +int feenableexcept(int EXCEPTS); +#endif #endif /**************************************************************************/ @@ -71,78 +71,74 @@ int feenableexcept (int EXCEPTS); them to initialize MPI and OpenMP */ #if defined(SCALAR_SINGLE_PREC) -# define FFTW(x) fftwf_ ## x +#define FFTW(x) fftwf_##x #elif defined(SCALAR_LONG_DOUBLE_PREC) -# define FFTW(x) fftwl_ ## x +#define FFTW(x) fftwl_##x #else -# define FFTW(x) fftw_ ## x +#define FFTW(x) fftw_##x #endif #ifdef USE_OPENMP -# include -# include +#include +#include #endif -#if defined(HAVE_MPI) && (defined(HAVE_LIBFFTW3F_MPI) || defined(HAVE_LIBFFTW3L_MPI) || defined(HAVE_LIBFFTW3_MPI)) -# define HAVE_FFTW3_MPI -# include +#if defined(HAVE_MPI) && \ + (defined(HAVE_LIBFFTW3F_MPI) || defined(HAVE_LIBFFTW3L_MPI) || defined(HAVE_LIBFFTW3_MPI)) +#define HAVE_FFTW3_MPI +#include #endif -void ctl_start_hook(int *argc, char ***argv) -{ - MPI_Init(argc, argv); +void ctl_start_hook(int *argc, char ***argv) { + MPI_Init(argc, argv); #ifdef USE_OPENMP - { - char *senthread = getenv("OMP_NUM_THREADS"); - int i, nthread = senthread ? (atoi(senthread) > 0 - ? atoi(senthread) : 1) : 1; - for (i = 0; i < *argc; ++i) - if (!strncasecmp("--nthread=", (*argv)[i], 10)) { - int j; - CHECK((nthread=atoi((*argv)[i]+10)) > 0, - "invalid argument for --nthread=..."); - *argc -= 1; - for (j = i; j < *argc; ++j) - (*argv)[j] = (*argv)[j+1]; - } - omp_set_num_threads(nthread); - CHECK(FFTW(init_threads)(), "error initializing threaded FFTW"); - FFTW(plan_with_nthreads)(nthread); - } + { + char *senthread = getenv("OMP_NUM_THREADS"); + int i, nthread = senthread ? (atoi(senthread) > 0 ? atoi(senthread) : 1) : 1; + for (i = 0; i < *argc; ++i) + if (!strncasecmp("--nthread=", (*argv)[i], 10)) { + int j; + CHECK((nthread = atoi((*argv)[i] + 10)) > 0, "invalid argument for --nthread=..."); + *argc -= 1; + for (j = i; j < *argc; ++j) + (*argv)[j] = (*argv)[j + 1]; + } + omp_set_num_threads(nthread); + CHECK(FFTW(init_threads)(), "error initializing threaded FFTW"); + FFTW(plan_with_nthreads)(nthread); + } #endif #ifdef HAVE_FFTW3_MPI - FFTW(mpi_init)(); + FFTW(mpi_init)(); #endif #ifdef HAVE_LIBCTL_QUIET - { - extern int libctl_quiet; - libctl_quiet = !mpi_is_master(); - } + { + extern int libctl_quiet; + libctl_quiet = !mpi_is_master(); + } #endif - /* disable: Guile garbage collector sometimes triggers an fp exception */ + /* disable: Guile garbage collector sometimes triggers an fp exception */ #if defined(DEBUG) && defined(HAVE_FEENABLEEXCEPT) && 0 - feenableexcept(FE_INVALID | FE_OVERFLOW); /* crash on NaN/overflow */ + feenableexcept(FE_INVALID | FE_OVERFLOW); /* crash on NaN/overflow */ #endif } -void ctl_stop_hook(void) -{ +void ctl_stop_hook(void) { #ifdef HAVE_FFTW3_MPI - FFTW(mpi_cleanup)(); + FFTW(mpi_cleanup)(); #endif - MPI_Finalize(); + MPI_Finalize(); } /* The following is a hook function called from main() when initializing Guile, which can export any additional symbols to Guile: */ -void ctl_export_hook(void) -{ - register_matrix_smobs(); - register_field_smobs(); +void ctl_export_hook(void) { + register_matrix_smobs(); + register_field_smobs(); } /**************************************************************************/ @@ -150,59 +146,50 @@ void ctl_export_hook(void) /* Some Guile-callable functions so that ctl files can know something about MPI. */ -boolean mpi_is_masterp(void) -{ - return mpi_is_master(); -} +boolean mpi_is_masterp(void) { return mpi_is_master(); } -boolean using_mpip(void) -{ +boolean using_mpip(void) { #ifdef HAVE_MPI - return 1; + return 1; #else - return 0; + return 0; #endif } -integer mpi_num_procs(void) -{ - int num_procs; - MPI_Comm_size(mpb_comm, &num_procs); - return num_procs; +integer mpi_num_procs(void) { + int num_procs; + MPI_Comm_size(mpb_comm, &num_procs); + return num_procs; } -integer mpi_proc_index(void) -{ - int proc_num; - MPI_Comm_rank(mpb_comm, &proc_num); - return proc_num; +integer mpi_proc_index(void) { + int proc_num; + MPI_Comm_rank(mpb_comm, &proc_num); + return proc_num; } -number mpi_max(number num) -{ - double x = num, xmax; - mpi_allreduce(&x, &xmax, 1, double, MPI_DOUBLE, MPI_MAX, mpb_comm); - return xmax; +number mpi_max(number num) { + double x = num, xmax; + mpi_allreduce(&x, &xmax, 1, double, MPI_DOUBLE, MPI_MAX, mpb_comm); + return xmax; } /**************************************************************************/ /* expose some build info to guile */ -boolean has_hermitian_epsp() -{ +boolean has_hermitian_epsp() { #if WITH_HERMITIAN_EPSILON - return 1; + return 1; #else - return 0; + return 0; #endif } -boolean has_inversion_symp() -{ +boolean has_inversion_symp() { #if SCALAR_COMPLEX - return 0; + return 0; #else - return 1; + return 1; #endif } @@ -211,18 +198,16 @@ boolean has_inversion_symp() /* a couple of utilities to convert libctl data types to the data types of the eigensolver & maxwell routines: */ -void vector3_to_arr(real arr[3], vector3 v) -{ - arr[0] = v.x; - arr[1] = v.y; - arr[2] = v.z; +void vector3_to_arr(real arr[3], vector3 v) { + arr[0] = v.x; + arr[1] = v.y; + arr[2] = v.z; } -void matrix3x3_to_arr(real arr[3][3], matrix3x3 m) -{ - vector3_to_arr(arr[0], m.c0); - vector3_to_arr(arr[1], m.c1); - vector3_to_arr(arr[2], m.c2); +void matrix3x3_to_arr(real arr[3][3], matrix3x3 m) { + vector3_to_arr(arr[0], m.c0); + vector3_to_arr(arr[1], m.c1); + vector3_to_arr(arr[2], m.c2); } /**************************************************************************/ @@ -241,125 +226,108 @@ scalar_complex *curfield = NULL; int curfield_band; char curfield_type = '-'; -void curfield_reset(void) { curfield = NULL; curfield_type = '-'; } +void curfield_reset(void) { + curfield = NULL; + curfield_type = '-'; +} /* R[i]/G[i] are lattice/reciprocal-lattice vectors */ real R[3][3], G[3][3]; matrix3x3 Rm, Gm; /* same thing, but matrix3x3 */ -real Vol; /* computational cell volume = |det Rm| */ +real Vol; /* computational cell volume = |det Rm| */ /* index of current kpoint, for labeling output */ int kpoint_index = 0; /**************************************************************************/ -scalar_complex cnumber2cscalar(cnumber c) -{ - scalar_complex cs; - CASSIGN_SCALAR(cs, cnumber_re(c), cnumber_im(c)); - return cs; +scalar_complex cnumber2cscalar(cnumber c) { + scalar_complex cs; + CASSIGN_SCALAR(cs, cnumber_re(c), cnumber_im(c)); + return cs; } -cnumber cscalar2cnumber(scalar_complex cs) -{ - return make_cnumber(CSCALAR_RE(cs), CSCALAR_IM(cs)); -} +cnumber cscalar2cnumber(scalar_complex cs) { return make_cnumber(CSCALAR_RE(cs), CSCALAR_IM(cs)); } -cvector3 cscalar32cvector3(const scalar_complex *cs) -{ - cvector3 v; - v.x = cscalar2cnumber(cs[0]); - v.y = cscalar2cnumber(cs[1]); - v.z = cscalar2cnumber(cs[2]); - return v; +cvector3 cscalar32cvector3(const scalar_complex *cs) { + cvector3 v; + v.x = cscalar2cnumber(cs[0]); + v.y = cscalar2cnumber(cs[1]); + v.z = cscalar2cnumber(cs[2]); + return v; } -void cvector32cscalar3(scalar_complex *cs, cvector3 v) -{ - cs[0] = cnumber2cscalar(v.x); - cs[1] = cnumber2cscalar(v.y); - cs[2] = cnumber2cscalar(v.z); +void cvector32cscalar3(scalar_complex *cs, cvector3 v) { + cs[0] = cnumber2cscalar(v.x); + cs[1] = cnumber2cscalar(v.y); + cs[2] = cnumber2cscalar(v.z); } /**************************************************************************/ /* initialize the field to random numbers; should only be called after init-params. (Guile-callable.) */ -void randomize_fields(void) -{ - int i; - - if (!mdata) - return; - mpi_one_printf("Initializing fields to random numbers...\n"); - for (i = 0; i < H.n * H.p; ++i) { - ASSIGN_SCALAR(H.data[i], rand() * 1.0 / RAND_MAX, - rand() * 1.0 / RAND_MAX); - } +void randomize_fields(void) { + int i; + + if (!mdata) return; + mpi_one_printf("Initializing fields to random numbers...\n"); + for (i = 0; i < H.n * H.p; ++i) { + ASSIGN_SCALAR(H.data[i], rand() * 1.0 / RAND_MAX, rand() * 1.0 / RAND_MAX); + } } /**************************************************************************/ /* Guile-callable functions for getting/setting the kpoint index. */ -integer get_kpoint_index(void) -{ - return kpoint_index; -} +integer get_kpoint_index(void) { return kpoint_index; } -void set_kpoint_index(integer i) -{ - kpoint_index = i; -} +void set_kpoint_index(integer i) { kpoint_index = i; } /**************************************************************************/ /* return a string describing the current parity, used for frequency and filename prefixes */ -const char *parity_string(maxwell_data *d) -{ - static char s[128]; - strcpy(s, ""); - if (d->parity & EVEN_Z_PARITY) - strcat(s, (d->nz == 1) ? "te" : "zeven"); - else if (d->parity & ODD_Z_PARITY) - strcat(s, (d->nz == 1) ? "tm" : "zodd"); - if (d->parity & EVEN_Y_PARITY) - strcat(s, "yeven"); - else if (d->parity & ODD_Y_PARITY) - strcat(s, "yodd"); - return s; +const char *parity_string(maxwell_data *d) { + static char s[128]; + strcpy(s, ""); + if (d->parity & EVEN_Z_PARITY) + strcat(s, (d->nz == 1) ? "te" : "zeven"); + else if (d->parity & ODD_Z_PARITY) + strcat(s, (d->nz == 1) ? "tm" : "zodd"); + if (d->parity & EVEN_Y_PARITY) + strcat(s, "yeven"); + else if (d->parity & ODD_Y_PARITY) + strcat(s, "yodd"); + return s; } /* Set the current parity to solve for. (init-params should have - already been called. (Guile-callable; see mpb.scm.) + already been called. (Guile-callable; see mpb.scm.) p >= 0 means a bitwise OR of the various parity constants from maxwell.h (NO_PARITY, EVEN_Z_PARITY, etcetera). - p = -1 means the parity of the previous call, + p = -1 means the parity of the previous call, or NO_PARITY if this is the first call */ -void set_parity(integer p) -{ - static int last_p = -2; /* initialize to some non-value */ +void set_parity(integer p) { + static int last_p = -2; /* initialize to some non-value */ - if (!mdata) { - mpi_one_fprintf(stderr, - "init-params must be called before set-parity!\n"); - return; - } + if (!mdata) { + mpi_one_fprintf(stderr, "init-params must be called before set-parity!\n"); + return; + } - if (p == -1) - p = last_p < 0 ? NO_PARITY : last_p; + if (p == -1) p = last_p < 0 ? NO_PARITY : last_p; - set_maxwell_data_parity(mdata, p); - CHECK(mdata->parity == p, "k vector incompatible with parity"); - mpi_one_printf("Solving for band polarization: %s.\n", - parity_string(mdata)); + set_maxwell_data_parity(mdata, p); + CHECK(mdata->parity == p, "k vector incompatible with parity"); + mpi_one_printf("Solving for band polarization: %s.\n", parity_string(mdata)); - last_p = p; - set_kpoint_index(0); /* reset index */ + last_p = p; + set_kpoint_index(0); /* reset index */ } /**************************************************************************/ @@ -367,8 +335,8 @@ void set_parity(integer p) /* Guile-callable function: init-params, which initializes any data that we need for the eigenvalue calculation. When this function is called, the input variables (the geometry, etcetera) have already - been read into the global variables defined in ctl-io.h. - + been read into the global variables defined in ctl-io.h. + p is the parity to use for the coming calculation, although this can be changed by calling set-parity. p is interpreted in the same way as for set-parity. @@ -376,151 +344,133 @@ void set_parity(integer p) If reset_fields is false, then any fields from a previous run are retained if they are of the same dimensions. Otherwise, new fields are allocated and initialized to random numbers. */ -void init_params(integer p, boolean reset_fields) -{ - int i, local_N, N_start, alloc_N; - int nx, ny, nz; - int have_old_fields = 0; - int block_size; - - /* Output a bunch of stuff so that the user can see what we're - doing and what we've read in. */ - - mpi_one_printf("init-params: initializing eigensolver data\n"); +void init_params(integer p, boolean reset_fields) { + int i, local_N, N_start, alloc_N; + int nx, ny, nz; + int have_old_fields = 0; + int block_size; + + /* Output a bunch of stuff so that the user can see what we're + doing and what we've read in. */ + + mpi_one_printf("init-params: initializing eigensolver data\n"); #ifndef SCALAR_COMPLEX - mpi_one_printf(" -- assuming INVERSION SYMMETRY in the geometry.\n"); + mpi_one_printf(" -- assuming INVERSION SYMMETRY in the geometry.\n"); #endif - - mpi_one_printf("Computing %d bands with %e tolerance.\n", - num_bands, tolerance); - if (target_freq != 0.0) - mpi_one_printf("Target frequency is %g\n", target_freq); - - get_grid_size_n(&nx, &ny, &nz); - - { - int true_rank = nz > 1 ? 3 : (ny > 1 ? 2 : 1); - if (true_rank < dimensions) - dimensions = true_rank; - else if (true_rank > dimensions) { - mpi_one_fprintf(stderr, - "WARNING: rank of grid is > dimensions.\n" - " setting extra grid dims. to 1.\n"); - /* force extra dims to be 1: */ - if (dimensions <= 2) - nz = 1; - if (dimensions <= 1) - ny = 1; - } - } - - mpi_one_printf("Working in %d dimensions.\n", dimensions); - mpi_one_printf("Grid size is %d x %d x %d.\n", nx, ny, nz); - - if (eigensolver_block_size != 0 && eigensolver_block_size < num_bands) { - block_size = eigensolver_block_size; - if (block_size < 0) { - /* Guess a block_size near -block_size, chosen so that - all blocks are nearly equal in size: */ - block_size = (num_bands - block_size - 1) / (-block_size); - block_size = (num_bands + block_size - 1) / block_size; - } - mpi_one_printf("Solving for %d bands at a time.\n", block_size); - } - else - block_size = num_bands; - - if (mdata) { /* need to clean up from previous init_params call */ - if (nx == mdata->nx && ny == mdata->ny && nz == mdata->nz && - block_size == Hblock.alloc_p && num_bands == H.p && - eigensolver_nwork + (mdata->mu_inv!=NULL) == nwork_alloc) - have_old_fields = 1; /* don't need to reallocate */ - else { - destroy_evectmatrix(H); - for (i = 0; i < nwork_alloc; ++i) - destroy_evectmatrix(W[i]); - if (Hblock.data != H.data) - destroy_evectmatrix(Hblock); - if (muinvH.data != H.data) - destroy_evectmatrix(muinvH); - } - destroy_maxwell_target_data(mtdata); mtdata = NULL; - destroy_maxwell_data(mdata); mdata = NULL; - curfield_reset(); - } - else - srand(time(NULL)); /* init random seed for field initialization */ - - if (deterministicp) { /* check input variable "deterministic?" */ - /* seed should be the same for each run, although - it should be different for each process: */ - int rank; - MPI_Comm_rank(mpb_comm, &rank); - srand(314159 * (rank + 1)); - } - - mpi_one_printf("Creating Maxwell data...\n"); - mdata = create_maxwell_data(nx, ny, nz, &local_N, &N_start, &alloc_N, - block_size, NUM_FFT_BANDS); - CHECK(mdata, "NULL mdata"); - - if (target_freq != 0.0) - mtdata = create_maxwell_target_data(mdata, target_freq); - else - mtdata = NULL; - - init_epsilon(); - - if (!have_old_fields) { - mpi_one_printf("Allocating fields...\n"); - H = create_evectmatrix(nx * ny * nz, 2, num_bands, - local_N, N_start, alloc_N); - nwork_alloc = eigensolver_nwork + (mdata->mu_inv!=NULL); - for (i = 0; i < nwork_alloc; ++i) - W[i] = create_evectmatrix(nx * ny * nz, 2, block_size, - local_N, N_start, alloc_N); - if (block_size < num_bands) - Hblock = create_evectmatrix(nx * ny * nz, 2, block_size, - local_N, N_start, alloc_N); - else - Hblock = H; - if (using_mup() && block_size < num_bands) { - muinvH = create_evectmatrix(nx * ny * nz, 2, num_bands, - local_N, N_start, alloc_N); - } - else { - muinvH = H; - } - } - - mpi_one_printf("%d k-points:\n", k_points.num_items); - for (i = 0; i < k_points.num_items; ++i) - mpi_one_printf(" (%g,%g,%g)\n", k_points.items[i].x, - k_points.items[i].y, k_points.items[i].z); - - set_parity(p); - if (!have_old_fields || reset_fields) - randomize_fields(); - - { - int ierr = check_maxwell_dielectric(mdata, negative_epsilon_okp); - if (ierr == 1) - mpi_one_fprintf(stderr, - "ERROR: non positive-definite dielectric tensor\n"); - else if (ierr == 2) - mpi_one_fprintf(stderr, - "ERROR: dielectric tensor must not couple xy " - "plane with z direction for 2D TE/TM calculations\n"); - CHECK(!ierr, "invalid dielectric function\n"); - } - - evectmatrix_flops = eigensolver_flops; /* reset, if changed */ -} -boolean using_mup(void) -{ - return mdata && mdata->mu_inv != NULL; -} + mpi_one_printf("Computing %d bands with %e tolerance.\n", num_bands, tolerance); + if (target_freq != 0.0) mpi_one_printf("Target frequency is %g\n", target_freq); + + get_grid_size_n(&nx, &ny, &nz); + + { + int true_rank = nz > 1 ? 3 : (ny > 1 ? 2 : 1); + if (true_rank < dimensions) + dimensions = true_rank; + else if (true_rank > dimensions) { + mpi_one_fprintf(stderr, "WARNING: rank of grid is > dimensions.\n" + " setting extra grid dims. to 1.\n"); + /* force extra dims to be 1: */ + if (dimensions <= 2) nz = 1; + if (dimensions <= 1) ny = 1; + } + } + + mpi_one_printf("Working in %d dimensions.\n", dimensions); + mpi_one_printf("Grid size is %d x %d x %d.\n", nx, ny, nz); + + if (eigensolver_block_size != 0 && eigensolver_block_size < num_bands) { + block_size = eigensolver_block_size; + if (block_size < 0) { + /* Guess a block_size near -block_size, chosen so that + all blocks are nearly equal in size: */ + block_size = (num_bands - block_size - 1) / (-block_size); + block_size = (num_bands + block_size - 1) / block_size; + } + mpi_one_printf("Solving for %d bands at a time.\n", block_size); + } + else + block_size = num_bands; + + if (mdata) { /* need to clean up from previous init_params call */ + if (nx == mdata->nx && ny == mdata->ny && nz == mdata->nz && block_size == Hblock.alloc_p && + num_bands == H.p && eigensolver_nwork + (mdata->mu_inv != NULL) == nwork_alloc) + have_old_fields = 1; /* don't need to reallocate */ + else { + destroy_evectmatrix(H); + for (i = 0; i < nwork_alloc; ++i) + destroy_evectmatrix(W[i]); + if (Hblock.data != H.data) destroy_evectmatrix(Hblock); + if (muinvH.data != H.data) destroy_evectmatrix(muinvH); + } + destroy_maxwell_target_data(mtdata); + mtdata = NULL; + destroy_maxwell_data(mdata); + mdata = NULL; + curfield_reset(); + } + else + srand(time(NULL)); /* init random seed for field initialization */ + + if (deterministicp) { /* check input variable "deterministic?" */ + /* seed should be the same for each run, although + it should be different for each process: */ + int rank; + MPI_Comm_rank(mpb_comm, &rank); + srand(314159 * (rank + 1)); + } + + mpi_one_printf("Creating Maxwell data...\n"); + mdata = create_maxwell_data(nx, ny, nz, &local_N, &N_start, &alloc_N, block_size, NUM_FFT_BANDS); + CHECK(mdata, "NULL mdata"); + + if (target_freq != 0.0) + mtdata = create_maxwell_target_data(mdata, target_freq); + else + mtdata = NULL; + + init_epsilon(); + + if (!have_old_fields) { + mpi_one_printf("Allocating fields...\n"); + H = create_evectmatrix(nx * ny * nz, 2, num_bands, local_N, N_start, alloc_N); + nwork_alloc = eigensolver_nwork + (mdata->mu_inv != NULL); + for (i = 0; i < nwork_alloc; ++i) + W[i] = create_evectmatrix(nx * ny * nz, 2, block_size, local_N, N_start, alloc_N); + if (block_size < num_bands) + Hblock = create_evectmatrix(nx * ny * nz, 2, block_size, local_N, N_start, alloc_N); + else + Hblock = H; + if (using_mup() && block_size < num_bands) { + muinvH = create_evectmatrix(nx * ny * nz, 2, num_bands, local_N, N_start, alloc_N); + } + else { + muinvH = H; + } + } + + mpi_one_printf("%d k-points:\n", k_points.num_items); + for (i = 0; i < k_points.num_items; ++i) + mpi_one_printf(" (%g,%g,%g)\n", k_points.items[i].x, k_points.items[i].y, + k_points.items[i].z); + + set_parity(p); + if (!have_old_fields || reset_fields) randomize_fields(); + + { + int ierr = check_maxwell_dielectric(mdata, negative_epsilon_okp); + if (ierr == 1) + mpi_one_fprintf(stderr, "ERROR: non positive-definite dielectric tensor\n"); + else if (ierr == 2) + mpi_one_fprintf(stderr, "ERROR: dielectric tensor must not couple xy " + "plane with z direction for 2D TE/TM calculations\n"); + CHECK(!ierr, "invalid dielectric function\n"); + } + + evectmatrix_flops = eigensolver_flops; /* reset, if changed */ +} + +boolean using_mup(void) { return mdata && mdata->mu_inv != NULL; } /**************************************************************************/ @@ -530,321 +480,269 @@ boolean using_mup(void) commutes with the eigen-operator, of course, so all is well.) */ typedef struct { - evectmatrix Y; /* the vectors to orthogonalize against; Y must - itself be normalized (Yt B Y = 1) */ - evectmatrix BY; /* B * Y */ - int p; /* the number of columns of Y to orthogonalize against */ - scalar *S; /* a matrix for storing the dot products; should have - at least p * X.p elements (see below for X) */ - scalar *S2; /* a scratch matrix the same size as S */ + evectmatrix Y; /* the vectors to orthogonalize against; Y must + itself be normalized (Yt B Y = 1) */ + evectmatrix BY; /* B * Y */ + int p; /* the number of columns of Y to orthogonalize against */ + scalar *S; /* a matrix for storing the dot products; should have + at least p * X.p elements (see below for X) */ + scalar *S2; /* a scratch matrix the same size as S */ } deflation_data; -static void deflation_constraint(evectmatrix X, void *data) -{ - deflation_data *d = (deflation_data *) data; +static void deflation_constraint(evectmatrix X, void *data) { + deflation_data *d = (deflation_data *)data; - CHECK(X.n == d->BY.n && d->BY.p >= d->p && d->Y.p >= d->p, - "invalid dimensions"); + CHECK(X.n == d->BY.n && d->BY.p >= d->p && d->Y.p >= d->p, "invalid dimensions"); - /* compute (1 - Y (BY)t) X = (1 - Y Yt B) X - = projection of X so that Yt B X = 0 */ + /* compute (1 - Y (BY)t) X = (1 - Y Yt B) X + = projection of X so that Yt B X = 0 */ - /* (Sigh...call the BLAS functions directly since we are not - using all the columns of BY...evectmatrix is not set up for - this case.) */ + /* (Sigh...call the BLAS functions directly since we are not + using all the columns of BY...evectmatrix is not set up for + this case.) */ - /* compute S = Xt BY (i.e. all the dot products): */ - blasglue_gemm('C', 'N', X.p, d->p, X.n, - 1.0, X.data, X.p, d->BY.data, d->BY.p, 0.0, d->S2, d->p); - mpi_allreduce(d->S2, d->S, d->p * X.p * SCALAR_NUMVALS, - real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm); + /* compute S = Xt BY (i.e. all the dot products): */ + blasglue_gemm('C', 'N', X.p, d->p, X.n, 1.0, X.data, X.p, d->BY.data, d->BY.p, 0.0, d->S2, d->p); + mpi_allreduce(d->S2, d->S, d->p * X.p * SCALAR_NUMVALS, real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm); - /* compute X = X - Y*St = (1 - BY Yt B) X */ - blasglue_gemm('N', 'C', X.n, X.p, d->p, - -1.0, d->Y.data, d->Y.p, d->S, d->p, - 1.0, X.data, X.p); + /* compute X = X - Y*St = (1 - BY Yt B) X */ + blasglue_gemm('N', 'C', X.n, X.p, d->p, -1.0, d->Y.data, d->Y.p, d->S, d->p, 1.0, X.data, X.p); } /**************************************************************************/ /* Solve for the bands at a given k point. Must only be called after init_params! */ -void solve_kpoint(vector3 kvector) -{ - int i, total_iters = 0, ib, ib0; - real *eigvals; - real k[3]; - int flags; - deflation_data deflation; - int prev_parity; - - /* if we get too close to singular k==0 point, just set k=0 - to exploit our special handling of this k */ - if (vector3_norm(kvector) < 1e-10) - kvector.x = kvector.y = kvector.z = 0; - - mpi_one_printf("solve_kpoint (%g,%g,%g):\n", - kvector.x, kvector.y, kvector.z); - - curfield_reset(); - - if (num_bands == 0) { - mpi_one_printf(" num-bands is zero, not solving for any bands\n"); - return; - } - - if (!mdata) { - mpi_one_fprintf(stderr, - "init-params must be called before solve-kpoint!\n"); - return; - } - - /* if this is the first k point, print out a header line for - for the frequency grep data: */ - if (!kpoint_index && mpi_is_master()) { - printf("%sfreqs:, k index, k1, k2, k3, kmag/2pi", - parity_string(mdata)); - for (i = 0; i < num_bands; ++i) - printf(", %s%sband %d", - parity_string(mdata), - mdata->parity == NO_PARITY ? "" : " ", - i + 1); - printf("\n"); - } - - prev_parity = mdata->parity; - cur_kvector = kvector; - vector3_to_arr(k, kvector); - update_maxwell_data_k(mdata, k, G[0], G[1], G[2]); - CHECK(mdata->parity == prev_parity, - "k vector is incompatible with specified parity"); - - CHK_MALLOC(eigvals, real, num_bands); - - flags = eigensolver_flags; /* ctl file input variable */ - if (verbose) - flags |= EIGS_VERBOSE; - - /* constant (zero frequency) bands at k=0 are handled specially, - so remove them from the solutions for the eigensolver: */ - if (mdata->zero_k && !mtdata) { - int in, ip; - ib0 = maxwell_zero_k_num_const_bands(H, mdata); - for (in = 0; in < H.n; ++in) - for (ip = 0; ip < H.p - ib0; ++ip) - H.data[in * H.p + ip] = H.data[in * H.p + ip + ib0]; - evectmatrix_resize(&H, H.p - ib0, 1); - } - else - ib0 = 0; /* solve for all bands */ - - /* Set up deflation data: */ - if (muinvH.data != Hblock.data) { - deflation.Y = H; - deflation.BY = muinvH.data != H.data ? muinvH : H; - deflation.p = 0; - CHK_MALLOC(deflation.S, scalar, H.p * Hblock.p); - CHK_MALLOC(deflation.S2, scalar, H.p * Hblock.p); - } - - for (ib = ib0; ib < num_bands; ib += Hblock.alloc_p) { - evectconstraint_chain *constraints; - int num_iters; - - /* don't solve for too many bands if the block size doesn't divide - the number of bands: */ - if (ib + mdata->num_bands > num_bands) { - maxwell_set_num_bands(mdata, num_bands - ib); - for (i = 0; i < nwork_alloc; ++i) - evectmatrix_resize(&W[i], num_bands - ib, 0); - evectmatrix_resize(&Hblock, num_bands - ib, 0); - } - - mpi_one_printf("Solving for bands %d to %d...\n", - ib + 1, ib + Hblock.p); - - constraints = NULL; - constraints = evect_add_constraint(constraints, - maxwell_parity_constraint, - (void *) mdata); - - if (mdata->zero_k) - constraints = evect_add_constraint(constraints, - maxwell_zero_k_constraint, - (void *) mdata); - - if (Hblock.data != H.data) { /* initialize fields of block from H */ - int in, ip; - for (in = 0; in < Hblock.n; ++in) - for (ip = 0; ip < Hblock.p; ++ip) - Hblock.data[in * Hblock.p + ip] = - H.data[in * H.p + ip + (ib-ib0)]; - deflation.p = ib-ib0; - if (deflation.p > 0) { - if (deflation.BY.data != H.data) { - evectmatrix_resize(&deflation.BY, deflation.p, 0); - maxwell_muinv_operator(H, deflation.BY, (void *) mdata, - 1, deflation.BY); - } - constraints = evect_add_constraint(constraints, - deflation_constraint, - &deflation); - } - } - - if (mtdata) { /* solving for bands near a target frequency */ - CHECK(mdata->mu_inv==NULL, "targeted solver doesn't handle mu"); - if (eigensolver_davidsonp) - eigensolver_davidson( - Hblock, eigvals + ib, - maxwell_target_operator, (void *) mtdata, - simple_preconditionerp ? - maxwell_target_preconditioner : - maxwell_target_preconditioner2, - (void *) mtdata, - evectconstraint_chain_func, - (void *) constraints, - W, nwork_alloc, tolerance, &num_iters, flags, 0.0); - else - eigensolver(Hblock, eigvals + ib, - maxwell_target_operator, (void *) mtdata, - NULL, NULL, - simple_preconditionerp ? - maxwell_target_preconditioner : - maxwell_target_preconditioner2, - (void *) mtdata, - evectconstraint_chain_func, - (void *) constraints, - W, nwork_alloc, tolerance, &num_iters, flags); - - /* now, diagonalize the real Maxwell operator in the - solution subspace to get the true eigenvalues and - eigenvectors: */ - CHECK(nwork_alloc >= 2, "not enough workspace"); - eigensolver_get_eigenvals(Hblock, eigvals + ib, - maxwell_operator,mdata, W[0],W[1]); - } - else { - if (eigensolver_davidsonp) { - CHECK(mdata->mu_inv==NULL, "Davidson doesn't handle mu"); - eigensolver_davidson( - Hblock, eigvals + ib, - maxwell_operator, (void *) mdata, - simple_preconditionerp ? - maxwell_preconditioner : - maxwell_preconditioner2, - (void *) mdata, - evectconstraint_chain_func, - (void *) constraints, - W, nwork_alloc, tolerance, &num_iters, flags, 0.0); - } - else - eigensolver(Hblock, eigvals + ib, - maxwell_operator, (void *) mdata, - mdata->mu_inv ? maxwell_muinv_operator : NULL, - (void *) mdata, - simple_preconditionerp ? - maxwell_preconditioner : - maxwell_preconditioner2, - (void *) mdata, - evectconstraint_chain_func, - (void *) constraints, - W, nwork_alloc, tolerance, &num_iters, flags); - } - - if (Hblock.data != H.data) { /* save solutions of current block */ - int in, ip; - for (in = 0; in < Hblock.n; ++in) - for (ip = 0; ip < Hblock.p; ++ip) - H.data[in * H.p + ip + (ib-ib0)] = - Hblock.data[in * Hblock.p + ip]; - } - - evect_destroy_constraints(constraints); - - mpi_one_printf("Finished solving for bands %d to %d after " - "%d iterations.\n", ib + 1, ib + Hblock.p, num_iters); - total_iters += num_iters * Hblock.p; - } - - if (num_bands - ib0 > Hblock.alloc_p) - mpi_one_printf("Finished k-point with %g mean iterations/band.\n", - total_iters * 1.0 / num_bands); - - /* Manually put in constant (zero-frequency) solutions for k=0: */ - if (mdata->zero_k && !mtdata) { - int in, ip; - evectmatrix_resize(&H, H.alloc_p, 1); - for (in = 0; in < H.n; ++in) - for (ip = H.p - ib0 - 1; ip >= 0; --ip) - H.data[in * H.p + ip + ib0] = H.data[in * H.p + ip]; - maxwell_zero_k_set_const_bands(H, mdata); - for (ib = 0; ib < ib0; ++ib) - eigvals[ib] = 0; - } - - /* Reset scratch matrix sizes: */ - evectmatrix_resize(&Hblock, Hblock.alloc_p, 0); - for (i = 0; i < nwork_alloc; ++i) - evectmatrix_resize(&W[i], W[i].alloc_p, 0); - maxwell_set_num_bands(mdata, Hblock.alloc_p); - - /* Destroy deflation data: */ - if (H.data != Hblock.data) { - free(deflation.S2); - free(deflation.S); - } - - if (num_write_output_vars > 0) { - /* clean up from prev. call */ - destroy_output_vars(); - } - - CHK_MALLOC(parity, char, strlen(parity_string(mdata)) + 1); - parity = strcpy(parity, parity_string(mdata)); - - iterations = total_iters; /* iterations output variable */ - - /* create freqs array for storing frequencies in a Guile list */ - freqs.num_items = num_bands; - CHK_MALLOC(freqs.items, number, freqs.num_items); - - set_kpoint_index(kpoint_index + 1); - - mpi_one_printf("%sfreqs:, %d, %g, %g, %g, %g", - parity, - kpoint_index, (double)k[0], (double)k[1], (double)k[2], - vector3_norm(matrix3x3_vector3_mult(Gm, kvector))); - for (i = 0; i < num_bands; ++i) { - freqs.items[i] = - negative_epsilon_okp ? eigvals[i] : sqrt(eigvals[i]); - mpi_one_printf(", %g", freqs.items[i]); - } - mpi_one_printf("\n"); - - eigensolver_flops = evectmatrix_flops; - - free(eigvals); +void solve_kpoint(vector3 kvector) { + int i, total_iters = 0, ib, ib0; + real *eigvals; + real k[3]; + int flags; + deflation_data deflation; + int prev_parity; + + /* if we get too close to singular k==0 point, just set k=0 + to exploit our special handling of this k */ + if (vector3_norm(kvector) < 1e-10) kvector.x = kvector.y = kvector.z = 0; + + mpi_one_printf("solve_kpoint (%g,%g,%g):\n", kvector.x, kvector.y, kvector.z); + + curfield_reset(); + + if (num_bands == 0) { + mpi_one_printf(" num-bands is zero, not solving for any bands\n"); + return; + } + + if (!mdata) { + mpi_one_fprintf(stderr, "init-params must be called before solve-kpoint!\n"); + return; + } + + /* if this is the first k point, print out a header line for + for the frequency grep data: */ + if (!kpoint_index && mpi_is_master()) { + printf("%sfreqs:, k index, k1, k2, k3, kmag/2pi", parity_string(mdata)); + for (i = 0; i < num_bands; ++i) + printf(", %s%sband %d", parity_string(mdata), mdata->parity == NO_PARITY ? "" : " ", i + 1); + printf("\n"); + } + + prev_parity = mdata->parity; + cur_kvector = kvector; + vector3_to_arr(k, kvector); + update_maxwell_data_k(mdata, k, G[0], G[1], G[2]); + CHECK(mdata->parity == prev_parity, "k vector is incompatible with specified parity"); + + CHK_MALLOC(eigvals, real, num_bands); + + flags = eigensolver_flags; /* ctl file input variable */ + if (verbose) flags |= EIGS_VERBOSE; + + /* constant (zero frequency) bands at k=0 are handled specially, + so remove them from the solutions for the eigensolver: */ + if (mdata->zero_k && !mtdata) { + int in, ip; + ib0 = maxwell_zero_k_num_const_bands(H, mdata); + for (in = 0; in < H.n; ++in) + for (ip = 0; ip < H.p - ib0; ++ip) + H.data[in * H.p + ip] = H.data[in * H.p + ip + ib0]; + evectmatrix_resize(&H, H.p - ib0, 1); + } + else + ib0 = 0; /* solve for all bands */ + + /* Set up deflation data: */ + if (muinvH.data != Hblock.data) { + deflation.Y = H; + deflation.BY = muinvH.data != H.data ? muinvH : H; + deflation.p = 0; + CHK_MALLOC(deflation.S, scalar, H.p * Hblock.p); + CHK_MALLOC(deflation.S2, scalar, H.p * Hblock.p); + } + + for (ib = ib0; ib < num_bands; ib += Hblock.alloc_p) { + evectconstraint_chain *constraints; + int num_iters; + + /* don't solve for too many bands if the block size doesn't divide + the number of bands: */ + if (ib + mdata->num_bands > num_bands) { + maxwell_set_num_bands(mdata, num_bands - ib); + for (i = 0; i < nwork_alloc; ++i) + evectmatrix_resize(&W[i], num_bands - ib, 0); + evectmatrix_resize(&Hblock, num_bands - ib, 0); + } + + mpi_one_printf("Solving for bands %d to %d...\n", ib + 1, ib + Hblock.p); + + constraints = NULL; + constraints = evect_add_constraint(constraints, maxwell_parity_constraint, (void *)mdata); + + if (mdata->zero_k) + constraints = evect_add_constraint(constraints, maxwell_zero_k_constraint, (void *)mdata); + + if (Hblock.data != H.data) { /* initialize fields of block from H */ + int in, ip; + for (in = 0; in < Hblock.n; ++in) + for (ip = 0; ip < Hblock.p; ++ip) + Hblock.data[in * Hblock.p + ip] = H.data[in * H.p + ip + (ib - ib0)]; + deflation.p = ib - ib0; + if (deflation.p > 0) { + if (deflation.BY.data != H.data) { + evectmatrix_resize(&deflation.BY, deflation.p, 0); + maxwell_muinv_operator(H, deflation.BY, (void *)mdata, 1, deflation.BY); + } + constraints = evect_add_constraint(constraints, deflation_constraint, &deflation); + } + } + + if (mtdata) { /* solving for bands near a target frequency */ + CHECK(mdata->mu_inv == NULL, "targeted solver doesn't handle mu"); + if (eigensolver_davidsonp) + eigensolver_davidson(Hblock, eigvals + ib, maxwell_target_operator, (void *)mtdata, + simple_preconditionerp ? maxwell_target_preconditioner + : maxwell_target_preconditioner2, + (void *)mtdata, evectconstraint_chain_func, (void *)constraints, W, + nwork_alloc, tolerance, &num_iters, flags, 0.0); + else + eigensolver(Hblock, eigvals + ib, maxwell_target_operator, (void *)mtdata, NULL, NULL, + simple_preconditionerp ? maxwell_target_preconditioner + : maxwell_target_preconditioner2, + (void *)mtdata, evectconstraint_chain_func, (void *)constraints, W, nwork_alloc, + tolerance, &num_iters, flags); + + /* now, diagonalize the real Maxwell operator in the + solution subspace to get the true eigenvalues and + eigenvectors: */ + CHECK(nwork_alloc >= 2, "not enough workspace"); + eigensolver_get_eigenvals(Hblock, eigvals + ib, maxwell_operator, mdata, W[0], W[1]); + } + else { + if (eigensolver_davidsonp) { + CHECK(mdata->mu_inv == NULL, "Davidson doesn't handle mu"); + eigensolver_davidson(Hblock, eigvals + ib, maxwell_operator, (void *)mdata, + simple_preconditionerp ? maxwell_preconditioner + : maxwell_preconditioner2, + (void *)mdata, evectconstraint_chain_func, (void *)constraints, W, + nwork_alloc, tolerance, &num_iters, flags, 0.0); + } + else + eigensolver(Hblock, eigvals + ib, maxwell_operator, (void *)mdata, + mdata->mu_inv ? maxwell_muinv_operator : NULL, (void *)mdata, + simple_preconditionerp ? maxwell_preconditioner : maxwell_preconditioner2, + (void *)mdata, evectconstraint_chain_func, (void *)constraints, W, nwork_alloc, + tolerance, &num_iters, flags); + } + + if (Hblock.data != H.data) { /* save solutions of current block */ + int in, ip; + for (in = 0; in < Hblock.n; ++in) + for (ip = 0; ip < Hblock.p; ++ip) + H.data[in * H.p + ip + (ib - ib0)] = Hblock.data[in * Hblock.p + ip]; + } + + evect_destroy_constraints(constraints); + + mpi_one_printf("Finished solving for bands %d to %d after " + "%d iterations.\n", + ib + 1, ib + Hblock.p, num_iters); + total_iters += num_iters * Hblock.p; + } + + if (num_bands - ib0 > Hblock.alloc_p) + mpi_one_printf("Finished k-point with %g mean iterations/band.\n", + total_iters * 1.0 / num_bands); + + /* Manually put in constant (zero-frequency) solutions for k=0: */ + if (mdata->zero_k && !mtdata) { + int in, ip; + evectmatrix_resize(&H, H.alloc_p, 1); + for (in = 0; in < H.n; ++in) + for (ip = H.p - ib0 - 1; ip >= 0; --ip) + H.data[in * H.p + ip + ib0] = H.data[in * H.p + ip]; + maxwell_zero_k_set_const_bands(H, mdata); + for (ib = 0; ib < ib0; ++ib) + eigvals[ib] = 0; + } + + /* Reset scratch matrix sizes: */ + evectmatrix_resize(&Hblock, Hblock.alloc_p, 0); + for (i = 0; i < nwork_alloc; ++i) + evectmatrix_resize(&W[i], W[i].alloc_p, 0); + maxwell_set_num_bands(mdata, Hblock.alloc_p); + + /* Destroy deflation data: */ + if (H.data != Hblock.data) { + free(deflation.S2); + free(deflation.S); + } + + if (num_write_output_vars > 0) { + /* clean up from prev. call */ + destroy_output_vars(); + } + + CHK_MALLOC(parity, char, strlen(parity_string(mdata)) + 1); + parity = strcpy(parity, parity_string(mdata)); + + iterations = total_iters; /* iterations output variable */ + + /* create freqs array for storing frequencies in a Guile list */ + freqs.num_items = num_bands; + CHK_MALLOC(freqs.items, number, freqs.num_items); + + set_kpoint_index(kpoint_index + 1); + + mpi_one_printf("%sfreqs:, %d, %g, %g, %g, %g", parity, kpoint_index, (double)k[0], (double)k[1], + (double)k[2], vector3_norm(matrix3x3_vector3_mult(Gm, kvector))); + for (i = 0; i < num_bands; ++i) { + freqs.items[i] = negative_epsilon_okp ? eigvals[i] : sqrt(eigvals[i]); + mpi_one_printf(", %g", freqs.items[i]); + } + mpi_one_printf("\n"); + + eigensolver_flops = evectmatrix_flops; + + free(eigvals); } /**************************************************************************/ /* Return a list of the z/y parities, one for each band. */ -number_list compute_zparities(void) -{ - number_list z_parity; - z_parity.num_items = num_bands; - z_parity.items = maxwell_zparity(H, mdata); - return z_parity; +number_list compute_zparities(void) { + number_list z_parity; + z_parity.num_items = num_bands; + z_parity.items = maxwell_zparity(H, mdata); + return z_parity; } -number_list compute_yparities(void) -{ - number_list y_parity; - y_parity.num_items = num_bands; - y_parity.items = maxwell_yparity(H, mdata); - return y_parity; +number_list compute_yparities(void) { + number_list y_parity; + y_parity.num_items = num_bands; + y_parity.items = maxwell_yparity(H, mdata); + return y_parity; } /**************************************************************************/ @@ -853,159 +751,153 @@ number_list compute_yparities(void) the length of d is ignored). d is in the reciprocal lattice basis. Should only be called after solve_kpoint. Returns a list of the group velocities, one for each band, in units of c. */ -number_list compute_group_velocity_component(vector3 d) -{ - number_list group_v; - real *gv_scratch; - real u[3]; - int i, ib; - - group_v.num_items = 0; group_v.items = (number *) NULL; - - curfield_reset(); /* has the side effect of overwriting curfield scratch */ - - if (!mdata) { - mpi_one_fprintf(stderr, "init-params must be called first!\n"); - return group_v; - } - if (!kpoint_index) { - mpi_one_fprintf(stderr, "solve-kpoint must be called first!\n"); - return group_v; - } - - /* convert d to unit vector in Cartesian coords: */ - d = unit_vector3(matrix3x3_vector3_mult(Gm, d)); - u[0] = d.x; u[1] = d.y; u[2] = d.z; - - group_v.num_items = num_bands; - CHK_MALLOC(group_v.items, number, group_v.num_items); - CHK_MALLOC(gv_scratch, real, group_v.num_items * 2); - - /* now, compute group_v.items = diag Re : */ - - /* ...we have to do this in blocks of eigensolver_block_size since - the work matrix W[0] may not have enough space to do it all at once. */ - - for (ib = 0; ib < num_bands; ib += Hblock.alloc_p) { - if (ib + mdata->num_bands > num_bands) { - maxwell_set_num_bands(mdata, num_bands - ib); - evectmatrix_resize(&W[0], num_bands - ib, 0); - evectmatrix_resize(&Hblock, num_bands - ib, 0); - } - maxwell_compute_H_from_B(mdata, H, Hblock, - (scalar_complex *) mdata->fft_data, - ib, 0, Hblock.p); - maxwell_ucross_op(Hblock, W[0], mdata, u); - evectmatrix_XtY_diag_real(Hblock, W[0], gv_scratch, - gv_scratch + group_v.num_items); - { - int ip; - for (ip = 0; ip < Hblock.p; ++ip) - group_v.items[ib + ip] = gv_scratch[ip]; - } - } - - free(gv_scratch); - - /* Reset scratch matrix sizes: */ - evectmatrix_resize(&Hblock, Hblock.alloc_p, 0); - evectmatrix_resize(&W[0], W[0].alloc_p, 0); - maxwell_set_num_bands(mdata, Hblock.alloc_p); - - /* The group velocity is given by: - - grad_k(omega)*d = grad_k(omega^2)*d / 2*omega - = grad_k()*d / 2*omega - = Re / omega - - Note that our k is in units of 2*Pi/a, and omega is in - units of 2*Pi*c/a, so the result will be in units of c. */ - for (i = 0; i < num_bands; ++i) { - if (freqs.items[i] == 0) /* v is undefined in this case */ - group_v.items[i] = 0.0; /* just set to zero */ - else - group_v.items[i] /= - negative_epsilon_okp ? sqrt(fabs(freqs.items[i])) - : freqs.items[i]; - } - - return group_v; +number_list compute_group_velocity_component(vector3 d) { + number_list group_v; + real *gv_scratch; + real u[3]; + int i, ib; + + group_v.num_items = 0; + group_v.items = (number *)NULL; + + curfield_reset(); /* has the side effect of overwriting curfield scratch */ + + if (!mdata) { + mpi_one_fprintf(stderr, "init-params must be called first!\n"); + return group_v; + } + if (!kpoint_index) { + mpi_one_fprintf(stderr, "solve-kpoint must be called first!\n"); + return group_v; + } + + /* convert d to unit vector in Cartesian coords: */ + d = unit_vector3(matrix3x3_vector3_mult(Gm, d)); + u[0] = d.x; + u[1] = d.y; + u[2] = d.z; + + group_v.num_items = num_bands; + CHK_MALLOC(group_v.items, number, group_v.num_items); + CHK_MALLOC(gv_scratch, real, group_v.num_items * 2); + + /* now, compute group_v.items = diag Re : */ + + /* ...we have to do this in blocks of eigensolver_block_size since + the work matrix W[0] may not have enough space to do it all at once. */ + + for (ib = 0; ib < num_bands; ib += Hblock.alloc_p) { + if (ib + mdata->num_bands > num_bands) { + maxwell_set_num_bands(mdata, num_bands - ib); + evectmatrix_resize(&W[0], num_bands - ib, 0); + evectmatrix_resize(&Hblock, num_bands - ib, 0); + } + maxwell_compute_H_from_B(mdata, H, Hblock, (scalar_complex *)mdata->fft_data, ib, 0, Hblock.p); + maxwell_ucross_op(Hblock, W[0], mdata, u); + evectmatrix_XtY_diag_real(Hblock, W[0], gv_scratch, gv_scratch + group_v.num_items); + { + int ip; + for (ip = 0; ip < Hblock.p; ++ip) + group_v.items[ib + ip] = gv_scratch[ip]; + } + } + + free(gv_scratch); + + /* Reset scratch matrix sizes: */ + evectmatrix_resize(&Hblock, Hblock.alloc_p, 0); + evectmatrix_resize(&W[0], W[0].alloc_p, 0); + maxwell_set_num_bands(mdata, Hblock.alloc_p); + + /* The group velocity is given by: + + grad_k(omega)*d = grad_k(omega^2)*d / 2*omega + = grad_k()*d / 2*omega + = Re / omega + + Note that our k is in units of 2*Pi/a, and omega is in + units of 2*Pi*c/a, so the result will be in units of c. */ + for (i = 0; i < num_bands; ++i) { + if (freqs.items[i] == 0) /* v is undefined in this case */ + group_v.items[i] = 0.0; /* just set to zero */ + else + group_v.items[i] /= negative_epsilon_okp ? sqrt(fabs(freqs.items[i])) : freqs.items[i]; + } + + return group_v; } /* as above, but only computes for given band */ -number compute_1_group_velocity_component(vector3 d, integer b) -{ - real u[3]; - int ib = b - 1; - real group_v, scratch; - - curfield_reset(); - - if (!mdata) { - mpi_one_fprintf(stderr, "init-params must be called first!\n"); - return group_v; - } - if (!kpoint_index) { - mpi_one_fprintf(stderr, "solve-kpoint must be called first!\n"); - return group_v; - } - - /* convert d to unit vector in Cartesian coords: */ - d = unit_vector3(matrix3x3_vector3_mult(Gm, d)); - u[0] = d.x; u[1] = d.y; u[2] = d.z; - - evectmatrix_resize(&W[0], 1, 0); - CHECK(nwork_alloc > 1, "eigensolver-nwork is too small"); - evectmatrix_resize(&W[1], 1, 0); - - maxwell_compute_H_from_B(mdata, H, W[1], - (scalar_complex *) mdata->fft_data, - ib, 0, 1); - maxwell_ucross_op(W[1], W[0], mdata, u); - evectmatrix_XtY_diag_real(W[1], W[0], &group_v, &scratch); - - /* Reset scratch matrix sizes: */ - evectmatrix_resize(&W[1], W[1].alloc_p, 0); - evectmatrix_resize(&W[0], W[0].alloc_p, 0); - - if (freqs.items[ib] == 0) /* v is undefined in this case */ - group_v = 0.0; /* just set to zero */ - else - group_v /= negative_epsilon_okp ? sqrt(fabs(freqs.items[ib])) - : freqs.items[ib]; - - return group_v; +number compute_1_group_velocity_component(vector3 d, integer b) { + real u[3]; + int ib = b - 1; + real group_v, scratch; + + curfield_reset(); + + if (!mdata) { + mpi_one_fprintf(stderr, "init-params must be called first!\n"); + return group_v; + } + if (!kpoint_index) { + mpi_one_fprintf(stderr, "solve-kpoint must be called first!\n"); + return group_v; + } + + /* convert d to unit vector in Cartesian coords: */ + d = unit_vector3(matrix3x3_vector3_mult(Gm, d)); + u[0] = d.x; + u[1] = d.y; + u[2] = d.z; + + evectmatrix_resize(&W[0], 1, 0); + CHECK(nwork_alloc > 1, "eigensolver-nwork is too small"); + evectmatrix_resize(&W[1], 1, 0); + + maxwell_compute_H_from_B(mdata, H, W[1], (scalar_complex *)mdata->fft_data, ib, 0, 1); + maxwell_ucross_op(W[1], W[0], mdata, u); + evectmatrix_XtY_diag_real(W[1], W[0], &group_v, &scratch); + + /* Reset scratch matrix sizes: */ + evectmatrix_resize(&W[1], W[1].alloc_p, 0); + evectmatrix_resize(&W[0], W[0].alloc_p, 0); + + if (freqs.items[ib] == 0) /* v is undefined in this case */ + group_v = 0.0; /* just set to zero */ + else + group_v /= negative_epsilon_okp ? sqrt(fabs(freqs.items[ib])) : freqs.items[ib]; + + return group_v; } /* returns group velocity for band b, in Cartesian coordinates */ -vector3 compute_1_group_velocity(integer b) -{ - vector3 v, d; - matrix3x3 RmT = matrix3x3_transpose(Rm); - d.x = 1; d.y = d.z = 0; - v.x = compute_1_group_velocity_component(matrix3x3_vector3_mult(RmT,d),b); - d.y = 1; d.x = d.z = 0; - v.y = compute_1_group_velocity_component(matrix3x3_vector3_mult(RmT,d),b); - d.z = 1; d.y = d.x = 0; - v.z = compute_1_group_velocity_component(matrix3x3_vector3_mult(RmT,d),b); - return v; +vector3 compute_1_group_velocity(integer b) { + vector3 v, d; + matrix3x3 RmT = matrix3x3_transpose(Rm); + d.x = 1; + d.y = d.z = 0; + v.x = compute_1_group_velocity_component(matrix3x3_vector3_mult(RmT, d), b); + d.y = 1; + d.x = d.z = 0; + v.y = compute_1_group_velocity_component(matrix3x3_vector3_mult(RmT, d), b); + d.z = 1; + d.y = d.x = 0; + v.z = compute_1_group_velocity_component(matrix3x3_vector3_mult(RmT, d), b); + return v; } /* as above, but returns "group velocity" given by gradient of frequency with respect to k in reciprocal coords ... this is useful for band optimization. */ -vector3 compute_1_group_velocity_reciprocal(integer b) -{ - return matrix3x3_vector3_mult(matrix3x3_transpose(Gm), - compute_1_group_velocity(b)); +vector3 compute_1_group_velocity_reciprocal(integer b) { + return matrix3x3_vector3_mult(matrix3x3_transpose(Gm), compute_1_group_velocity(b)); } -vector3 get_dominant_planewave(integer band) -{ - double kdom[3]; - maxwell_dominant_planewave(mdata, H, band, kdom); - vector3 result = {kdom[0], kdom[1], kdom[2]}; - return result; +vector3 get_dominant_planewave(integer band) { + double kdom[3]; + maxwell_dominant_planewave(mdata, H, band, kdom); + vector3 result = {kdom[0], kdom[1], kdom[2]}; + return result; } /**************************************************************************/ diff --git a/mpb/mpb.h b/mpb/mpb.h index ce47e88e..af9d64b1 100644 --- a/mpb/mpb.h +++ b/mpb/mpb.h @@ -26,20 +26,19 @@ set when the user runs the program with --verbose */ extern int verbose; -#define MAX2(a,b) ((a) > (b) ? (a) : (b)) -#define MIN2(a,b) ((a) < (b) ? (a) : (b)) +#define MAX2(a, b) ((a) > (b) ? (a) : (b)) +#define MIN2(a, b) ((a) < (b) ? (a) : (b)) #define TWOPI 6.2831853071795864769252867665590057683943388 /**************************************************************************/ -extern void get_epsilon_file_func(const char *fname, - maxwell_dielectric_function *func, - void **func_data); +extern void get_epsilon_file_func(const char *fname, maxwell_dielectric_function *func, + void **func_data); extern void destroy_epsilon_file_func_data(void *func_data); -extern real linear_interpolate(real rx, real ry, real rz, - real *data, int nx, int ny, int nz, int stride); +extern real linear_interpolate(real rx, real ry, real rz, real *data, int nx, int ny, int nz, + int stride); /**************************************************************************/ @@ -65,7 +64,7 @@ extern void curfield_reset(void); /* R[i]/G[i] are lattice/reciprocal-lattice vectors */ extern real R[3][3], G[3][3]; extern matrix3x3 Rm, Gm; /* same thing, but matrix3x3 */ -extern real Vol; /* computational cell volume = |det Rm| */ +extern real Vol; /* computational cell volume = |det Rm| */ /* index of current kpoint, for labeling output */ extern int kpoint_index; @@ -97,15 +96,13 @@ extern void init_epsilon(void); typedef enum { U_MIN = 0, U_PROD = 1, U_SUM = 2 } material_grid_kinds; extern real material_grid_val(vector3 p, const material_grid *g); -extern double matgrid_val(vector3 p, geom_box_tree tp, int oi, - const material_grid *mg); +extern double matgrid_val(vector3 p, geom_box_tree tp, int oi, const material_grid *mg); material_grid *get_material_grids(geometric_object_list g, int *ngrids); int material_grids_ntot(const material_grid *grids, int ngrids); void material_grids_set(const double *u, material_grid *grids, int ngrids); void material_grids_get(double *u, const material_grid *grids, int ngrids); -void material_grids_addgradient(double *v, - double scalegrad, int band, - const material_grid *grids, int ngrids); +void material_grids_addgradient(double *v, double scalegrad, int band, const material_grid *grids, + int ngrids); /**************************************************************************/ diff --git a/mpb/my-smob.h b/mpb/my-smob.h index 12926cbf..fd0c9862 100644 --- a/mpb/my-smob.h +++ b/mpb/my-smob.h @@ -25,44 +25,41 @@ /* Thanks to Greg Badros for posting a Guile smob tutorial; see http://sources.redhat.com/ml/guile/1999-04/msg00107.html However, this way of creating smobs no longer works as of Guile 1.4. */ -#define REGISTER_SMOBFUNS(T) \ - do { scm_tc16_smob_ ## T = scm_newsmob(& T ## _smobfuns); } while (0) -#define MAKE_SMOBFUNS(T) \ - static scm_smobfuns T ## _smobfuns = { \ - &mark_ ## T, \ - &free_ ## T, \ - &print_ ## T, 0 } +#define REGISTER_SMOBFUNS(T) \ + do { \ + scm_tc16_smob_##T = scm_newsmob(&T##_smobfuns); \ + } while (0) +#define MAKE_SMOBFUNS(T) static scm_smobfuns T##_smobfuns = {&mark_##T, &free_##T, &print_##T, 0} #endif /* ! HAVE_SCM_MAKE_SMOB_TYPE */ #ifdef HAVE_SCM_NEWSMOB -# define NEWCELL_SMOB(ANSWER,T,PSMOB) \ - SCM_NEWSMOB(ANSWER, scm_tc16_smob_ ## T, PSMOB) +#define NEWCELL_SMOB(ANSWER, T, PSMOB) SCM_NEWSMOB(ANSWER, scm_tc16_smob_##T, PSMOB) #else -# define NEWCELL_SMOB(ANSWER,T,PSMOB) do { \ - SCM_NEWCELL((ANSWER)); \ - SCM_SETCAR((ANSWER),scm_tc16_smob_ ## T); \ - SCM_SETCDR((ANSWER),(SCM) (PSMOB)); \ - } while (0) +#define NEWCELL_SMOB(ANSWER, T, PSMOB) \ + do { \ + SCM_NEWCELL((ANSWER)); \ + SCM_SETCAR((ANSWER), scm_tc16_smob_##T); \ + SCM_SETCDR((ANSWER), (SCM)(PSMOB)); \ + } while (0) #endif /* T_SMOB_P(T, X) is true iff X is an instance of the T SMOB type */ #ifdef HAVE_SCM_SMOB_PREDICATE -# define T_SMOB_P(T, X) SCM_SMOB_PREDICATE(scm_tc16_smob_ ## T, X) +#define T_SMOB_P(T, X) SCM_SMOB_PREDICATE(scm_tc16_smob_##T, X) #else -# define T_SMOB_P(T, X) (SCM_NIMP (X) \ - && SCM_TYP16 (X) == (scm_tc16_smob_ ## T)) +#define T_SMOB_P(T, X) (SCM_NIMP(X) && SCM_TYP16(X) == (scm_tc16_smob_##T)) #endif /* T_SMOB(T, X) returns the T * with the guts of the X instance; it assumes X is a T SMOB instance, and could crash if it is not */ #ifdef HAVE_SCM_SMOB_DATA -# define T_SMOB(T, X) ((T *) SCM_SMOB_DATA(X)) +#define T_SMOB(T, X) ((T *)SCM_SMOB_DATA(X)) #else -# define T_SMOB(T, X) ((T *)SCM2PTR(SCM_CDR(X))) +#define T_SMOB(T, X) ((T *)SCM2PTR(SCM_CDR(X))) #endif /* Since T_SMOB(X) can be dangerous if X is not a T object, we also have a SAFE_T_SMOB macro: */ -#define SAFE_T_SMOB(T, X) (T_SMOB_P(T,X) ? T_SMOB(T,X) : NULL) +#define SAFE_T_SMOB(T, X) (T_SMOB_P(T, X) ? T_SMOB(T, X) : NULL) #endif /* SMOB_H */ diff --git a/src/matrices/blasglue.c b/src/matrices/blasglue.c index d03301f1..14db2587 100644 --- a/src/matrices/blasglue.c +++ b/src/matrices/blasglue.c @@ -50,26 +50,26 @@ to the routine name depending upon the type defined in scalar.h */ #ifdef SCALAR_COMPLEX -# ifdef SCALAR_SINGLE_PREC -# define F(x,X) F77_FUNC(c##x, C##X) -# else -# define F(x,X) F77_FUNC(z##x, Z##X) -# endif +#ifdef SCALAR_SINGLE_PREC +#define F(x, X) F77_FUNC(c##x, C##X) #else -# ifdef SCALAR_SINGLE_PREC -# define F(x,X) F77_FUNC(s##x, S##X) -# else -# define F(x,X) F77_FUNC(d##x, D##X) -# endif +#define F(x, X) F77_FUNC(z##x, Z##X) +#endif +#else +#ifdef SCALAR_SINGLE_PREC +#define F(x, X) F77_FUNC(s##x, S##X) +#else +#define F(x, X) F77_FUNC(d##x, D##X) +#endif #endif /* FR(x,X) is for functions where we always pass real arguments, even when SCALAR_COMPLEX is defined. */ #ifdef SCALAR_SINGLE_PREC -# define FR(x,X) F77_FUNC(s##x, S##X) +#define FR(x, X) F77_FUNC(s##x, S##X) #else -# define FR(x,X) F77_FUNC(d##x, D##X) +#define FR(x, X) F77_FUNC(d##x, D##X) #endif /*************************************************************************/ @@ -81,163 +81,139 @@ #ifdef __cplusplus extern "C" { -#endif /* __cplusplus */ - -extern void F(axpy,AXPY) (int *, scalar *, scalar *, int *, scalar *, int *); -extern void F(scal,SCAL) (int *, scalar *, scalar *, int *); -extern void F(copy,COPY) (int *, scalar *, int *, scalar *, int *); -extern scalar F(dotc,DOTC) (int *, scalar *, int *, scalar *, int *); -extern scalar F(dot,DOT) (int *, scalar *, int *, scalar *, int *); -extern void F(gemm,GEMM) (char *, char *, int *, int *, int *, - scalar *, scalar *, int *, scalar *, int *, - scalar *, scalar *, int *); -extern void F(herk,HERK) (char *, char *, int *, int *, - real *, scalar *, int *, - real *, scalar *, int *); -extern void F(syrk,SYRK) (char *, char *, int *, int *, - real *, scalar *, int *, - real *, scalar *, int *); -extern void F(potrf,POTRF) (char *, int *, scalar *, int *, int *); -extern void F(potri,POTRI) (char *, int *, scalar *, int *, int *); -extern void F(hetrf,HETRF) (char *, int *, scalar *, int *, - int *, scalar *, int *, int *); -extern void F(hetri,HETRI) (char *, int *, scalar *, int *, - int *, scalar *, int *); -extern void F(sytrf,SYTRF) (char *, int *, scalar *, int *, - int *, scalar *, int *, int *); -extern void F(sytri,SYTRI) (char *, int *, scalar *, int *, - int *, scalar *, int *); -extern void F(heev,HEEV) (char *, char *, int *, scalar *, int *, real *, - scalar *, int *, real *, int *); -extern void FR(syev,SYEV) (char *, char *, int *, real *, int *, real *, - real *, int *, int *); -extern void F(hegv,HEGV) (int *, char *, char *, int *, scalar *, int *, scalar *, int *, real *, scalar *, int *, real *, int *); -extern void FR(sygv,SYGV) (int *, char *, char *, int *, real *, int *, real *, int *, real *, real *, int *, int *); +#endif /* __cplusplus */ + +extern void F(axpy, AXPY)(int *, scalar *, scalar *, int *, scalar *, int *); +extern void F(scal, SCAL)(int *, scalar *, scalar *, int *); +extern void F(copy, COPY)(int *, scalar *, int *, scalar *, int *); +extern scalar F(dotc, DOTC)(int *, scalar *, int *, scalar *, int *); +extern scalar F(dot, DOT)(int *, scalar *, int *, scalar *, int *); +extern void F(gemm, GEMM)(char *, char *, int *, int *, int *, scalar *, scalar *, int *, scalar *, + int *, scalar *, scalar *, int *); +extern void F(herk, HERK)(char *, char *, int *, int *, real *, scalar *, int *, real *, scalar *, + int *); +extern void F(syrk, SYRK)(char *, char *, int *, int *, real *, scalar *, int *, real *, scalar *, + int *); +extern void F(potrf, POTRF)(char *, int *, scalar *, int *, int *); +extern void F(potri, POTRI)(char *, int *, scalar *, int *, int *); +extern void F(hetrf, HETRF)(char *, int *, scalar *, int *, int *, scalar *, int *, int *); +extern void F(hetri, HETRI)(char *, int *, scalar *, int *, int *, scalar *, int *); +extern void F(sytrf, SYTRF)(char *, int *, scalar *, int *, int *, scalar *, int *, int *); +extern void F(sytri, SYTRI)(char *, int *, scalar *, int *, int *, scalar *, int *); +extern void F(heev, HEEV)(char *, char *, int *, scalar *, int *, real *, scalar *, int *, real *, + int *); +extern void FR(syev, SYEV)(char *, char *, int *, real *, int *, real *, real *, int *, int *); +extern void F(hegv, HEGV)(int *, char *, char *, int *, scalar *, int *, scalar *, int *, real *, + scalar *, int *, real *, int *); +extern void FR(sygv, SYGV)(int *, char *, char *, int *, real *, int *, real *, int *, real *, + real *, int *, int *); #ifdef SCALAR_COMPLEX -extern void F(geev,GEEV) (char *jobvl, char *jobvr, int *n, - scalar *A, int *lda, scalar *w, - scalar *VL, int *ldvl, scalar *VR, int *ldvr, - scalar *work, int *lwork, real *rwork, int *info); +extern void F(geev, GEEV)(char *jobvl, char *jobvr, int *n, scalar *A, int *lda, scalar *w, + scalar *VL, int *ldvl, scalar *VR, int *ldvr, scalar *work, int *lwork, + real *rwork, int *info); #else -extern void FR(geev,GEEV) (char *jobvl, char *jobvr, int *n, - scalar *A, int *lda, real *wr, real *wi, - scalar *VL, int *ldvl, scalar *VR, int *ldvr, - scalar *work, int *lwork, int *info); +extern void FR(geev, GEEV)(char *jobvl, char *jobvr, int *n, scalar *A, int *lda, real *wr, + real *wi, scalar *VL, int *ldvl, scalar *VR, int *ldvr, scalar *work, + int *lwork, int *info); #endif #ifdef __cplusplus0 -} /* extern "C" */ -#endif /* __cplusplus */ +} /* extern "C" */ +#endif /* __cplusplus */ /*************************************************************************/ -void blasglue_axpy(int n, real a, scalar *x, int incx, scalar *y, int incy) -{ - scalar alpha; +void blasglue_axpy(int n, real a, scalar *x, int incx, scalar *y, int incy) { + scalar alpha; - ASSIGN_REAL(alpha, a); + ASSIGN_REAL(alpha, a); - F(axpy,AXPY) (&n, &alpha, x, &incx, y, &incy); + F(axpy, AXPY)(&n, &alpha, x, &incx, y, &incy); } -void blasglue_scal(int n, scalar a, scalar *x, int incx) -{ - F(scal,SCAL) (&n, &a, x, &incx); -} +void blasglue_scal(int n, scalar a, scalar *x, int incx) { F(scal, SCAL)(&n, &a, x, &incx); } -void blasglue_rscal(int n, real a, scalar *x, int incx) -{ - scalar alpha; +void blasglue_rscal(int n, real a, scalar *x, int incx) { + scalar alpha; - ASSIGN_REAL(alpha, a); + ASSIGN_REAL(alpha, a); - F(scal,SCAL) (&n, &alpha, x, &incx); + F(scal, SCAL)(&n, &alpha, x, &incx); } -void blasglue_copy(int n, scalar *x, int incx, scalar *y, int incy) -{ - F(copy,COPY) (&n, x, &incx, y, &incy); +void blasglue_copy(int n, scalar *x, int incx, scalar *y, int incy) { + F(copy, COPY)(&n, x, &incx, y, &incy); } -scalar blasglue_dotc(int n, scalar *x, int incx, scalar *y, int incy) -{ +scalar blasglue_dotc(int n, scalar *x, int incx, scalar *y, int incy) { #ifndef NO_FORTRAN_FUNCTIONS -# ifdef SCALAR_COMPLEX - return (F(dotc,DOTC) (&n, x, &incx, y, &incy)); -# else - return (F(dot,DOT) (&n, x, &incx, y, &incy)); -# endif +#ifdef SCALAR_COMPLEX + return (F(dotc, DOTC)(&n, x, &incx, y, &incy)); +#else + return (F(dot, DOT)(&n, x, &incx, y, &incy)); +#endif #else /* on some machines, return values from Fortran functions don't work */ - int i; - scalar sum = SCALAR_INIT_ZERO; - for (i = 0; i < n; ++i) { -# ifdef SCALAR_COMPLEX - real x_re = x[i*incx].re, x_im = x[i*incx].im; - real y_re = y[i*incy].re, y_im = y[i*incy].im; - /* the dot product is conj(x) * y: */ - sum.re += x_re * y_re + x_im * y_im; - sum.im += x_re * y_im - x_im * y_re; -# else - sum += x[i*incx] * y[i*incy]; -# endif - } - return sum; + int i; + scalar sum = SCALAR_INIT_ZERO; + for (i = 0; i < n; ++i) { +#ifdef SCALAR_COMPLEX + real x_re = x[i * incx].re, x_im = x[i * incx].im; + real y_re = y[i * incy].re, y_im = y[i * incy].im; + /* the dot product is conj(x) * y: */ + sum.re += x_re * y_re + x_im * y_im; + sum.im += x_re * y_im - x_im * y_re; +#else + sum += x[i * incx] * y[i * incy]; +#endif + } + return sum; #endif } -void blasglue_gemm(char transa, char transb, int m, int n, int k, - real a, scalar *A, int fdA, scalar *B, int fdB, - real b, scalar *C, int fdC) -{ - scalar alpha, beta; +void blasglue_gemm(char transa, char transb, int m, int n, int k, real a, scalar *A, int fdA, + scalar *B, int fdB, real b, scalar *C, int fdC) { + scalar alpha, beta; - if (m*n == 0) - return; + if (m * n == 0) return; - if (k == 0) { - int i, j; - for (i = 0; i < m; ++i) - for (j = 0; j < n; ++j) - ASSIGN_ZERO(C[i*fdC + j]); - return; - } + if (k == 0) { + int i, j; + for (i = 0; i < m; ++i) + for (j = 0; j < n; ++j) + ASSIGN_ZERO(C[i * fdC + j]); + return; + } - CHECK(A != C && B != C, "gemm output array must be distinct"); + CHECK(A != C && B != C, "gemm output array must be distinct"); - ASSIGN_REAL(alpha,a); - ASSIGN_REAL(beta,b); + ASSIGN_REAL(alpha, a); + ASSIGN_REAL(beta, b); - F(gemm,GEMM) (&transb, &transa, &n, &m, &k, - &alpha, B, &fdB, A, &fdA, &beta, C, &fdC); + F(gemm, GEMM)(&transb, &transa, &n, &m, &k, &alpha, B, &fdB, A, &fdA, &beta, C, &fdC); } -void blasglue_herk(char uplo, char trans, int n, int k, - real a, scalar *A, int fdA, - real b, scalar *C, int fdC) -{ - if (n == 0) - return; +void blasglue_herk(char uplo, char trans, int n, int k, real a, scalar *A, int fdA, real b, + scalar *C, int fdC) { + if (n == 0) return; - if (k == 0) { - int i, j; - for (i = 0; i < n; ++i) - for (j = 0; j < n; ++j) - ASSIGN_ZERO(C[i*fdC + j]); - return; - } + if (k == 0) { + int i, j; + for (i = 0; i < n; ++i) + for (j = 0; j < n; ++j) + ASSIGN_ZERO(C[i * fdC + j]); + return; + } - CHECK(A != C, "herk output array must be distinct"); + CHECK(A != C, "herk output array must be distinct"); - uplo = uplo == 'U' ? 'L' : 'U'; - trans = (trans == 'C' || trans == 'T') ? 'N' : 'C'; + uplo = uplo == 'U' ? 'L' : 'U'; + trans = (trans == 'C' || trans == 'T') ? 'N' : 'C'; #ifdef SCALAR_COMPLEX - F(herk,HERK) (&uplo, &trans, &n, &k, - &a, A, &fdA, &b, C, &fdC); + F(herk, HERK)(&uplo, &trans, &n, &k, &a, A, &fdA, &b, C, &fdC); #else - F(syrk,SYRK) (&uplo, &trans, &n, &k, - &a, A, &fdA, &b, C, &fdC); + F(syrk, SYRK)(&uplo, &trans, &n, &k, &a, A, &fdA, &b, C, &fdC); #endif } @@ -245,138 +221,124 @@ void blasglue_herk(char uplo, char trans, int n, int k, #ifndef NO_LAPACK -int lapackglue_potrf(char uplo, int n, scalar *A, int fdA) -{ - int info; +int lapackglue_potrf(char uplo, int n, scalar *A, int fdA) { + int info; - uplo = uplo == 'U' ? 'L' : 'U'; + uplo = uplo == 'U' ? 'L' : 'U'; - F(potrf,POTRF) (&uplo, &n, A, &fdA, &info); + F(potrf, POTRF)(&uplo, &n, A, &fdA, &info); - CHECK(info >= 0, "invalid argument in potrf"); - return (info == 0); + CHECK(info >= 0, "invalid argument in potrf"); + return (info == 0); } -int lapackglue_potri(char uplo, int n, scalar *A, int fdA) -{ - int info; +int lapackglue_potri(char uplo, int n, scalar *A, int fdA) { + int info; - uplo = uplo == 'U' ? 'L' : 'U'; + uplo = uplo == 'U' ? 'L' : 'U'; - F(potri,POTRI) (&uplo, &n, A, &fdA, &info); + F(potri, POTRI)(&uplo, &n, A, &fdA, &info); - CHECK(info >= 0, "invalid argument in potri"); - return (info == 0); + CHECK(info >= 0, "invalid argument in potri"); + return (info == 0); } -int lapackglue_hetrf(char uplo, int n, scalar *A, int fdA, - int *ipiv, scalar *work, int lwork) -{ - int info; +int lapackglue_hetrf(char uplo, int n, scalar *A, int fdA, int *ipiv, scalar *work, int lwork) { + int info; - uplo = uplo == 'U' ? 'L' : 'U'; + uplo = uplo == 'U' ? 'L' : 'U'; #ifdef SCALAR_COMPLEX - F(hetrf,HETRF) (&uplo, &n, A, &fdA, ipiv, work, &lwork, &info); + F(hetrf, HETRF)(&uplo, &n, A, &fdA, ipiv, work, &lwork, &info); #else - F(sytrf,SYTRF) (&uplo, &n, A, &fdA, ipiv, work, &lwork, &info); + F(sytrf, SYTRF)(&uplo, &n, A, &fdA, ipiv, work, &lwork, &info); #endif - CHECK(info >= 0, "invalid argument in hetrf"); - return (info == 0); + CHECK(info >= 0, "invalid argument in hetrf"); + return (info == 0); } -int lapackglue_hetri(char uplo, int n, scalar *A, int fdA, - int *ipiv, scalar *work) -{ - int info; +int lapackglue_hetri(char uplo, int n, scalar *A, int fdA, int *ipiv, scalar *work) { + int info; - uplo = uplo == 'U' ? 'L' : 'U'; + uplo = uplo == 'U' ? 'L' : 'U'; #ifdef SCALAR_COMPLEX - F(hetri,HETRI) (&uplo, &n, A, &fdA, ipiv, work, &info); + F(hetri, HETRI)(&uplo, &n, A, &fdA, ipiv, work, &info); #else - F(sytri,SYTRI) (&uplo, &n, A, &fdA, ipiv, work, &info); + F(sytri, SYTRI)(&uplo, &n, A, &fdA, ipiv, work, &info); #endif - CHECK(info >= 0, "invalid argument in hetri"); - return (info == 0); + CHECK(info >= 0, "invalid argument in hetri"); + return (info == 0); } -void lapackglue_heev(char jobz, char uplo, int n, scalar *A, int fdA, - real *w, scalar *work, int lwork, real *rwork) -{ - int info; +void lapackglue_heev(char jobz, char uplo, int n, scalar *A, int fdA, real *w, scalar *work, + int lwork, real *rwork) { + int info; - uplo = uplo == 'U' ? 'L' : 'U'; + uplo = uplo == 'U' ? 'L' : 'U'; #ifdef SCALAR_COMPLEX - F(heev,HEEV) (&jobz, &uplo, &n, A, &fdA, w, work, &lwork, rwork, &info); + F(heev, HEEV)(&jobz, &uplo, &n, A, &fdA, w, work, &lwork, rwork, &info); #else - (void) rwork; /* unused */ - F(syev,SYEV) (&jobz, &uplo, &n, A, &fdA, w, work, &lwork, &info); + (void)rwork; /* unused */ + F(syev, SYEV)(&jobz, &uplo, &n, A, &fdA, w, work, &lwork, &info); #endif - CHECK(info >= 0, "invalid argument in heev"); - CHECK(info <= 0, "failure to converge in heev"); + CHECK(info >= 0, "invalid argument in heev"); + CHECK(info <= 0, "failure to converge in heev"); } -void lapackglue_geev(char jobvl, char jobvr, int n, - scalar *A, int fdA, scalar_complex *w, - scalar *VL, int fdVL, scalar *VR, int fdVR, - scalar *work, int lwork, real *rwork) -{ - int info; +void lapackglue_geev(char jobvl, char jobvr, int n, scalar *A, int fdA, scalar_complex *w, + scalar *VL, int fdVL, scalar *VR, int fdVR, scalar *work, int lwork, + real *rwork) { + int info; #ifdef SCALAR_COMPLEX - F(geev,GEEV) (&jobvl, &jobvr, &n, A, &fdA, w, VL, &fdVL, VR, &fdVR, - work, &lwork, rwork, &info); + F(geev, GEEV)(&jobvl, &jobvr, &n, A, &fdA, w, VL, &fdVL, VR, &fdVR, work, &lwork, rwork, &info); #else - int i; - real *wr, *wi; - CHK_MALLOC(wr, real, 2*n); - wi = wr + n; - (void) rwork; /* unused */ - FR(geev,GEEV) (&jobvl, &jobvr, &n, A, &fdA, wr, wi, VL, &fdVL, VR, &fdVR, - work, &lwork, &info); - for (i = 0; i < n; ++i) - CASSIGN_SCALAR(w[i], wr[i], wi[i]); - free(wr); + int i; + real *wr, *wi; + CHK_MALLOC(wr, real, 2 * n); + wi = wr + n; + (void)rwork; /* unused */ + FR(geev, GEEV)(&jobvl, &jobvr, &n, A, &fdA, wr, wi, VL, &fdVL, VR, &fdVR, work, &lwork, &info); + for (i = 0; i < n; ++i) + CASSIGN_SCALAR(w[i], wr[i], wi[i]); + free(wr); #endif - CHECK(info >= 0, "invalid argument in geev"); - CHECK(info <= 0, "failure to converge in geev"); + CHECK(info >= 0, "invalid argument in geev"); + CHECK(info <= 0, "failure to converge in geev"); } -void lapackglue_hegv(int itype, char jobz, char uplo, int n, - scalar *A, int fdA, scalar *B, int fdB, - real *w, scalar *work, int lwork, real *rwork) -{ - int info; +void lapackglue_hegv(int itype, char jobz, char uplo, int n, scalar *A, int fdA, scalar *B, int fdB, + real *w, scalar *work, int lwork, real *rwork) { + int info; - uplo = uplo == 'U' ? 'L' : 'U'; + uplo = uplo == 'U' ? 'L' : 'U'; #ifdef SCALAR_COMPLEX - F(hegv,HEGV) (&itype, &jobz, &uplo, &n, A, &fdA, B, &fdB, w, work, &lwork, rwork, &info); + F(hegv, HEGV)(&itype, &jobz, &uplo, &n, A, &fdA, B, &fdB, w, work, &lwork, rwork, &info); #else - (void) rwork; /* unused */ - F(sygv,SYGV) (&itype, &jobz, &uplo, &n, A, &fdA, B, &fdB, w, work, &lwork, &info); + (void)rwork; /* unused */ + F(sygv, SYGV)(&itype, &jobz, &uplo, &n, A, &fdA, B, &fdB, w, work, &lwork, &info); #endif - CHECK(info >= 0, "invalid argument in hegv"); - CHECK(info <= 0, "failure to converge in hegv"); + CHECK(info >= 0, "invalid argument in hegv"); + CHECK(info <= 0, "failure to converge in hegv"); } -void lapackglue_syev(char jobz, char uplo, int n, real *A, int fdA, - real *w, real *work, int lwork) -{ - int info; +void lapackglue_syev(char jobz, char uplo, int n, real *A, int fdA, real *w, real *work, + int lwork) { + int info; - uplo = uplo == 'U' ? 'L' : 'U'; + uplo = uplo == 'U' ? 'L' : 'U'; - FR(syev,SYEV) (&jobz, &uplo, &n, A, &fdA, w, work, &lwork, &info); + FR(syev, SYEV)(&jobz, &uplo, &n, A, &fdA, w, work, &lwork, &info); - CHECK(info >= 0, "invalid argument in heev"); - CHECK(info <= 0, "failure to converge in heev"); + CHECK(info >= 0, "invalid argument in heev"); + CHECK(info <= 0, "failure to converge in heev"); } #endif diff --git a/src/matrices/blasglue.h b/src/matrices/blasglue.h index 1105bd8d..c114f0b9 100644 --- a/src/matrices/blasglue.h +++ b/src/matrices/blasglue.h @@ -20,34 +20,28 @@ #include "scalar.h" -extern void blasglue_axpy(int n, real a, scalar *x, int incx, - scalar *y, int incy); +extern void blasglue_axpy(int n, real a, scalar *x, int incx, scalar *y, int incy); extern void blasglue_scal(int n, scalar a, scalar *x, int incx); extern void blasglue_rscal(int n, real a, scalar *x, int incx); extern void blasglue_copy(int n, scalar *x, int incx, scalar *y, int incy); extern scalar blasglue_dotc(int n, scalar *x, int incx, scalar *y, int incy); -void blasglue_gemm(char transa, char transb, int m, int n, int k, - real a, scalar *A, int fdA, scalar *B, int fdB, - real b, scalar *C, int fdC); -extern void blasglue_herk(char uplo, char trans, int n, int k, - real a, scalar *A, int fdA, - real b, scalar *C, int fdC); +void blasglue_gemm(char transa, char transb, int m, int n, int k, real a, scalar *A, int fdA, + scalar *B, int fdB, real b, scalar *C, int fdC); +extern void blasglue_herk(char uplo, char trans, int n, int k, real a, scalar *A, int fdA, real b, + scalar *C, int fdC); extern int lapackglue_potrf(char uplo, int n, scalar *A, int fdA); extern int lapackglue_potri(char uplo, int n, scalar *A, int fdA); -extern int lapackglue_hetrf(char uplo, int n, scalar *A, int fdA, - int *ipiv, scalar *work, int lwork); -extern int lapackglue_hetri(char uplo, int n, scalar *A, int fdA, - int *ipiv, scalar *work); -extern void lapackglue_heev(char jobz, char uplo, int n, scalar *A, int fdA, - real *w, scalar *work, int lwork, real *rwork); -extern void lapackglue_geev(char jobvl, char jobvr, int n, - scalar *A, int fdA, scalar_complex *w, - scalar *VL, int fdVL, scalar *VR, int fdVR, - scalar *work, int lwork, real *rwork); -extern void lapackglue_hegv(int itype, char jobz, char uplo, int n, - scalar *A, int fdA, scalar *B, int fdB, - real *w, scalar *work, int lwork, real *rwork); -extern void lapackglue_syev(char jobz, char uplo, int n, real *A, int fdA, - real *w, real *work, int lwork); +extern int lapackglue_hetrf(char uplo, int n, scalar *A, int fdA, int *ipiv, scalar *work, + int lwork); +extern int lapackglue_hetri(char uplo, int n, scalar *A, int fdA, int *ipiv, scalar *work); +extern void lapackglue_heev(char jobz, char uplo, int n, scalar *A, int fdA, real *w, scalar *work, + int lwork, real *rwork); +extern void lapackglue_geev(char jobvl, char jobvr, int n, scalar *A, int fdA, scalar_complex *w, + scalar *VL, int fdVL, scalar *VR, int fdVR, scalar *work, int lwork, + real *rwork); +extern void lapackglue_hegv(int itype, char jobz, char uplo, int n, scalar *A, int fdA, scalar *B, + int fdB, real *w, scalar *work, int lwork, real *rwork); +extern void lapackglue_syev(char jobz, char uplo, int n, real *A, int fdA, real *w, real *work, + int lwork); #endif /* BLASGLUE_H */ diff --git a/src/matrices/eigensolver.c b/src/matrices/eigensolver.c index 204b4ba9..eec0a8bc 100644 --- a/src/matrices/eigensolver.c +++ b/src/matrices/eigensolver.c @@ -31,36 +31,33 @@ #include "linmin.h" #include "verbosity.h" -extern void eigensolver_get_eigenvals_aux(evectmatrix Y, real *eigenvals, - evectoperator A, void *Adata, - evectmatrix Work1, evectmatrix Work2, - sqmatrix U, sqmatrix Usqrt, - sqmatrix Uwork); +extern void eigensolver_get_eigenvals_aux(evectmatrix Y, real *eigenvals, evectoperator A, + void *Adata, evectmatrix Work1, evectmatrix Work2, + sqmatrix U, sqmatrix Usqrt, sqmatrix Uwork); #define STRINGIZEx(x) #x /* a hack so that we can stringize macro values */ #define STRINGIZE(x) STRINGIZEx(x) #define K_PI 3.141592653589793238462643383279502884197 -#define MIN2(a,b) ((a) < (b) ? (a) : (b)) -#define MAX2(a,b) ((a) > (b) ? (a) : (b)) +#define MIN2(a, b) ((a) < (b) ? (a) : (b)) +#define MAX2(a, b) ((a) > (b) ? (a) : (b)) #if defined(SCALAR_LONG_DOUBLE_PREC) -# define fabs fabsl -# define cos cosl -# define sin sinl -# define sqrt sqrtl -# define atan atanl -# define atan2 atan2l +#define fabs fabsl +#define cos cosl +#define sin sinl +#define sqrt sqrtl +#define atan atanl +#define atan2 atan2l #endif /* Evalutate op, and set t to the elapsed time (in seconds). */ -#define TIME_OP(t, op) { \ - mpiglue_clock_t xxx_time_op_start_time = MPIGLUE_CLOCK; \ - { \ - op; \ - } \ - (t) = MPIGLUE_CLOCK_DIFF(MPIGLUE_CLOCK, xxx_time_op_start_time); \ -} +#define TIME_OP(t, op) \ + { \ + mpiglue_clock_t xxx_time_op_start_time = MPIGLUE_CLOCK; \ + { op; } \ + (t) = MPIGLUE_CLOCK_DIFF(MPIGLUE_CLOCK, xxx_time_op_start_time); \ + } /**************************************************************************/ @@ -81,11 +78,11 @@ extern void eigensolver_get_eigenvals_aux(evectmatrix Y, real *eigenvals, /* estimated times/iteration for different iteration schemes, based on the measure times for various operations and the operation counts: */ -#define EXACT_LINMIN_TIME(t_AZ, t_KZ, t_ZtW, t_ZS, t_ZtZ, t_linmin) \ - ((t_AZ)*2 + (t_KZ) + (t_ZtW)*4 + (t_ZS)*2 + (t_ZtZ)*2 + (t_linmin)) +#define EXACT_LINMIN_TIME(t_AZ, t_KZ, t_ZtW, t_ZS, t_ZtZ, t_linmin) \ + ((t_AZ)*2 + (t_KZ) + (t_ZtW)*4 + (t_ZS)*2 + (t_ZtZ)*2 + (t_linmin)) -#define APPROX_LINMIN_TIME(t_AZ, t_KZ, t_ZtW, t_ZS, t_ZtZ) \ - ((t_AZ)*2 + (t_KZ) + (t_ZtW)*2 + (t_ZS)*2 + (t_ZtZ)*2) +#define APPROX_LINMIN_TIME(t_AZ, t_KZ, t_ZtW, t_ZS, t_ZtZ) \ + ((t_AZ)*2 + (t_KZ) + (t_ZtW)*2 + (t_ZS)*2 + (t_ZtZ)*2) /* Guess for the convergence slowdown factor due to the approximate line minimization. It is probably best to be conservative, as the @@ -105,77 +102,76 @@ extern void eigensolver_get_eigenvals_aux(evectmatrix Y, real *eigenvals, /**************************************************************************/ typedef struct { - sqmatrix YtAY, DtAD, symYtAD, YtBY, DtBD, symYtBD, S1, S2, S3; - real lag, d_lag, trace_YtLY, trace_DtLD, trace_YtLD; + sqmatrix YtAY, DtAD, symYtAD, YtBY, DtBD, symYtBD, S1, S2, S3; + real lag, d_lag, trace_YtLY, trace_DtLD, trace_YtLD; } trace_func_data; -static linmin_real trace_func(linmin_real theta, linmin_real *trace_deriv, void *data) -{ - linmin_real trace; - trace_func_data *d = (trace_func_data *) data; - linmin_real c = cos(theta), s = sin(theta); - - { - sqmatrix_copy(d->S1, d->YtBY); - sqmatrix_aApbB(c*c, d->S1, s*s, d->DtBD); - sqmatrix_ApaB(d->S1, 2*s*c, d->symYtBD); - if (!sqmatrix_invert(d->S1, 1, d->S2)) { - /* if c or s is small, we sometimes run into numerical - difficulties, and it is better to use the Taylor expansion */ - if (c < 1e-4 * s && c != 0) { - sqmatrix_copy(d->S1, d->DtBD); - CHECK(sqmatrix_invert(d->S1, 1, d->S2), "singular DtBD!"); - sqmatrix_AeBC(d->S2, d->S1, 0, d->symYtBD, 1); - sqmatrix_AeBC(d->S3, d->S2, 0, d->S1, 1); - sqmatrix_aApbB(1/(s*s), d->S1, -2*c/(s*s*s), d->S3); - } - else if (s < 1e-4 * c && s != 0) { - sqmatrix_copy(d->S1, d->YtBY); - CHECK(sqmatrix_invert(d->S1, 1, d->S2), "singular DtBD!"); - sqmatrix_AeBC(d->S2, d->S1, 0, d->symYtBD, 1); - sqmatrix_AeBC(d->S3, d->S2, 0, d->S1, 1); - sqmatrix_aApbB(1/(c*c), d->S1, -2*s/(c*c*c), d->S3); - } - else { - CHECK(0, "inexplicable singularity in linmin trace_func"); - } - } - - sqmatrix_copy(d->S2, d->YtAY); - sqmatrix_aApbB(c*c, d->S2, s*s, d->DtAD); - sqmatrix_ApaB(d->S2, 2*s*c, d->symYtAD); - - trace = SCALAR_RE(sqmatrix_traceAtB(d->S2, d->S1)) - + (c*c * d->trace_YtLY + s*s * d->trace_DtLD - + 2*s*c * d->trace_YtLD) * (c * d->lag + s * d->d_lag); - } - - if (trace_deriv) { - linmin_real c2 = cos(2*theta), s2 = sin(2*theta); - - sqmatrix_copy(d->S3, d->YtAY); - sqmatrix_ApaB(d->S3, -1.0, d->DtAD); - sqmatrix_aApbB(-0.5 * s2, d->S3, c2, d->symYtAD); - - *trace_deriv = SCALAR_RE(sqmatrix_traceAtB(d->S1, d->S3)); - - sqmatrix_AeBC(d->S3, d->S1, 0, d->S2, 1); - sqmatrix_AeBC(d->S2, d->S3, 0, d->S1, 1); - - sqmatrix_copy(d->S3, d->YtBY); - sqmatrix_ApaB(d->S3, -1.0, d->DtBD); - sqmatrix_aApbB(-0.5 * s2, d->S3, c2, d->symYtBD); - - *trace_deriv -= SCALAR_RE(sqmatrix_traceAtB(d->S2, d->S3)); - *trace_deriv *= 2; - - *trace_deriv += (-s2 * d->trace_YtLY + s2 * d->trace_DtLD - + 2*c2 * d->trace_YtLD) * (c * d->lag + s * d->d_lag); - *trace_deriv += (c*c * d->trace_YtLY + s*s * d->trace_DtLD - + 2*s*c * d->trace_YtLD) * (-s * d->lag + c * d->d_lag); - } - - return trace; +static linmin_real trace_func(linmin_real theta, linmin_real *trace_deriv, void *data) { + linmin_real trace; + trace_func_data *d = (trace_func_data *)data; + linmin_real c = cos(theta), s = sin(theta); + + { + sqmatrix_copy(d->S1, d->YtBY); + sqmatrix_aApbB(c * c, d->S1, s * s, d->DtBD); + sqmatrix_ApaB(d->S1, 2 * s * c, d->symYtBD); + if (!sqmatrix_invert(d->S1, 1, d->S2)) { + /* if c or s is small, we sometimes run into numerical + difficulties, and it is better to use the Taylor expansion */ + if (c < 1e-4 * s && c != 0) { + sqmatrix_copy(d->S1, d->DtBD); + CHECK(sqmatrix_invert(d->S1, 1, d->S2), "singular DtBD!"); + sqmatrix_AeBC(d->S2, d->S1, 0, d->symYtBD, 1); + sqmatrix_AeBC(d->S3, d->S2, 0, d->S1, 1); + sqmatrix_aApbB(1 / (s * s), d->S1, -2 * c / (s * s * s), d->S3); + } + else if (s < 1e-4 * c && s != 0) { + sqmatrix_copy(d->S1, d->YtBY); + CHECK(sqmatrix_invert(d->S1, 1, d->S2), "singular DtBD!"); + sqmatrix_AeBC(d->S2, d->S1, 0, d->symYtBD, 1); + sqmatrix_AeBC(d->S3, d->S2, 0, d->S1, 1); + sqmatrix_aApbB(1 / (c * c), d->S1, -2 * s / (c * c * c), d->S3); + } + else { + CHECK(0, "inexplicable singularity in linmin trace_func"); + } + } + + sqmatrix_copy(d->S2, d->YtAY); + sqmatrix_aApbB(c * c, d->S2, s * s, d->DtAD); + sqmatrix_ApaB(d->S2, 2 * s * c, d->symYtAD); + + trace = SCALAR_RE(sqmatrix_traceAtB(d->S2, d->S1)) + + (c * c * d->trace_YtLY + s * s * d->trace_DtLD + 2 * s * c * d->trace_YtLD) * + (c * d->lag + s * d->d_lag); + } + + if (trace_deriv) { + linmin_real c2 = cos(2 * theta), s2 = sin(2 * theta); + + sqmatrix_copy(d->S3, d->YtAY); + sqmatrix_ApaB(d->S3, -1.0, d->DtAD); + sqmatrix_aApbB(-0.5 * s2, d->S3, c2, d->symYtAD); + + *trace_deriv = SCALAR_RE(sqmatrix_traceAtB(d->S1, d->S3)); + + sqmatrix_AeBC(d->S3, d->S1, 0, d->S2, 1); + sqmatrix_AeBC(d->S2, d->S3, 0, d->S1, 1); + + sqmatrix_copy(d->S3, d->YtBY); + sqmatrix_ApaB(d->S3, -1.0, d->DtBD); + sqmatrix_aApbB(-0.5 * s2, d->S3, c2, d->symYtBD); + + *trace_deriv -= SCALAR_RE(sqmatrix_traceAtB(d->S2, d->S3)); + *trace_deriv *= 2; + + *trace_deriv += (-s2 * d->trace_YtLY + s2 * d->trace_DtLD + 2 * c2 * d->trace_YtLD) * + (c * d->lag + s * d->d_lag); + *trace_deriv += (c * c * d->trace_YtLY + s * s * d->trace_DtLD + 2 * s * c * d->trace_YtLD) * + (-s * d->lag + c * d->d_lag); + } + + return trace; } /**************************************************************************/ @@ -195,644 +191,607 @@ static linmin_real trace_func(linmin_real theta, linmin_real *trace_deriv, void the constraint(s). */ -void eigensolver_lagrange(evectmatrix Y, real *eigenvals, - evectoperator A, void *Adata, - evectoperator B, void *Bdata, - evectpreconditioner K, void *Kdata, - evectconstraint constraint, void *constraint_data, - evectoperator L, void *Ldata, real *lag, - evectmatrix Work[], int nWork, - real tolerance, int *num_iterations, - int flags) -{ - real convergence_history[EIG_HISTORY_SIZE]; - evectmatrix G, D, X, BY, prev_G, BD; - real g_lag = 0, d_lag = 0, prev_g_lag = 0; - short usingConjugateGradient = 0, use_polak_ribiere = 0, - use_linmin = 1; - real E, prev_E = 0.0; - real d_scale = 1.0; - real traceGtX, prev_traceGtX = 0.0; - real theta, prev_theta = 0.5; - int i, iteration = 0, num_emergency_restarts = 0; - mpiglue_clock_t prev_feedback_time; - real time_AZ, time_KZ=0, time_ZtZ, time_ZtW, time_ZS, time_linmin=0; - real linmin_improvement = 0; - sqmatrix YtAYU, DtAD, symYtAD, YtBY, U, DtBD, symYtBD, S1, S2, S3; - trace_func_data tfd; - - prev_feedback_time = MPIGLUE_CLOCK; +void eigensolver_lagrange(evectmatrix Y, real *eigenvals, evectoperator A, void *Adata, + evectoperator B, void *Bdata, evectpreconditioner K, void *Kdata, + evectconstraint constraint, void *constraint_data, evectoperator L, + void *Ldata, real *lag, evectmatrix Work[], int nWork, real tolerance, + int *num_iterations, int flags) { + real convergence_history[EIG_HISTORY_SIZE]; + evectmatrix G, D, X, BY, prev_G, BD; + real g_lag = 0, d_lag = 0, prev_g_lag = 0; + short usingConjugateGradient = 0, use_polak_ribiere = 0, use_linmin = 1; + real E, prev_E = 0.0; + real d_scale = 1.0; + real traceGtX, prev_traceGtX = 0.0; + real theta, prev_theta = 0.5; + int i, iteration = 0, num_emergency_restarts = 0; + mpiglue_clock_t prev_feedback_time; + real time_AZ, time_KZ = 0, time_ZtZ, time_ZtW, time_ZS, time_linmin = 0; + real linmin_improvement = 0; + sqmatrix YtAYU, DtAD, symYtAD, YtBY, U, DtBD, symYtBD, S1, S2, S3; + trace_func_data tfd; + + prev_feedback_time = MPIGLUE_CLOCK; #ifdef DEBUG - flags |= EIGS_VERBOSE; + flags |= EIGS_VERBOSE; #endif - CHECK(nWork >= 2, "not enough workspace"); - G = Work[0]; - X = Work[1]; - - if (B) { - CHECK(nWork >= 3, "not enough workspace for generalized problem"); - BY = Work[2]; - } - else - BY = Y; - - usingConjugateGradient = nWork >= 3 + (B != NULL); - if (usingConjugateGradient) { - D = Work[2 + (B != NULL)]; - for (i = 0; i < D.n * D.p; ++i) - ASSIGN_ZERO(D.data[i]); - } - else - D = X; - - BD = B ? BY : D; /* storage for B*D (re-use B*Y storage) */ - - use_polak_ribiere = nWork >= 4 + (B != NULL); - if (use_polak_ribiere) { - prev_G = Work[3 + (B != NULL)]; - for (i = 0; i < Y.n * Y.p; ++i) - ASSIGN_ZERO(prev_G.data[i]); - if (flags & EIGS_ORTHOGONAL_PRECONDITIONER) /* see below */ - fprintf(stderr, "WARNING: Polak-Ribiere may not work with the " - "orthogonal-preconditioner option.\n"); - } - else - prev_G = G; - - YtAYU = create_sqmatrix(Y.p); /* holds Yt A Y */ - DtAD = create_sqmatrix(Y.p); /* holds Dt A D */ - symYtAD = create_sqmatrix(Y.p); /* holds (Yt A D + Dt A Y) / 2 */ - YtBY = create_sqmatrix(Y.p); /* holds Yt B Y */ - U = create_sqmatrix(Y.p); /* holds 1 / (Yt Y) */ - DtBD = create_sqmatrix(Y.p); /* holds Dt B D */ - symYtBD = create_sqmatrix(Y.p); /* holds (Yt B D + Dt B Y) / 2 */ - - /* Notation note: "t" represents a dagger superscript, so - Yt represents adjoint(Y), or Y' in MATLAB syntax. */ - - /* scratch matrices: */ - S1 = create_sqmatrix(Y.p); - S2 = create_sqmatrix(Y.p); - S3 = create_sqmatrix(Y.p); - - tfd.YtAY = S1; tfd.DtAD = DtAD; tfd.symYtAD = symYtAD; - tfd.YtBY = YtBY; tfd.DtBD = DtBD; tfd.symYtBD = symYtBD; - tfd.S1 = YtAYU; tfd.S2 = S2; tfd.S3 = S3; - - restartY: - - if (flags & EIGS_ORTHONORMALIZE_FIRST_STEP) { - if (B) { - B(Y, BY, Bdata, 1, G); /* B*Y; G is scratch */ - evectmatrix_XtY(U, Y, BY, S2); - } - else { - evectmatrix_XtX(U, Y, S2); - } - sqmatrix_assert_hermitian(U); - CHECK(sqmatrix_invert(U, 1, S2), "non-independent initial Y"); - sqmatrix_sqrt(S1, U, S2); /* S1 = 1/sqrt(Yt*Y) */ - evectmatrix_XeYS(G, Y, S1, 1); /* G = orthonormalize Y */ - evectmatrix_copy(Y, G); - } - - for (i = 0; i < Y.p; ++i) - eigenvals[i] = 0.0; - - for (i = 0; i < EIG_HISTORY_SIZE; ++i) - convergence_history[i] = 10000.0; - - if (constraint) - constraint(Y, constraint_data); - - do { - real y_norm, gamma_numerator = 0; - - if (flags & EIGS_FORCE_APPROX_LINMIN) - use_linmin = 0; - - if (B) { - B(Y, BY, Bdata, 1, G); /* B*Y; G is scratch */ - TIME_OP(time_ZtZ, evectmatrix_XtY(YtBY, Y, BY, S2)); - } - else { - TIME_OP(time_ZtZ, evectmatrix_XtX(YtBY, Y, S2)); - } - sqmatrix_assert_hermitian(YtBY); - - y_norm = sqrt(SCALAR_RE(sqmatrix_trace(YtBY)) / Y.p); - blasglue_rscal(Y.p * Y.n, 1/y_norm, Y.data, 1); - if (B) blasglue_rscal(Y.p * Y.n, 1/y_norm, BY.data, 1); - blasglue_rscal(Y.p * Y.p, 1/(y_norm*y_norm), YtBY.data, 1); - - sqmatrix_copy(U, YtBY); - if (!sqmatrix_invert(U, 1, S2)) { /* non-independent Y columns */ - /* emergency restart with random Y */ - CHECK(iteration + 10 * ++num_emergency_restarts - < EIGENSOLVER_MAX_ITERATIONS, - "too many emergency restarts"); - if (mpb_verbosity >= 2) { - mpi_one_printf(" emergency randomization of Y on iter. %d\n", - iteration); - } - for (i = 0; i < Y.p * Y.n; ++i) - ASSIGN_SCALAR(Y.data[i], - rand() * 1.0 / RAND_MAX - 0.5, - rand() * 1.0 / RAND_MAX - 0.5); - goto restartY; - } - - /* If trace(1/YtBY) gets big, it means that the columns - of Y are becoming nearly parallel. This sometimes happens, - especially in the targeted eigensolver, because the - preconditioner pushes all the columns towards the ground - state. If it gets too big, it seems to be a good idea - to re-orthogonalize, resetting conjugate-gradient, as - otherwise we start to encounter numerical problems. */ - if (flags & EIGS_REORTHOGONALIZE) { - real traceU = SCALAR_RE(sqmatrix_trace(U)); - mpi_assert_equal(traceU); - if (traceU > EIGS_TRACE_U_THRESHOLD * U.p) { - if (mpb_verbosity >= 2) { - mpi_one_printf(" re-orthonormalizing Y\n"); - } - sqmatrix_sqrt(S1, U, S2); /* S1 = 1/sqrt(Yt*Y) */ - evectmatrix_XeYS(G, Y, S1, 1); /* G = orthonormalize Y */ - evectmatrix_copy(Y, G); - prev_traceGtX = 0.0; - if (B) { - B(Y, BY, Bdata, 1, G); /* B*Y; G is scratch */ - evectmatrix_XtY(YtBY, Y, BY, S2); - } - else - evectmatrix_XtX(YtBY, Y, S2); - y_norm = sqrt(SCALAR_RE(sqmatrix_trace(YtBY)) / Y.p); - blasglue_rscal(Y.p * Y.n, 1/y_norm, Y.data, 1); - if (B) blasglue_rscal(Y.p * Y.n, 1/y_norm, BY.data, 1); - blasglue_rscal(Y.p * Y.p, 1/(y_norm*y_norm), YtBY.data, 1); - sqmatrix_copy(U, YtBY); - CHECK(sqmatrix_invert(U, 1, S2), - "non-independent Y after re-orthogonalization"); - } - } - - TIME_OP(time_AZ, A(Y, X, Adata, 1, G)); /* X = AY; G is scratch */ + CHECK(nWork >= 2, "not enough workspace"); + G = Work[0]; + X = Work[1]; + + if (B) { + CHECK(nWork >= 3, "not enough workspace for generalized problem"); + BY = Work[2]; + } + else + BY = Y; + + usingConjugateGradient = nWork >= 3 + (B != NULL); + if (usingConjugateGradient) { + D = Work[2 + (B != NULL)]; + for (i = 0; i < D.n * D.p; ++i) + ASSIGN_ZERO(D.data[i]); + } + else + D = X; + + BD = B ? BY : D; /* storage for B*D (re-use B*Y storage) */ + + use_polak_ribiere = nWork >= 4 + (B != NULL); + if (use_polak_ribiere) { + prev_G = Work[3 + (B != NULL)]; + for (i = 0; i < Y.n * Y.p; ++i) + ASSIGN_ZERO(prev_G.data[i]); + if (flags & EIGS_ORTHOGONAL_PRECONDITIONER) /* see below */ + fprintf(stderr, "WARNING: Polak-Ribiere may not work with the " + "orthogonal-preconditioner option.\n"); + } + else + prev_G = G; + + YtAYU = create_sqmatrix(Y.p); /* holds Yt A Y */ + DtAD = create_sqmatrix(Y.p); /* holds Dt A D */ + symYtAD = create_sqmatrix(Y.p); /* holds (Yt A D + Dt A Y) / 2 */ + YtBY = create_sqmatrix(Y.p); /* holds Yt B Y */ + U = create_sqmatrix(Y.p); /* holds 1 / (Yt Y) */ + DtBD = create_sqmatrix(Y.p); /* holds Dt B D */ + symYtBD = create_sqmatrix(Y.p); /* holds (Yt B D + Dt B Y) / 2 */ + + /* Notation note: "t" represents a dagger superscript, so + Yt represents adjoint(Y), or Y' in MATLAB syntax. */ + + /* scratch matrices: */ + S1 = create_sqmatrix(Y.p); + S2 = create_sqmatrix(Y.p); + S3 = create_sqmatrix(Y.p); + + tfd.YtAY = S1; + tfd.DtAD = DtAD; + tfd.symYtAD = symYtAD; + tfd.YtBY = YtBY; + tfd.DtBD = DtBD; + tfd.symYtBD = symYtBD; + tfd.S1 = YtAYU; + tfd.S2 = S2; + tfd.S3 = S3; + +restartY: + + if (flags & EIGS_ORTHONORMALIZE_FIRST_STEP) { + if (B) { + B(Y, BY, Bdata, 1, G); /* B*Y; G is scratch */ + evectmatrix_XtY(U, Y, BY, S2); + } + else { + evectmatrix_XtX(U, Y, S2); + } + sqmatrix_assert_hermitian(U); + CHECK(sqmatrix_invert(U, 1, S2), "non-independent initial Y"); + sqmatrix_sqrt(S1, U, S2); /* S1 = 1/sqrt(Yt*Y) */ + evectmatrix_XeYS(G, Y, S1, 1); /* G = orthonormalize Y */ + evectmatrix_copy(Y, G); + } + + for (i = 0; i < Y.p; ++i) + eigenvals[i] = 0.0; + + for (i = 0; i < EIG_HISTORY_SIZE; ++i) + convergence_history[i] = 10000.0; + + if (constraint) constraint(Y, constraint_data); + + do { + real y_norm, gamma_numerator = 0; + + if (flags & EIGS_FORCE_APPROX_LINMIN) use_linmin = 0; + + if (B) { + B(Y, BY, Bdata, 1, G); /* B*Y; G is scratch */ + TIME_OP(time_ZtZ, evectmatrix_XtY(YtBY, Y, BY, S2)); + } + else { + TIME_OP(time_ZtZ, evectmatrix_XtX(YtBY, Y, S2)); + } + sqmatrix_assert_hermitian(YtBY); + + y_norm = sqrt(SCALAR_RE(sqmatrix_trace(YtBY)) / Y.p); + blasglue_rscal(Y.p * Y.n, 1 / y_norm, Y.data, 1); + if (B) blasglue_rscal(Y.p * Y.n, 1 / y_norm, BY.data, 1); + blasglue_rscal(Y.p * Y.p, 1 / (y_norm * y_norm), YtBY.data, 1); + + sqmatrix_copy(U, YtBY); + if (!sqmatrix_invert(U, 1, S2)) { /* non-independent Y columns */ + /* emergency restart with random Y */ + CHECK(iteration + 10 * ++num_emergency_restarts < EIGENSOLVER_MAX_ITERATIONS, + "too many emergency restarts"); + if (mpb_verbosity >= 2) { + mpi_one_printf(" emergency randomization of Y on iter. %d\n", iteration); + } + for (i = 0; i < Y.p * Y.n; ++i) + ASSIGN_SCALAR(Y.data[i], rand() * 1.0 / RAND_MAX - 0.5, rand() * 1.0 / RAND_MAX - 0.5); + goto restartY; + } + + /* If trace(1/YtBY) gets big, it means that the columns + of Y are becoming nearly parallel. This sometimes happens, + especially in the targeted eigensolver, because the + preconditioner pushes all the columns towards the ground + state. If it gets too big, it seems to be a good idea + to re-orthogonalize, resetting conjugate-gradient, as + otherwise we start to encounter numerical problems. */ + if (flags & EIGS_REORTHOGONALIZE) { + real traceU = SCALAR_RE(sqmatrix_trace(U)); + mpi_assert_equal(traceU); + if (traceU > EIGS_TRACE_U_THRESHOLD * U.p) { + if (mpb_verbosity >= 2) { mpi_one_printf(" re-orthonormalizing Y\n"); } + sqmatrix_sqrt(S1, U, S2); /* S1 = 1/sqrt(Yt*Y) */ + evectmatrix_XeYS(G, Y, S1, 1); /* G = orthonormalize Y */ + evectmatrix_copy(Y, G); + prev_traceGtX = 0.0; + if (B) { + B(Y, BY, Bdata, 1, G); /* B*Y; G is scratch */ + evectmatrix_XtY(YtBY, Y, BY, S2); + } + else + evectmatrix_XtX(YtBY, Y, S2); + y_norm = sqrt(SCALAR_RE(sqmatrix_trace(YtBY)) / Y.p); + blasglue_rscal(Y.p * Y.n, 1 / y_norm, Y.data, 1); + if (B) blasglue_rscal(Y.p * Y.n, 1 / y_norm, BY.data, 1); + blasglue_rscal(Y.p * Y.p, 1 / (y_norm * y_norm), YtBY.data, 1); + sqmatrix_copy(U, YtBY); + CHECK(sqmatrix_invert(U, 1, S2), "non-independent Y after re-orthogonalization"); + } + } + + TIME_OP(time_AZ, A(Y, X, Adata, 1, G)); /* X = AY; G is scratch */ #ifdef DEBUG - evectmatrix_XtY(S1, Y, X, S2); - sqmatrix_assert_hermitian(S1); + evectmatrix_XtY(S1, Y, X, S2); + sqmatrix_assert_hermitian(S1); #endif - /* G = AYU; note that U is Hermitian: */ - TIME_OP(time_ZS, evectmatrix_XeYS(G, X, U, 1)); - - TIME_OP(time_ZtW, evectmatrix_XtY(YtAYU, Y, G, S2)); - E = SCALAR_RE(sqmatrix_trace(YtAYU)); - CHECK(!BADNUM(E), "crazy number detected in trace!!\n"); - mpi_assert_equal(E); - - if (L) { - /* X = LY, no scratch */ - L(Y, X, Ldata, 1, X); - g_lag = tfd.trace_YtLY = SCALAR_RE(evectmatrix_traceXtY(Y, X)); - E += *lag * g_lag; - } - - convergence_history[iteration % EIG_HISTORY_SIZE] = - 200.0 * fabs(E - prev_E) / (fabs(E) + fabs(prev_E)); - - if (iteration > 0 && mpi_is_master() && - ((flags & EIGS_VERBOSE) || - MPIGLUE_CLOCK_DIFF(MPIGLUE_CLOCK, prev_feedback_time) - > FEEDBACK_TIME)) { - if (mpb_verbosity >= 2) { - mpi_one_printf(" iteration %4d: " - "trace = %0.16g (%g%% change)\n", iteration, (double)E, - (double)convergence_history[iteration % EIG_HISTORY_SIZE]); - } - if (flags & EIGS_VERBOSE) - debug_output_malloc_count(); - fflush(stdout); /* make sure output appears */ - prev_feedback_time = MPIGLUE_CLOCK; /* reset feedback clock */ - } - - if (iteration > 0 && - fabs(E - prev_E) < tolerance * 0.5 * (E + prev_E + 1e-7)) - break; /* convergence! hooray! */ - - /* Compute gradient of functional: G = (1 - BY U Yt) A Y U */ - sqmatrix_AeBC(S1, U, 0, YtAYU, 0); - evectmatrix_XpaYS(G, -1.0, BY, S1, 1); - - if (L) { /* include Lagrange gradient; note X = LY from above */ - evectmatrix_aXpbY(1.0, G, *lag, X); - } - - /* set X = precondition(G): */ - if (K != NULL) { - TIME_OP(time_KZ, K(G, X, Kdata, Y, NULL, YtBY)); - /* Note: we passed NULL for eigenvals since we haven't - diagonalized YAY (nor are the Y's orthonormal). */ - } - else - evectmatrix_copy(X, G); /* preconditioner is identity */ - - /* We have to apply the constraint here, in case it doesn't - commute with the preconditioner. */ - if (constraint) - constraint(X, constraint_data); - - if (flags & EIGS_PROJECT_PRECONDITIONING) { - /* Operate projection P = (1 - BY U Yt) on X: */ - evectmatrix_XtY(symYtBD, Y, X, S2); /* symYtBD = Yt X */ - sqmatrix_AeBC(S1, U, 0, symYtBD, 0); - evectmatrix_XpaYS(X, -1.0, BY, S1, 0); - } - - /* Now, for the case of EIGS_ORTHOGONAL_PRECONDITIONER, we - need to use G as scratch space in order to avoid the need - for an extra column bundle. Before that, we need to do - any computations that we need with G. (Yes, we're - playing tricksy games here, but isn't it fun?) */ - - mpi_assert_equal(traceGtX = - SCALAR_RE(evectmatrix_traceXtY(G, X)) - + g_lag * g_lag); - if (usingConjugateGradient) { - if (use_polak_ribiere) { - /* assign G = G - prev_G and copy prev_G = G in the - same loop. We can't use the BLAS routines because - we would then need an extra n x p array. */ - for (i = 0; i < Y.n * Y.p; ++i) { - scalar g = G.data[i]; - ACCUMULATE_DIFF(G.data[i], prev_G.data[i]); - prev_G.data[i] = g; - } - gamma_numerator = SCALAR_RE(evectmatrix_traceXtY(G, X)); - - { real g = g_lag; g_lag -= prev_g_lag; prev_g_lag = g; } - gamma_numerator += g_lag * prev_g_lag; - } - else /* otherwise, use Fletcher-Reeves (ignore prev_G) */ - gamma_numerator = traceGtX; - mpi_assert_equal(gamma_numerator); - } - - /* The motivation for the following code came from a trick I - noticed in Sleijpen and Van der Vorst, "A Jacobi-Davidson - iteration method for linear eigenvalue problems," SIAM - J. Matrix Anal. Appl. 17, 401-425 (April 1996). (The - motivation in our case comes from the fact that if you - look at the Hessian matrix of the problem, it has a - projection operator just as in the above reference, and - so we should the same technique to invert it.) So far, - though, the hoped-for savings haven't materialized; maybe - we need a better preconditioner first. */ - if (flags & EIGS_ORTHOGONAL_PRECONDITIONER) { - real traceGtX_delta; /* change in traceGtX when we update X */ - - /* set G = precondition(Y): */ - if (K != NULL) - K(Y, G, Kdata, Y, NULL, YtBY); - else - evectmatrix_copy(G, Y); /* preconditioner is identity */ - - /* let X = KG - KY S3t, where S3 is chosen so that YtX = 0: - S3 = (YtKG)t / (YtKY). Recall that, at this point, - X holds KG and G holds KY. K is assumed Hermitian. */ - evectmatrix_XtY(S1, Y, G, S2); - CHECK(sqmatrix_invert(S1, 0, S2), - "singular YtKY"); /* S1 = 1 / (YtKY) */ - evectmatrix_XtY(S2, X, Y, S3); /* S2 = GtKY = (YtKG)t */ - sqmatrix_AeBC(S3, S2, 0 , S1, 1); - evectmatrix_XpaYS(X, -1.0, G, S3, 1); - - /* Update traceGtX and gamma_numerator. The update - for gamma_numerator isn't really right in the case - of Polak-Ribiere; it amounts to doing a weird combination - of P-R and Fletcher-Reeves...what will happen? (To - do the right thing, I think we would need an extra - column bundle.) */ - traceGtX_delta = -SCALAR_RE(sqmatrix_traceAtB(S3, S2)); - traceGtX += traceGtX_delta; - if (usingConjugateGradient) - gamma_numerator += traceGtX_delta; - } - - /* In conjugate-gradient, the minimization direction D is - a combination of X with the previous search directions. - Otherwise, we just have D = X. */ - - if (usingConjugateGradient) { - real gamma; - - if (prev_traceGtX == 0.0) - gamma = 0.0; - else - gamma = gamma_numerator / prev_traceGtX; - - if ((flags & EIGS_DYNAMIC_RESET_CG) && - 2.0 * convergence_history[iteration % EIG_HISTORY_SIZE] >= - convergence_history[(iteration+1) % EIG_HISTORY_SIZE]) { - gamma = 0.0; - if (flags & EIGS_VERBOSE && mpb_verbosity >= 2) - mpi_one_printf(" dynamically resetting CG direction...\n"); - for (i = 1; i < EIG_HISTORY_SIZE; ++i) - convergence_history[(iteration+i) % EIG_HISTORY_SIZE] - = 10000.0; - } - - if ((flags & EIGS_RESET_CG) && - (iteration + 1) % CG_RESET_ITERS == 0) { - /* periodically forget previous search directions, - and just juse D = X */ - gamma = 0.0; - if (flags & EIGS_VERBOSE && mpb_verbosity >= 2) - mpi_one_printf(" resetting CG direction...\n"); - } - - mpi_assert_equal(gamma * d_scale); - evectmatrix_aXpbY(gamma * d_scale, D, 1.0, X); - d_lag = gamma * d_scale * d_lag + g_lag; - } - - d_scale = 1.0; - - /* Minimize the trace along Y + lambda*D: */ - - if (!use_linmin) { - real dE, E2, d2E, t, d_norm; - - /* Here, we do an approximate line minimization along D - by evaluating dE (the derivative) at the current point, - and the trace E2 at a second point, and then approximating - the second derivative d2E by finite differences. Then, - we use one step of Newton's method on the derivative. - This has the advantage of requiring two fewer O(np^2) - matrix multiplications compared to the exact linmin. */ - - if (B) B(D, BD, Bdata, 0, BD); /* B*Y; no scratch */ - - d_norm = sqrt(SCALAR_RE(evectmatrix_traceXtY(BD,D)) / Y.p); - mpi_assert_equal(d_norm); - - /* dE = 2 * tr Gt D. (Use prev_G instead of G so that - it works even when we are using Polak-Ribiere.) */ - dE = 2.0 * SCALAR_RE(evectmatrix_traceXtY(prev_G, D)) / d_norm; - - /* shift Y by prev_theta along D, in the downhill direction: */ - t = dE < 0 ? -fabs(prev_theta) : fabs(prev_theta); - evectmatrix_aXpbY(1.0, Y, t / d_norm, D); - - if (B) { - B(Y, BY, Bdata, 1, G); /* B*Y; G is scratch */ - evectmatrix_XtY(U, Y, BY, S2); - } - else - evectmatrix_XtX(U, Y, S2); - CHECK(sqmatrix_invert(U, 1, S2), - "singular YtBY"); /* U = 1 / (Yt B Y) */ - A(Y, G, Adata, 1, X); /* G = AY; X is scratch */ - evectmatrix_XtY(S1, Y, G, S2); /* S1 = Yt A Y */ - - E2 = SCALAR_RE(sqmatrix_traceAtB(S1, U)); - - if (L) { - *lag += (t / d_norm) * d_lag; - L(Y, X, Ldata, 1, X); - E2 += *lag * SCALAR_RE(evectmatrix_traceXtY(Y, X)); - } - - mpi_assert_equal(E2); - - /* Get finite-difference approximation for the 2nd derivative - of the trace. Equivalently, fit to a quadratic of the - form: E(theta) = E + dE theta + 1/2 d2E theta^2 */ - d2E = (E2 - E - dE * t) / (0.5 * t * t); - - theta = -dE/d2E; - - /* If the 2nd derivative is negative, or a big shift - in the trace is predicted (compared to the previous - iteration), then this approximate line minimization is - probably not very good; switch back to the exact - line minimization. Hopefully, we won't have to - abort like this very often, as it wastes operations. */ - if (d2E < 0 || -0.5*dE*theta > 20.0 * fabs(E-prev_E)) { - if (flags & EIGS_FORCE_APPROX_LINMIN) { - if (flags & EIGS_VERBOSE && mpb_verbosity >= 2) - mpi_one_printf(" using previous stepsize\n"); - } - else { - if (flags & EIGS_VERBOSE && mpb_verbosity >= 2) - mpi_one_printf(" switching back to exact " - "line minimization\n"); - use_linmin = 1; - evectmatrix_aXpbY(1.0, Y, -t / d_norm, D); - if (L) *lag -= (t / d_norm) * d_lag; - prev_theta = atan(prev_theta); /* convert to angle */ - /* don't do this again: */ - flags |= EIGS_FORCE_EXACT_LINMIN; - } - } - else { - /* Shift Y by theta, hopefully minimizing the trace: */ - evectmatrix_aXpbY(1.0, Y, (theta - t) / d_norm, D); - if (L) *lag += ((theta - t) / d_norm) * d_lag; - } - } - if (use_linmin) { - real dE, d2E; - - if (B) B(D, BD, Bdata, 0, G); /* B*Y; G is scratch */ - - d_scale = sqrt(SCALAR_RE(evectmatrix_traceXtY(BD, D)) / Y.p); - mpi_assert_equal(d_scale); - blasglue_rscal(Y.p * Y.n, 1/d_scale, D.data, 1); - if (B) blasglue_rscal(Y.p * Y.n, 1/d_scale, BD.data, 1); - - A(D, G, Adata, 0, X); /* G = A D; X is scratch */ - if (B) - evectmatrix_XtY(DtBD, D, BD, S2); - else - evectmatrix_XtX(DtBD, D, S2); - evectmatrix_XtY(DtAD, D, G, S2); - sqmatrix_assert_hermitian(DtBD); - sqmatrix_assert_hermitian(DtAD); - - evectmatrix_XtY(S1, Y, BD, S2); - sqmatrix_symmetrize(symYtBD, S1); - - evectmatrix_XtY(S1, Y, G, S2); - sqmatrix_symmetrize(symYtAD, S1); - - sqmatrix_AeBC(S1, U, 0, symYtBD, 1); - dE = 2.0 * (SCALAR_RE(sqmatrix_traceAtB(U, symYtAD)) - - SCALAR_RE(sqmatrix_traceAtB(YtAYU, S1))); - - sqmatrix_copy(S2, DtBD); - sqmatrix_ApaBC(S2, -4.0, symYtBD, 0, S1, 0); - sqmatrix_AeBC(S3, symYtAD, 0, S1, 0); - sqmatrix_AeBC(S1, U, 0, S2, 1); - d2E = 2.0 * (SCALAR_RE(sqmatrix_traceAtB(U, DtAD)) - - SCALAR_RE(sqmatrix_traceAtB(YtAYU, S1)) - - 4.0 * SCALAR_RE(sqmatrix_traceAtB(U, S3))); - - if (L) { - d_lag *= 1/d_scale; - tfd.d_lag = d_lag; - tfd.lag = *lag; - /* note: tfd.trace_YtLY was set above */ - L(D, X, Ldata, 0, X); - tfd.trace_DtLD = SCALAR_RE(evectmatrix_traceXtY(D, X)); - tfd.trace_YtLD = SCALAR_RE(evectmatrix_traceXtY(Y, X)); - dE += tfd.lag * 2.0 * tfd.trace_YtLD - + tfd.d_lag * tfd.trace_YtLY; - d2E += tfd.lag * 2.0 * tfd.trace_DtLD - + tfd.d_lag * 4.0 * tfd.trace_YtLD; - } - else { - tfd.d_lag = tfd.lag = tfd.trace_YtLY = - tfd.trace_DtLD = tfd.trace_YtLD = 0; - } - - /* this is just Newton-Raphson to find a root of - the first derivative: */ - theta = -dE/d2E; - - if (d2E < 0) { - if (flags & EIGS_VERBOSE && mpb_verbosity >= 2) - mpi_one_printf(" near maximum in trace\n"); - theta = dE > 0 ? -fabs(prev_theta) : fabs(prev_theta); - } - else if (-0.5*dE*theta > 2.0 * fabs(E-prev_E)) { - if (flags & EIGS_VERBOSE && mpb_verbosity >= 2) - mpi_one_printf(" large trace change predicted " - "(%g%%)\n", (double) (-0.5*dE*theta/E * 100.0)); - } - if (fabs(theta) >= K_PI) { - if (flags & EIGS_VERBOSE && mpb_verbosity >= 2) - mpi_one_printf(" large theta (%g)\n", (double)theta); - theta = dE > 0 ? -fabs(prev_theta) : fabs(prev_theta); - } - - /* Set S1 to YtAYU * YtBY = YtAY for use in linmin. - (tfd.YtAY == S1). */ - sqmatrix_AeBC(S1, YtAYU, 0, YtBY, 1); - sqmatrix_assert_hermitian(S1); - - mpi_assert_equal(theta); - { - linmin_real new_E, new_dE; - TIME_OP(time_linmin, - theta = linmin(&new_E, &new_dE, theta, E, dE, - 0.1, MIN2(tolerance, 1e-6), 1e-14, - 0, dE > 0 ? -K_PI : K_PI, - trace_func, &tfd, - flags & EIGS_VERBOSE)); - linmin_improvement = fabs(E - new_E) * 2.0/fabs(E + new_E); - } - mpi_assert_equal(theta); - - CHECK(fabs(theta) <= K_PI, "converged out of bounds!"); - - /* Shift Y to new location minimizing the trace along D: */ - evectmatrix_aXpbY(cos(theta), Y, sin(theta), D); - if (L) *lag = *lag * cos(theta) + d_lag * sin(theta); - } - - /* In exact arithmetic, we don't need to do this, but in practice - it is probably a good idea to keep errors from adding up and - eventually violating the constraints. */ - if (constraint) - constraint(Y, constraint_data); - - prev_traceGtX = traceGtX; - prev_theta = theta; - prev_E = E; - - /* Finally, we use the times for the various operations to - help us pick an algorithm for the next iteration: */ - { - real t_exact, t_approx; - t_exact = EXACT_LINMIN_TIME(time_AZ, time_KZ, time_ZtW, - time_ZS, time_ZtZ, time_linmin); - t_approx = APPROX_LINMIN_TIME(time_AZ, time_KZ, time_ZtW, - time_ZS, time_ZtZ); - if (flags & EIGS_PROJECT_PRECONDITIONING) { - t_exact += time_ZtW + time_ZS; - t_approx += time_ZtW + time_ZS; - } - - /* Sum the times over the processors so that all the - processors compare the same, average times. */ - mpi_allreduce_1(&t_exact, - real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm); - mpi_allreduce_1(&t_approx, - real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm); - - if (!(flags & EIGS_FORCE_EXACT_LINMIN) && - linmin_improvement > 0 && - linmin_improvement <= APPROX_LINMIN_IMPROVEMENT_THRESHOLD && - t_exact > t_approx * APPROX_LINMIN_SLOWDOWN_GUESS) { - if ((flags & EIGS_VERBOSE) && use_linmin && mpb_verbosity >= 2) - mpi_one_printf(" switching to approximate " - "line minimization (decrease time by %g%%)\n", - (double) ((t_exact - t_approx) * 100.0 / t_exact)); - use_linmin = 0; - } - else if (!(flags & EIGS_FORCE_APPROX_LINMIN)) { - if ((flags & EIGS_VERBOSE) && !use_linmin && mpb_verbosity >= 2) - mpi_one_printf(" switching back to exact " - "line minimization\n"); - use_linmin = 1; - prev_theta = atan(prev_theta); /* convert to angle */ - } - } - } while (++iteration < EIGENSOLVER_MAX_ITERATIONS); - - CHECK(iteration < EIGENSOLVER_MAX_ITERATIONS, - "failure to converge after " - STRINGIZE(EIGENSOLVER_MAX_ITERATIONS) - " iterations"); - - if (B) { - B(Y, BY, Bdata, 1, G); /* B*Y; G is scratch */ - evectmatrix_XtY(U, Y, BY, S2); - } - else - evectmatrix_XtX(U, Y, S2); - CHECK(sqmatrix_invert(U, 1, S2), "singular YtBY at end"); - eigensolver_get_eigenvals_aux(Y, eigenvals, A, Adata, - X, G, U, S1, S2); - - *num_iterations = iteration; - - destroy_sqmatrix(S3); - destroy_sqmatrix(S2); - destroy_sqmatrix(S1); - destroy_sqmatrix(symYtBD); - destroy_sqmatrix(DtBD); - destroy_sqmatrix(U); - destroy_sqmatrix(YtBY); - destroy_sqmatrix(symYtAD); - destroy_sqmatrix(DtAD); - destroy_sqmatrix(YtAYU); + /* G = AYU; note that U is Hermitian: */ + TIME_OP(time_ZS, evectmatrix_XeYS(G, X, U, 1)); + + TIME_OP(time_ZtW, evectmatrix_XtY(YtAYU, Y, G, S2)); + E = SCALAR_RE(sqmatrix_trace(YtAYU)); + CHECK(!BADNUM(E), "crazy number detected in trace!!\n"); + mpi_assert_equal(E); + + if (L) { + /* X = LY, no scratch */ + L(Y, X, Ldata, 1, X); + g_lag = tfd.trace_YtLY = SCALAR_RE(evectmatrix_traceXtY(Y, X)); + E += *lag * g_lag; + } + + convergence_history[iteration % EIG_HISTORY_SIZE] = + 200.0 * fabs(E - prev_E) / (fabs(E) + fabs(prev_E)); + + if (iteration > 0 && mpi_is_master() && + ((flags & EIGS_VERBOSE) || + MPIGLUE_CLOCK_DIFF(MPIGLUE_CLOCK, prev_feedback_time) > FEEDBACK_TIME)) { + if (mpb_verbosity >= 2) { + mpi_one_printf(" iteration %4d: " + "trace = %0.16g (%g%% change)\n", + iteration, (double)E, + (double)convergence_history[iteration % EIG_HISTORY_SIZE]); + } + if (flags & EIGS_VERBOSE) debug_output_malloc_count(); + fflush(stdout); /* make sure output appears */ + prev_feedback_time = MPIGLUE_CLOCK; /* reset feedback clock */ + } + + if (iteration > 0 && fabs(E - prev_E) < tolerance * 0.5 * (E + prev_E + 1e-7)) + break; /* convergence! hooray! */ + + /* Compute gradient of functional: G = (1 - BY U Yt) A Y U */ + sqmatrix_AeBC(S1, U, 0, YtAYU, 0); + evectmatrix_XpaYS(G, -1.0, BY, S1, 1); + + if (L) { /* include Lagrange gradient; note X = LY from above */ + evectmatrix_aXpbY(1.0, G, *lag, X); + } + + /* set X = precondition(G): */ + if (K != NULL) { + TIME_OP(time_KZ, K(G, X, Kdata, Y, NULL, YtBY)); + /* Note: we passed NULL for eigenvals since we haven't + diagonalized YAY (nor are the Y's orthonormal). */ + } + else + evectmatrix_copy(X, G); /* preconditioner is identity */ + + /* We have to apply the constraint here, in case it doesn't + commute with the preconditioner. */ + if (constraint) constraint(X, constraint_data); + + if (flags & EIGS_PROJECT_PRECONDITIONING) { + /* Operate projection P = (1 - BY U Yt) on X: */ + evectmatrix_XtY(symYtBD, Y, X, S2); /* symYtBD = Yt X */ + sqmatrix_AeBC(S1, U, 0, symYtBD, 0); + evectmatrix_XpaYS(X, -1.0, BY, S1, 0); + } + + /* Now, for the case of EIGS_ORTHOGONAL_PRECONDITIONER, we + need to use G as scratch space in order to avoid the need + for an extra column bundle. Before that, we need to do + any computations that we need with G. (Yes, we're + playing tricksy games here, but isn't it fun?) */ + + mpi_assert_equal(traceGtX = SCALAR_RE(evectmatrix_traceXtY(G, X)) + g_lag * g_lag); + if (usingConjugateGradient) { + if (use_polak_ribiere) { + /* assign G = G - prev_G and copy prev_G = G in the + same loop. We can't use the BLAS routines because + we would then need an extra n x p array. */ + for (i = 0; i < Y.n * Y.p; ++i) { + scalar g = G.data[i]; + ACCUMULATE_DIFF(G.data[i], prev_G.data[i]); + prev_G.data[i] = g; + } + gamma_numerator = SCALAR_RE(evectmatrix_traceXtY(G, X)); + + { + real g = g_lag; + g_lag -= prev_g_lag; + prev_g_lag = g; + } + gamma_numerator += g_lag * prev_g_lag; + } + else /* otherwise, use Fletcher-Reeves (ignore prev_G) */ + gamma_numerator = traceGtX; + mpi_assert_equal(gamma_numerator); + } + + /* The motivation for the following code came from a trick I + noticed in Sleijpen and Van der Vorst, "A Jacobi-Davidson + iteration method for linear eigenvalue problems," SIAM + J. Matrix Anal. Appl. 17, 401-425 (April 1996). (The + motivation in our case comes from the fact that if you + look at the Hessian matrix of the problem, it has a + projection operator just as in the above reference, and + so we should the same technique to invert it.) So far, + though, the hoped-for savings haven't materialized; maybe + we need a better preconditioner first. */ + if (flags & EIGS_ORTHOGONAL_PRECONDITIONER) { + real traceGtX_delta; /* change in traceGtX when we update X */ + + /* set G = precondition(Y): */ + if (K != NULL) + K(Y, G, Kdata, Y, NULL, YtBY); + else + evectmatrix_copy(G, Y); /* preconditioner is identity */ + + /* let X = KG - KY S3t, where S3 is chosen so that YtX = 0: + S3 = (YtKG)t / (YtKY). Recall that, at this point, + X holds KG and G holds KY. K is assumed Hermitian. */ + evectmatrix_XtY(S1, Y, G, S2); + CHECK(sqmatrix_invert(S1, 0, S2), "singular YtKY"); /* S1 = 1 / (YtKY) */ + evectmatrix_XtY(S2, X, Y, S3); /* S2 = GtKY = (YtKG)t */ + sqmatrix_AeBC(S3, S2, 0, S1, 1); + evectmatrix_XpaYS(X, -1.0, G, S3, 1); + + /* Update traceGtX and gamma_numerator. The update + for gamma_numerator isn't really right in the case + of Polak-Ribiere; it amounts to doing a weird combination + of P-R and Fletcher-Reeves...what will happen? (To + do the right thing, I think we would need an extra + column bundle.) */ + traceGtX_delta = -SCALAR_RE(sqmatrix_traceAtB(S3, S2)); + traceGtX += traceGtX_delta; + if (usingConjugateGradient) gamma_numerator += traceGtX_delta; + } + + /* In conjugate-gradient, the minimization direction D is + a combination of X with the previous search directions. + Otherwise, we just have D = X. */ + + if (usingConjugateGradient) { + real gamma; + + if (prev_traceGtX == 0.0) + gamma = 0.0; + else + gamma = gamma_numerator / prev_traceGtX; + + if ((flags & EIGS_DYNAMIC_RESET_CG) && + 2.0 * convergence_history[iteration % EIG_HISTORY_SIZE] >= + convergence_history[(iteration + 1) % EIG_HISTORY_SIZE]) { + gamma = 0.0; + if (flags & EIGS_VERBOSE && mpb_verbosity >= 2) + mpi_one_printf(" dynamically resetting CG direction...\n"); + for (i = 1; i < EIG_HISTORY_SIZE; ++i) + convergence_history[(iteration + i) % EIG_HISTORY_SIZE] = 10000.0; + } + + if ((flags & EIGS_RESET_CG) && (iteration + 1) % CG_RESET_ITERS == 0) { + /* periodically forget previous search directions, + and just juse D = X */ + gamma = 0.0; + if (flags & EIGS_VERBOSE && mpb_verbosity >= 2) + mpi_one_printf(" resetting CG direction...\n"); + } + + mpi_assert_equal(gamma * d_scale); + evectmatrix_aXpbY(gamma * d_scale, D, 1.0, X); + d_lag = gamma * d_scale * d_lag + g_lag; + } + + d_scale = 1.0; + + /* Minimize the trace along Y + lambda*D: */ + + if (!use_linmin) { + real dE, E2, d2E, t, d_norm; + + /* Here, we do an approximate line minimization along D + by evaluating dE (the derivative) at the current point, + and the trace E2 at a second point, and then approximating + the second derivative d2E by finite differences. Then, + we use one step of Newton's method on the derivative. + This has the advantage of requiring two fewer O(np^2) + matrix multiplications compared to the exact linmin. */ + + if (B) B(D, BD, Bdata, 0, BD); /* B*Y; no scratch */ + + d_norm = sqrt(SCALAR_RE(evectmatrix_traceXtY(BD, D)) / Y.p); + mpi_assert_equal(d_norm); + + /* dE = 2 * tr Gt D. (Use prev_G instead of G so that + it works even when we are using Polak-Ribiere.) */ + dE = 2.0 * SCALAR_RE(evectmatrix_traceXtY(prev_G, D)) / d_norm; + + /* shift Y by prev_theta along D, in the downhill direction: */ + t = dE < 0 ? -fabs(prev_theta) : fabs(prev_theta); + evectmatrix_aXpbY(1.0, Y, t / d_norm, D); + + if (B) { + B(Y, BY, Bdata, 1, G); /* B*Y; G is scratch */ + evectmatrix_XtY(U, Y, BY, S2); + } + else + evectmatrix_XtX(U, Y, S2); + CHECK(sqmatrix_invert(U, 1, S2), "singular YtBY"); /* U = 1 / (Yt B Y) */ + A(Y, G, Adata, 1, X); /* G = AY; X is scratch */ + evectmatrix_XtY(S1, Y, G, S2); /* S1 = Yt A Y */ + + E2 = SCALAR_RE(sqmatrix_traceAtB(S1, U)); + + if (L) { + *lag += (t / d_norm) * d_lag; + L(Y, X, Ldata, 1, X); + E2 += *lag * SCALAR_RE(evectmatrix_traceXtY(Y, X)); + } + + mpi_assert_equal(E2); + + /* Get finite-difference approximation for the 2nd derivative + of the trace. Equivalently, fit to a quadratic of the + form: E(theta) = E + dE theta + 1/2 d2E theta^2 */ + d2E = (E2 - E - dE * t) / (0.5 * t * t); + + theta = -dE / d2E; + + /* If the 2nd derivative is negative, or a big shift + in the trace is predicted (compared to the previous + iteration), then this approximate line minimization is + probably not very good; switch back to the exact + line minimization. Hopefully, we won't have to + abort like this very often, as it wastes operations. */ + if (d2E < 0 || -0.5 * dE * theta > 20.0 * fabs(E - prev_E)) { + if (flags & EIGS_FORCE_APPROX_LINMIN) { + if (flags & EIGS_VERBOSE && mpb_verbosity >= 2) + mpi_one_printf(" using previous stepsize\n"); + } + else { + if (flags & EIGS_VERBOSE && mpb_verbosity >= 2) + mpi_one_printf(" switching back to exact " + "line minimization\n"); + use_linmin = 1; + evectmatrix_aXpbY(1.0, Y, -t / d_norm, D); + if (L) *lag -= (t / d_norm) * d_lag; + prev_theta = atan(prev_theta); /* convert to angle */ + /* don't do this again: */ + flags |= EIGS_FORCE_EXACT_LINMIN; + } + } + else { + /* Shift Y by theta, hopefully minimizing the trace: */ + evectmatrix_aXpbY(1.0, Y, (theta - t) / d_norm, D); + if (L) *lag += ((theta - t) / d_norm) * d_lag; + } + } + if (use_linmin) { + real dE, d2E; + + if (B) B(D, BD, Bdata, 0, G); /* B*Y; G is scratch */ + + d_scale = sqrt(SCALAR_RE(evectmatrix_traceXtY(BD, D)) / Y.p); + mpi_assert_equal(d_scale); + blasglue_rscal(Y.p * Y.n, 1 / d_scale, D.data, 1); + if (B) blasglue_rscal(Y.p * Y.n, 1 / d_scale, BD.data, 1); + + A(D, G, Adata, 0, X); /* G = A D; X is scratch */ + if (B) + evectmatrix_XtY(DtBD, D, BD, S2); + else + evectmatrix_XtX(DtBD, D, S2); + evectmatrix_XtY(DtAD, D, G, S2); + sqmatrix_assert_hermitian(DtBD); + sqmatrix_assert_hermitian(DtAD); + + evectmatrix_XtY(S1, Y, BD, S2); + sqmatrix_symmetrize(symYtBD, S1); + + evectmatrix_XtY(S1, Y, G, S2); + sqmatrix_symmetrize(symYtAD, S1); + + sqmatrix_AeBC(S1, U, 0, symYtBD, 1); + dE = 2.0 * + (SCALAR_RE(sqmatrix_traceAtB(U, symYtAD)) - SCALAR_RE(sqmatrix_traceAtB(YtAYU, S1))); + + sqmatrix_copy(S2, DtBD); + sqmatrix_ApaBC(S2, -4.0, symYtBD, 0, S1, 0); + sqmatrix_AeBC(S3, symYtAD, 0, S1, 0); + sqmatrix_AeBC(S1, U, 0, S2, 1); + d2E = 2.0 * (SCALAR_RE(sqmatrix_traceAtB(U, DtAD)) - SCALAR_RE(sqmatrix_traceAtB(YtAYU, S1)) - + 4.0 * SCALAR_RE(sqmatrix_traceAtB(U, S3))); + + if (L) { + d_lag *= 1 / d_scale; + tfd.d_lag = d_lag; + tfd.lag = *lag; + /* note: tfd.trace_YtLY was set above */ + L(D, X, Ldata, 0, X); + tfd.trace_DtLD = SCALAR_RE(evectmatrix_traceXtY(D, X)); + tfd.trace_YtLD = SCALAR_RE(evectmatrix_traceXtY(Y, X)); + dE += tfd.lag * 2.0 * tfd.trace_YtLD + tfd.d_lag * tfd.trace_YtLY; + d2E += tfd.lag * 2.0 * tfd.trace_DtLD + tfd.d_lag * 4.0 * tfd.trace_YtLD; + } + else { + tfd.d_lag = tfd.lag = tfd.trace_YtLY = tfd.trace_DtLD = tfd.trace_YtLD = 0; + } + + /* this is just Newton-Raphson to find a root of + the first derivative: */ + theta = -dE / d2E; + + if (d2E < 0) { + if (flags & EIGS_VERBOSE && mpb_verbosity >= 2) + mpi_one_printf(" near maximum in trace\n"); + theta = dE > 0 ? -fabs(prev_theta) : fabs(prev_theta); + } + else if (-0.5 * dE * theta > 2.0 * fabs(E - prev_E)) { + if (flags & EIGS_VERBOSE && mpb_verbosity >= 2) + mpi_one_printf(" large trace change predicted " + "(%g%%)\n", + (double)(-0.5 * dE * theta / E * 100.0)); + } + if (fabs(theta) >= K_PI) { + if (flags & EIGS_VERBOSE && mpb_verbosity >= 2) + mpi_one_printf(" large theta (%g)\n", (double)theta); + theta = dE > 0 ? -fabs(prev_theta) : fabs(prev_theta); + } + + /* Set S1 to YtAYU * YtBY = YtAY for use in linmin. + (tfd.YtAY == S1). */ + sqmatrix_AeBC(S1, YtAYU, 0, YtBY, 1); + sqmatrix_assert_hermitian(S1); + + mpi_assert_equal(theta); + { + linmin_real new_E, new_dE; + TIME_OP(time_linmin, + theta = linmin(&new_E, &new_dE, theta, E, dE, 0.1, MIN2(tolerance, 1e-6), 1e-14, 0, + dE > 0 ? -K_PI : K_PI, trace_func, &tfd, flags & EIGS_VERBOSE)); + linmin_improvement = fabs(E - new_E) * 2.0 / fabs(E + new_E); + } + mpi_assert_equal(theta); + + CHECK(fabs(theta) <= K_PI, "converged out of bounds!"); + + /* Shift Y to new location minimizing the trace along D: */ + evectmatrix_aXpbY(cos(theta), Y, sin(theta), D); + if (L) *lag = *lag * cos(theta) + d_lag * sin(theta); + } + + /* In exact arithmetic, we don't need to do this, but in practice + it is probably a good idea to keep errors from adding up and + eventually violating the constraints. */ + if (constraint) constraint(Y, constraint_data); + + prev_traceGtX = traceGtX; + prev_theta = theta; + prev_E = E; + + /* Finally, we use the times for the various operations to + help us pick an algorithm for the next iteration: */ + { + real t_exact, t_approx; + t_exact = EXACT_LINMIN_TIME(time_AZ, time_KZ, time_ZtW, time_ZS, time_ZtZ, time_linmin); + t_approx = APPROX_LINMIN_TIME(time_AZ, time_KZ, time_ZtW, time_ZS, time_ZtZ); + if (flags & EIGS_PROJECT_PRECONDITIONING) { + t_exact += time_ZtW + time_ZS; + t_approx += time_ZtW + time_ZS; + } + + /* Sum the times over the processors so that all the + processors compare the same, average times. */ + mpi_allreduce_1(&t_exact, real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm); + mpi_allreduce_1(&t_approx, real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm); + + if (!(flags & EIGS_FORCE_EXACT_LINMIN) && linmin_improvement > 0 && + linmin_improvement <= APPROX_LINMIN_IMPROVEMENT_THRESHOLD && + t_exact > t_approx * APPROX_LINMIN_SLOWDOWN_GUESS) { + if ((flags & EIGS_VERBOSE) && use_linmin && mpb_verbosity >= 2) + mpi_one_printf(" switching to approximate " + "line minimization (decrease time by %g%%)\n", + (double)((t_exact - t_approx) * 100.0 / t_exact)); + use_linmin = 0; + } + else if (!(flags & EIGS_FORCE_APPROX_LINMIN)) { + if ((flags & EIGS_VERBOSE) && !use_linmin && mpb_verbosity >= 2) + mpi_one_printf(" switching back to exact " + "line minimization\n"); + use_linmin = 1; + prev_theta = atan(prev_theta); /* convert to angle */ + } + } + } while (++iteration < EIGENSOLVER_MAX_ITERATIONS); + + CHECK(iteration < EIGENSOLVER_MAX_ITERATIONS, + "failure to converge after " STRINGIZE(EIGENSOLVER_MAX_ITERATIONS) " iterations"); + + if (B) { + B(Y, BY, Bdata, 1, G); /* B*Y; G is scratch */ + evectmatrix_XtY(U, Y, BY, S2); + } + else + evectmatrix_XtX(U, Y, S2); + CHECK(sqmatrix_invert(U, 1, S2), "singular YtBY at end"); + eigensolver_get_eigenvals_aux(Y, eigenvals, A, Adata, X, G, U, S1, S2); + + *num_iterations = iteration; + + destroy_sqmatrix(S3); + destroy_sqmatrix(S2); + destroy_sqmatrix(S1); + destroy_sqmatrix(symYtBD); + destroy_sqmatrix(DtBD); + destroy_sqmatrix(U); + destroy_sqmatrix(YtBY); + destroy_sqmatrix(symYtAD); + destroy_sqmatrix(DtAD); + destroy_sqmatrix(YtAYU); } -void eigensolver(evectmatrix Y, real *eigenvals, - evectoperator A, void *Adata, - evectoperator B, void *Bdata, - evectpreconditioner K, void *Kdata, - evectconstraint constraint, void *constraint_data, - evectmatrix Work[], int nWork, - real tolerance, int *num_iterations, - int flags) -{ - eigensolver_lagrange(Y, eigenvals, A, Adata, B, Bdata, K, Kdata, - constraint, constraint_data, - 0, 0, 0, - Work, nWork, tolerance, num_iterations, flags); +void eigensolver(evectmatrix Y, real *eigenvals, evectoperator A, void *Adata, evectoperator B, + void *Bdata, evectpreconditioner K, void *Kdata, evectconstraint constraint, + void *constraint_data, evectmatrix Work[], int nWork, real tolerance, + int *num_iterations, int flags) { + eigensolver_lagrange(Y, eigenvals, A, Adata, B, Bdata, K, Kdata, constraint, constraint_data, 0, + 0, 0, Work, nWork, tolerance, num_iterations, flags); } diff --git a/src/matrices/eigensolver.h b/src/matrices/eigensolver.h index 69a48da2..0a729319 100644 --- a/src/matrices/eigensolver.h +++ b/src/matrices/eigensolver.h @@ -19,7 +19,7 @@ #define EIGENSOLVER_H #ifdef MPB_REAL -# define real mpb_real +#define real mpb_real #endif #include "matrices.h" @@ -28,84 +28,65 @@ extern "C" { #endif /* __cplusplus */ -typedef void (*evectoperator) (evectmatrix Xin, evectmatrix Xout, - void *data, int is_current_eigenvector, - evectmatrix Work); - -typedef void (*evectpreconditioner) (evectmatrix Xin, evectmatrix Xout, - void *data, - evectmatrix Y, real *eigenvals, - sqmatrix YtY); - -typedef void (*evectconstraint) (evectmatrix X, void *data); - -extern void eigensolver(evectmatrix Y, real *eigenvals, - evectoperator A, void *Adata, - evectoperator B, void *Bdata, - evectpreconditioner K, void *Kdata, - evectconstraint constraint, void *constraint_data, - evectmatrix Work[], int nWork, - real tolerance, int *num_iterations, - int flags); - -extern void eigensolver_lagrange(evectmatrix Y, real *eigenvals, - evectoperator A, void *Adata, - evectoperator B, void *Bdata, - evectpreconditioner K, void *Kdata, - evectconstraint constraint, void *constraint_data, - evectoperator L, void *Ldata, real *lag, - evectmatrix Work[], int nWork, - real tolerance, int *num_iterations, - int flags); - -extern void eigensolver_davidson(evectmatrix Y, real *eigenvals, - evectoperator A, void *Adata, - evectpreconditioner K, void *Kdata, - evectconstraint constraint, - void *constraint_data, - evectmatrix Work[], int nWork, - real tolerance, int *num_iterations, - int flags, - real target); - -extern void eigensolver_get_eigenvals(evectmatrix Y, real *eigenvals, - evectoperator A, void *Adata, - evectmatrix Work1, evectmatrix Work2); +typedef void (*evectoperator)(evectmatrix Xin, evectmatrix Xout, void *data, + int is_current_eigenvector, evectmatrix Work); + +typedef void (*evectpreconditioner)(evectmatrix Xin, evectmatrix Xout, void *data, evectmatrix Y, + real *eigenvals, sqmatrix YtY); + +typedef void (*evectconstraint)(evectmatrix X, void *data); + +extern void eigensolver(evectmatrix Y, real *eigenvals, evectoperator A, void *Adata, + evectoperator B, void *Bdata, evectpreconditioner K, void *Kdata, + evectconstraint constraint, void *constraint_data, evectmatrix Work[], + int nWork, real tolerance, int *num_iterations, int flags); + +extern void eigensolver_lagrange(evectmatrix Y, real *eigenvals, evectoperator A, void *Adata, + evectoperator B, void *Bdata, evectpreconditioner K, void *Kdata, + evectconstraint constraint, void *constraint_data, evectoperator L, + void *Ldata, real *lag, evectmatrix Work[], int nWork, + real tolerance, int *num_iterations, int flags); + +extern void eigensolver_davidson(evectmatrix Y, real *eigenvals, evectoperator A, void *Adata, + evectpreconditioner K, void *Kdata, evectconstraint constraint, + void *constraint_data, evectmatrix Work[], int nWork, + real tolerance, int *num_iterations, int flags, real target); + +extern void eigensolver_get_eigenvals(evectmatrix Y, real *eigenvals, evectoperator A, void *Adata, + evectmatrix Work1, evectmatrix Work2); /* eigensolver option flags, designed to be combined with a bitwise or ('|'); each flag should set exactly one bit. */ -#define EIGS_VERBOSE (1<<0) -#define EIGS_PROJECT_PRECONDITIONING (1<<1) -#define EIGS_RESET_CG (1<<2) -#define EIGS_FORCE_EXACT_LINMIN (1<<3) -#define EIGS_FORCE_APPROX_LINMIN (1<<4) -#define EIGS_ORTHONORMALIZE_FIRST_STEP (1<<5) -#define EIGS_REORTHOGONALIZE (1<<6) -#define EIGS_DYNAMIC_RESET_CG (1<<7) -#define EIGS_ORTHOGONAL_PRECONDITIONER (1<<8) +#define EIGS_VERBOSE (1 << 0) +#define EIGS_PROJECT_PRECONDITIONING (1 << 1) +#define EIGS_RESET_CG (1 << 2) +#define EIGS_FORCE_EXACT_LINMIN (1 << 3) +#define EIGS_FORCE_APPROX_LINMIN (1 << 4) +#define EIGS_ORTHONORMALIZE_FIRST_STEP (1 << 5) +#define EIGS_REORTHOGONALIZE (1 << 6) +#define EIGS_DYNAMIC_RESET_CG (1 << 7) +#define EIGS_ORTHOGONAL_PRECONDITIONER (1 << 8) /* default flags: what we think works best most of the time: */ #define EIGS_DEFAULT_FLAGS (EIGS_RESET_CG | EIGS_REORTHOGONALIZE) typedef struct evectconstraint_chain_struct { - evectconstraint C; - void *constraint_data; - struct evectconstraint_chain_struct *next; + evectconstraint C; + void *constraint_data; + struct evectconstraint_chain_struct *next; } evectconstraint_chain; -extern evectconstraint_chain *evect_add_constraint(evectconstraint_chain - *constraints, - evectconstraint C, - void *constraint_data); +extern evectconstraint_chain *evect_add_constraint(evectconstraint_chain *constraints, + evectconstraint C, void *constraint_data); extern void evect_destroy_constraints(evectconstraint_chain *constraints); extern void evectconstraint_chain_func(evectmatrix X, void *data); #ifdef __cplusplus -} /* extern "C" */ +} /* extern "C" */ #endif /* __cplusplus */ #ifdef MPB_REAL -# undef real +#undef real #endif #endif /* EIGENSOLVER_H */ diff --git a/src/matrices/eigensolver_davidson.c b/src/matrices/eigensolver_davidson.c index c0397b08..fd4f136f 100644 --- a/src/matrices/eigensolver_davidson.c +++ b/src/matrices/eigensolver_davidson.c @@ -36,11 +36,9 @@ #include "eigensolver.h" #include "verbosity.h" -extern void eigensolver_get_eigenvals_aux(evectmatrix Y, real *eigenvals, - evectoperator A, void *Adata, - evectmatrix Work1, evectmatrix Work2, - sqmatrix U, sqmatrix Usqrt, - sqmatrix Uwork); +extern void eigensolver_get_eigenvals_aux(evectmatrix Y, real *eigenvals, evectoperator A, + void *Adata, evectmatrix Work1, evectmatrix Work2, + sqmatrix U, sqmatrix Usqrt, sqmatrix Uwork); #define STRINGIZEx(x) #x /* a hack so that we can stringize macro values */ #define STRINGIZE(x) STRINGIZEx(x) @@ -52,197 +50,177 @@ extern void eigensolver_get_eigenvals_aux(evectmatrix Y, real *eigenvals, /**************************************************************************/ -void eigensolver_davidson(evectmatrix Y, real *eigenvals, - evectoperator A, void *Adata, - evectpreconditioner K, void *Kdata, - evectconstraint constraint, void *constraint_data, - evectmatrix Work[], int nWork, - real tolerance, int *num_iterations, - int flags, - real target) -{ - int nbasis, q; - evectmatrix *AV, *V; - sqmatrix VAV, S, Swork, U, S2, S3, I; - mpiglue_clock_t prev_feedback_time; - int iteration = 0, ibasis = 0; - real *eigenvals2, prev_E = 0; - - prev_feedback_time = MPIGLUE_CLOCK; +void eigensolver_davidson(evectmatrix Y, real *eigenvals, evectoperator A, void *Adata, + evectpreconditioner K, void *Kdata, evectconstraint constraint, + void *constraint_data, evectmatrix Work[], int nWork, real tolerance, + int *num_iterations, int flags, real target) { + int nbasis, q; + evectmatrix *AV, *V; + sqmatrix VAV, S, Swork, U, S2, S3, I; + mpiglue_clock_t prev_feedback_time; + int iteration = 0, ibasis = 0; + real *eigenvals2, prev_E = 0; + + prev_feedback_time = MPIGLUE_CLOCK; #ifdef DEBUG - flags |= EIGS_VERBOSE; + flags |= EIGS_VERBOSE; #endif - CHECK(nWork >= 4, "not enough workspace"); - - nbasis = nWork / 2; - V = Work; - AV = Work + nbasis; - - q = Y.p * nbasis; - VAV = create_sqmatrix(q); - S = create_sqmatrix(q); - Swork = create_sqmatrix(q); - - sqmatrix_resize(&VAV, 0, 0); - sqmatrix_resize(&S, 0, 0); - sqmatrix_resize(&Swork, 0, 0); - - CHK_MALLOC(eigenvals2, real, q); - - U = create_sqmatrix(Y.p); - S2 = create_sqmatrix(Y.p); - S3 = create_sqmatrix(Y.p); - - I = create_sqmatrix(0); - - if (constraint) - constraint(Y, constraint_data); - - evectmatrix_XtX(U, Y, S3); - CHECK(sqmatrix_invert(U, 1, S3), "singular YtY at start"); - sqmatrix_sqrt(S2, U, S3); /* S2 = 1/sqrt(Yt*Y) */ - evectmatrix_XeYS(V[0], Y, S2, 1); /* V[0] = orthonormalize Y */ - - do { - real E; - int itarget, i; - - A(V[ibasis], AV[ibasis], Adata, 0, Y); - - q = Y.p * (ibasis + 1); - sqmatrix_resize(&VAV, q, 1); - sqmatrix_resize(&S, q, 0); - sqmatrix_resize(&Swork, q, 0); - - for (i = 0; i <= ibasis; ++i) { - evectmatrixXtY_sub(VAV, Y.p * (q * i + ibasis), - V[i], AV[ibasis], S3); - } - sqmatrix_copy_upper2full(S, VAV); - - sqmatrix_eigensolve(S, eigenvals2, Swork); - - /* find index itarget of start of "window" around the - target frequency : */ - if (target == 0.0) /* not attempting targeted eigensolver */ - itarget = 0; - else { - /* note that this technique seems to have convergence trouble */ - for (itarget = 0; itarget + Y.p < q && - fabs(target - eigenvals2[itarget]) > - fabs(target - eigenvals2[itarget + Y.p]); ++itarget) - ; - } - - for (E = 0.0, i = 0; i < Y.p; ++i) { - E += (eigenvals[i] = eigenvals2[itarget + i]); - } - mpi_assert_equal(E); - - /* compute Y = best eigenvectors */ - for (i = 0; i <= ibasis; ++i) { - evectmatrix_aXpbYS_sub(i ? 1.0 : 0.0, Y, - 1.0, V[i], - S, itarget * q + Y.p * i, 1); - } - - if (iteration > 0 && mpi_is_master() && - ((flags & EIGS_VERBOSE) || - MPIGLUE_CLOCK_DIFF(MPIGLUE_CLOCK, prev_feedback_time) - > FEEDBACK_TIME)) { - if (mpb_verbosity >= 2) { - printf(" iteration %4d: " - "trace = %0.16g (%g%% change)\n", iteration, E, - 200.0 * fabs(E - prev_E) / (fabs(E) + fabs(prev_E))); - fflush(stdout); /* make sure output appears */ - } - prev_feedback_time = MPIGLUE_CLOCK; /* reset feedback clock */ - } - - if (iteration > 0 && - fabs(E - prev_E) < tolerance * 0.5 * (fabs(E) + - fabs(prev_E) + 1e-7)) - break; /* convergence! hooray! */ - - /* compute new directions from residual & update basis: */ - { - int ibasis2 = (ibasis + 1) % nbasis; - - /* compute V[ibasis2] = AY */ + CHECK(nWork >= 4, "not enough workspace"); + + nbasis = nWork / 2; + V = Work; + AV = Work + nbasis; + + q = Y.p * nbasis; + VAV = create_sqmatrix(q); + S = create_sqmatrix(q); + Swork = create_sqmatrix(q); + + sqmatrix_resize(&VAV, 0, 0); + sqmatrix_resize(&S, 0, 0); + sqmatrix_resize(&Swork, 0, 0); + + CHK_MALLOC(eigenvals2, real, q); + + U = create_sqmatrix(Y.p); + S2 = create_sqmatrix(Y.p); + S3 = create_sqmatrix(Y.p); + + I = create_sqmatrix(0); + + if (constraint) constraint(Y, constraint_data); + + evectmatrix_XtX(U, Y, S3); + CHECK(sqmatrix_invert(U, 1, S3), "singular YtY at start"); + sqmatrix_sqrt(S2, U, S3); /* S2 = 1/sqrt(Yt*Y) */ + evectmatrix_XeYS(V[0], Y, S2, 1); /* V[0] = orthonormalize Y */ + + do { + real E; + int itarget, i; + + A(V[ibasis], AV[ibasis], Adata, 0, Y); + + q = Y.p * (ibasis + 1); + sqmatrix_resize(&VAV, q, 1); + sqmatrix_resize(&S, q, 0); + sqmatrix_resize(&Swork, q, 0); + + for (i = 0; i <= ibasis; ++i) { + evectmatrixXtY_sub(VAV, Y.p * (q * i + ibasis), V[i], AV[ibasis], S3); + } + sqmatrix_copy_upper2full(S, VAV); + + sqmatrix_eigensolve(S, eigenvals2, Swork); + + /* find index itarget of start of "window" around the + target frequency : */ + if (target == 0.0) /* not attempting targeted eigensolver */ + itarget = 0; + else { + /* note that this technique seems to have convergence trouble */ + for (itarget = 0; itarget + Y.p < q && fabs(target - eigenvals2[itarget]) > + fabs(target - eigenvals2[itarget + Y.p]); + ++itarget) + ; + } + + for (E = 0.0, i = 0; i < Y.p; ++i) { + E += (eigenvals[i] = eigenvals2[itarget + i]); + } + mpi_assert_equal(E); + + /* compute Y = best eigenvectors */ + for (i = 0; i <= ibasis; ++i) { + evectmatrix_aXpbYS_sub(i ? 1.0 : 0.0, Y, 1.0, V[i], S, itarget * q + Y.p * i, 1); + } + + if (iteration > 0 && mpi_is_master() && + ((flags & EIGS_VERBOSE) || + MPIGLUE_CLOCK_DIFF(MPIGLUE_CLOCK, prev_feedback_time) > FEEDBACK_TIME)) { + if (mpb_verbosity >= 2) { + printf(" iteration %4d: " + "trace = %0.16g (%g%% change)\n", + iteration, E, 200.0 * fabs(E - prev_E) / (fabs(E) + fabs(prev_E))); + fflush(stdout); /* make sure output appears */ + } + prev_feedback_time = MPIGLUE_CLOCK; /* reset feedback clock */ + } + + if (iteration > 0 && fabs(E - prev_E) < tolerance * 0.5 * (fabs(E) + fabs(prev_E) + 1e-7)) + break; /* convergence! hooray! */ + + /* compute new directions from residual & update basis: */ + { + int ibasis2 = (ibasis + 1) % nbasis; + + /* compute V[ibasis2] = AY */ #if 1 - for (i = 0; i <= ibasis; ++i) { - evectmatrix_aXpbYS_sub(i ? 1.0 : 0.0, V[ibasis2], - 1.0, AV[i], - S, itarget * q + Y.p * i, 1); - } + for (i = 0; i <= ibasis; ++i) { + evectmatrix_aXpbYS_sub(i ? 1.0 : 0.0, V[ibasis2], 1.0, AV[i], S, itarget * q + Y.p * i, 1); + } #else - A(Y, V[ibasis2], Adata, 1, Y); + A(Y, V[ibasis2], Adata, 1, Y); #endif - /* handle restart case: */ - if (ibasis2 == 0) { - evectmatrix_copy(AV[0], V[0]); - evectmatrix_copy(V[0], Y); - sqmatrix_resize(&VAV, Y.p, 0); - evectmatrix_XtY(VAV, V[0], AV[0], S3); - ibasis2 = 1; - evectmatrix_copy(V[ibasis2], AV[0]); - } - - /* V[ibasis2] = residual = AY - Y * eigenvals */ - matrix_XpaY_diag_real(V[ibasis2].data, - -1.0, Y.data, - eigenvals, Y.n, Y.p); - - /* AV[ibasis2] = precondition V[ibasis2]: */ - if (K != NULL) - K(V[ibasis2], AV[ibasis2], Kdata, Y, eigenvals, I); - else - evectmatrix_copy(AV[ibasis2], V[ibasis2]); - - /* project by the constraints, if any: */ - if (constraint) - constraint(AV[ibasis2], constraint_data); - - /* orthogonalize against previous V: */ - for (i = 0; i < ibasis2; ++i) { - evectmatrix_XtY(U, V[i], AV[ibasis2], S3); - evectmatrix_XpaYS(AV[ibasis2], -1.0, V[i], U, 0); - } - - /* orthonormalize within itself: */ - evectmatrix_XtX(U, AV[ibasis2], S3); - CHECK(sqmatrix_invert(U, 1, S3), "non-independent AV subspace"); - sqmatrix_sqrt(S2, U, S3); - evectmatrix_XeYS(V[ibasis2], AV[ibasis2], S2, 1); - - ibasis = ibasis2; - } - - prev_E = E; - } while (++iteration < EIGENSOLVER_MAX_ITERATIONS); - - CHECK(iteration < EIGENSOLVER_MAX_ITERATIONS, - "failure to converge after " - STRINGIZE(EIGENSOLVER_MAX_ITERATIONS) - " iterations"); - - evectmatrix_XtX(U, Y, S3); - CHECK(sqmatrix_invert(U, 1, S3), "singular YtY at end"); - eigensolver_get_eigenvals_aux(Y, eigenvals, A, Adata, - V[0], AV[0], U, S3, S2); - - free(eigenvals2); - - destroy_sqmatrix(VAV); - destroy_sqmatrix(S); - destroy_sqmatrix(Swork); - destroy_sqmatrix(U); - destroy_sqmatrix(S2); - destroy_sqmatrix(S3); - destroy_sqmatrix(I); - - *num_iterations = iteration; + /* handle restart case: */ + if (ibasis2 == 0) { + evectmatrix_copy(AV[0], V[0]); + evectmatrix_copy(V[0], Y); + sqmatrix_resize(&VAV, Y.p, 0); + evectmatrix_XtY(VAV, V[0], AV[0], S3); + ibasis2 = 1; + evectmatrix_copy(V[ibasis2], AV[0]); + } + + /* V[ibasis2] = residual = AY - Y * eigenvals */ + matrix_XpaY_diag_real(V[ibasis2].data, -1.0, Y.data, eigenvals, Y.n, Y.p); + + /* AV[ibasis2] = precondition V[ibasis2]: */ + if (K != NULL) + K(V[ibasis2], AV[ibasis2], Kdata, Y, eigenvals, I); + else + evectmatrix_copy(AV[ibasis2], V[ibasis2]); + + /* project by the constraints, if any: */ + if (constraint) constraint(AV[ibasis2], constraint_data); + + /* orthogonalize against previous V: */ + for (i = 0; i < ibasis2; ++i) { + evectmatrix_XtY(U, V[i], AV[ibasis2], S3); + evectmatrix_XpaYS(AV[ibasis2], -1.0, V[i], U, 0); + } + + /* orthonormalize within itself: */ + evectmatrix_XtX(U, AV[ibasis2], S3); + CHECK(sqmatrix_invert(U, 1, S3), "non-independent AV subspace"); + sqmatrix_sqrt(S2, U, S3); + evectmatrix_XeYS(V[ibasis2], AV[ibasis2], S2, 1); + + ibasis = ibasis2; + } + + prev_E = E; + } while (++iteration < EIGENSOLVER_MAX_ITERATIONS); + + CHECK(iteration < EIGENSOLVER_MAX_ITERATIONS, + "failure to converge after " STRINGIZE(EIGENSOLVER_MAX_ITERATIONS) " iterations"); + + evectmatrix_XtX(U, Y, S3); + CHECK(sqmatrix_invert(U, 1, S3), "singular YtY at end"); + eigensolver_get_eigenvals_aux(Y, eigenvals, A, Adata, V[0], AV[0], U, S3, S2); + + free(eigenvals2); + + destroy_sqmatrix(VAV); + destroy_sqmatrix(S); + destroy_sqmatrix(Swork); + destroy_sqmatrix(U); + destroy_sqmatrix(S2); + destroy_sqmatrix(S3); + destroy_sqmatrix(I); + + *num_iterations = iteration; } diff --git a/src/matrices/eigensolver_utils.c b/src/matrices/eigensolver_utils.c index 19016a47..0b503c96 100644 --- a/src/matrices/eigensolver_utils.c +++ b/src/matrices/eigensolver_utils.c @@ -29,41 +29,35 @@ /**************************************************************************/ -void eigensolver_get_eigenvals_aux(evectmatrix Y, real *eigenvals, - evectoperator A, void *Adata, - evectmatrix Work1, evectmatrix Work2, - sqmatrix U, sqmatrix Usqrt, - sqmatrix Uwork) -{ - sqmatrix_sqrt(Usqrt, U, Uwork); /* Usqrt = 1/sqrt(Yt*Y) */ - evectmatrix_XeYS(Work1, Y, Usqrt, 1); /* Work1 = orthonormalize(Y) */ - - A(Work1, Work2, Adata, 1, Y); /* Work2 = A Work1; Y is scratch */ - evectmatrix_XtY(U, Work1, Work2, Uwork); /* U = Work1 * A * Work1 */ - - sqmatrix_eigensolve(U, eigenvals, Uwork); - evectmatrix_XeYS(Y, Work1, U, 1); +void eigensolver_get_eigenvals_aux(evectmatrix Y, real *eigenvals, evectoperator A, void *Adata, + evectmatrix Work1, evectmatrix Work2, sqmatrix U, sqmatrix Usqrt, + sqmatrix Uwork) { + sqmatrix_sqrt(Usqrt, U, Uwork); /* Usqrt = 1/sqrt(Yt*Y) */ + evectmatrix_XeYS(Work1, Y, Usqrt, 1); /* Work1 = orthonormalize(Y) */ + + A(Work1, Work2, Adata, 1, Y); /* Work2 = A Work1; Y is scratch */ + evectmatrix_XtY(U, Work1, Work2, Uwork); /* U = Work1 * A * Work1 */ + + sqmatrix_eigensolve(U, eigenvals, Uwork); + evectmatrix_XeYS(Y, Work1, U, 1); } -void eigensolver_get_eigenvals(evectmatrix Y, real *eigenvals, - evectoperator A, void *Adata, - evectmatrix Work1, evectmatrix Work2) -{ - sqmatrix U, Usqrt, Uwork; - - U = create_sqmatrix(Y.p); - Usqrt = create_sqmatrix(Y.p); - Uwork = create_sqmatrix(Y.p); - - evectmatrix_XtX(U, Y, Uwork); - sqmatrix_invert(U, 1, Uwork); - - eigensolver_get_eigenvals_aux(Y, eigenvals, A, Adata, Work1, Work2, - U, Usqrt, Uwork); - - destroy_sqmatrix(U); - destroy_sqmatrix(Usqrt); - destroy_sqmatrix(Uwork); +void eigensolver_get_eigenvals(evectmatrix Y, real *eigenvals, evectoperator A, void *Adata, + evectmatrix Work1, evectmatrix Work2) { + sqmatrix U, Usqrt, Uwork; + + U = create_sqmatrix(Y.p); + Usqrt = create_sqmatrix(Y.p); + Uwork = create_sqmatrix(Y.p); + + evectmatrix_XtX(U, Y, Uwork); + sqmatrix_invert(U, 1, Uwork); + + eigensolver_get_eigenvals_aux(Y, eigenvals, A, Adata, Work1, Work2, U, Usqrt, Uwork); + + destroy_sqmatrix(U); + destroy_sqmatrix(Usqrt); + destroy_sqmatrix(Uwork); } /**************************************************************************/ @@ -71,38 +65,31 @@ void eigensolver_get_eigenvals(evectmatrix Y, real *eigenvals, /* Subroutines for chaining constraints, to make it easy to pass multiple constraint functions to the eigensolver: */ -evectconstraint_chain *evect_add_constraint(evectconstraint_chain *constraints, - evectconstraint C, - void *constraint_data) -{ - evectconstraint_chain *new_constraints; +evectconstraint_chain *evect_add_constraint(evectconstraint_chain *constraints, evectconstraint C, + void *constraint_data) { + evectconstraint_chain *new_constraints; - CHK_MALLOC(new_constraints, evectconstraint_chain, 1); + CHK_MALLOC(new_constraints, evectconstraint_chain, 1); - new_constraints->C = C; - new_constraints->constraint_data = constraint_data; - new_constraints->next = constraints; - return new_constraints; + new_constraints->C = C; + new_constraints->constraint_data = constraint_data; + new_constraints->next = constraints; + return new_constraints; } -void evect_destroy_constraints(evectconstraint_chain *constraints) -{ - while (constraints) { - evectconstraint_chain *cur_constraint = constraints; - constraints = constraints->next; - free(cur_constraint); - } +void evect_destroy_constraints(evectconstraint_chain *constraints) { + while (constraints) { + evectconstraint_chain *cur_constraint = constraints; + constraints = constraints->next; + free(cur_constraint); + } } -void evectconstraint_chain_func(evectmatrix X, void *data) -{ - evectconstraint_chain *constraints = (evectconstraint_chain *) data; +void evectconstraint_chain_func(evectmatrix X, void *data) { + evectconstraint_chain *constraints = (evectconstraint_chain *)data; - while (constraints) { - if (constraints->C) - constraints->C(X, constraints->constraint_data); - constraints = constraints->next; - } + while (constraints) { + if (constraints->C) constraints->C(X, constraints->constraint_data); + constraints = constraints->next; + } } - - diff --git a/src/matrices/evectmatrix.c b/src/matrices/evectmatrix.c index b7434328..7c7adef0 100644 --- a/src/matrices/evectmatrix.c +++ b/src/matrices/evectmatrix.c @@ -34,30 +34,26 @@ double evectmatrix_flops = 0; (X, Y: evectmatrix, S: sqmatrix) */ /* X = Y */ -void evectmatrix_copy(evectmatrix X, evectmatrix Y) -{ - CHECK(X.n == Y.n && X.p == Y.p, "arrays not conformant"); +void evectmatrix_copy(evectmatrix X, evectmatrix Y) { + CHECK(X.n == Y.n && X.p == Y.p, "arrays not conformant"); - blasglue_copy(X.n * X.p, Y.data, 1, X.data, 1); + blasglue_copy(X.n * X.p, Y.data, 1, X.data, 1); } /* set p selected columns of X to those in Y, starting at ix and iy. */ -void evectmatrix_copy_slice(evectmatrix X, evectmatrix Y, - int ix, int iy, int p) -{ - CHECK(ix + p <= X.p && iy + p <= Y.p && ix >= 0 && iy >= 0 && X.n == Y.n, - "invalid arguments to evectmatrix_copy_slice"); - - if (ix == 0 && iy == 0 && p == X.p && p == Y.p) - evectmatrix_copy(X, Y); - else if (p == 1) - blasglue_copy(X.n, Y.data + iy, Y.p, X.data + ix, X.p); - else { - int i; - for (i = 0; i < X.n; ++i) - blasglue_copy(p, Y.data + iy + i * Y.p, 1, - X.data + ix + i * X.p, 1); - } +void evectmatrix_copy_slice(evectmatrix X, evectmatrix Y, int ix, int iy, int p) { + CHECK(ix + p <= X.p && iy + p <= Y.p && ix >= 0 && iy >= 0 && X.n == Y.n, + "invalid arguments to evectmatrix_copy_slice"); + + if (ix == 0 && iy == 0 && p == X.p && p == Y.p) + evectmatrix_copy(X, Y); + else if (p == 1) + blasglue_copy(X.n, Y.data + iy, Y.p, X.data + ix, X.p); + else { + int i; + for (i = 0; i < X.n; ++i) + blasglue_copy(p, Y.data + iy + i * Y.p, 1, X.data + ix + i * X.p, 1); + } } /* Resize A from its current size to an nxp matrix, assuming that @@ -65,214 +61,182 @@ void evectmatrix_copy_slice(evectmatrix X, evectmatrix Y, If preserve_data is nonzero, copies the existing data in A (or a subset of it, if the matrix is shrinking) to the corresponding entries of the resized matrix. */ -void evectmatrix_resize(evectmatrix *A, int p, short preserve_data) -{ - CHECK(p <= A->alloc_p, "tried to resize beyond allocated limit"); - - if (preserve_data) { - int i, j; - - if (p < A->p) { - for (i = 0; i < A->n; ++i) - for (j = 0; j < p; ++j) - A->data[i*p + j] = A->data[i*A->p + j]; - } - else { - for (i = A->n-1; i >= 0; --i) - for (j = A->p-1; j >= 0; --j) - A->data[i*p + j] = A->data[i*A->p + j]; - } - } - - A->p = p; +void evectmatrix_resize(evectmatrix *A, int p, short preserve_data) { + CHECK(p <= A->alloc_p, "tried to resize beyond allocated limit"); + + if (preserve_data) { + int i, j; + + if (p < A->p) { + for (i = 0; i < A->n; ++i) + for (j = 0; j < p; ++j) + A->data[i * p + j] = A->data[i * A->p + j]; + } + else { + for (i = A->n - 1; i >= 0; --i) + for (j = A->p - 1; j >= 0; --j) + A->data[i * p + j] = A->data[i * A->p + j]; + } + } + + A->p = p; } /* compute X = a*X + b*Y; X and Y may be equal. */ -void evectmatrix_aXpbY(real a, evectmatrix X, real b, evectmatrix Y) -{ - CHECK(X.n == Y.n && X.p == Y.p, "arrays not conformant"); - - if (a != 1.0) - blasglue_rscal(X.n * X.p, a, X.data, 1); - - blasglue_axpy(X.n * X.p, b, Y.data, 1, X.data, 1); - evectmatrix_flops += X.N * X.c * X.p * 3; +void evectmatrix_aXpbY(real a, evectmatrix X, real b, evectmatrix Y) { + CHECK(X.n == Y.n && X.p == Y.p, "arrays not conformant"); + + if (a != 1.0) blasglue_rscal(X.n * X.p, a, X.data, 1); + + blasglue_axpy(X.n * X.p, b, Y.data, 1, X.data, 1); + evectmatrix_flops += X.N * X.c * X.p * 3; } /* Compute X = a*X + b*Y*S. Instead of using the entire S matrix, however, we use only a Y.p x Y.p submatrix, beginning at the element indexed by Soffset. If sdagger != 0, then the adjoint of the submatrix is used instead of the submatrix. */ -void evectmatrix_aXpbYS_sub(real a, evectmatrix X, real b, evectmatrix Y, - sqmatrix S, int Soffset, short sdagger) -{ - if (S.p == 0) /* we treat the S.p == 0 case as if S were the identity */ - evectmatrix_aXpbY(a, X, b, Y); - else { - CHECK(X.n == Y.n && X.p == Y.p && X.p <= S.p, - "arrays not conformant"); - CHECK(Soffset + (Y.p-1)*S.p + Y.p <= S.p*S.p, - "submatrix exceeds matrix bounds"); - blasglue_gemm('N', sdagger ? 'C' : 'N', X.n, X.p, X.p, - b, Y.data, Y.p, S.data + Soffset, S.p, - a, X.data, X.p); - evectmatrix_flops += X.N * X.c * X.p * (3 + 2 * X.p); - } +void evectmatrix_aXpbYS_sub(real a, evectmatrix X, real b, evectmatrix Y, sqmatrix S, int Soffset, + short sdagger) { + if (S.p == 0) /* we treat the S.p == 0 case as if S were the identity */ + evectmatrix_aXpbY(a, X, b, Y); + else { + CHECK(X.n == Y.n && X.p == Y.p && X.p <= S.p, "arrays not conformant"); + CHECK(Soffset + (Y.p - 1) * S.p + Y.p <= S.p * S.p, "submatrix exceeds matrix bounds"); + blasglue_gemm('N', sdagger ? 'C' : 'N', X.n, X.p, X.p, b, Y.data, Y.p, S.data + Soffset, S.p, a, + X.data, X.p); + evectmatrix_flops += X.N * X.c * X.p * (3 + 2 * X.p); + } } /* compute X = YS. If sherm != 0, then S is assumed to be Hermitian. This can be used to make the multiplication more efficient. */ -void evectmatrix_XeYS(evectmatrix X, evectmatrix Y, sqmatrix S, short sherm) -{ - CHECK(S.p == 0 || S.p == Y.p, "arrays not conformant"); - evectmatrix_aXpbYS_sub(0.0, X, 1.0, Y, S, 0, sherm); +void evectmatrix_XeYS(evectmatrix X, evectmatrix Y, sqmatrix S, short sherm) { + CHECK(S.p == 0 || S.p == Y.p, "arrays not conformant"); + evectmatrix_aXpbYS_sub(0.0, X, 1.0, Y, S, 0, sherm); } /* compute X += a Y * S. If sdagger != 0, then St is used instead of S. */ -void evectmatrix_XpaYS(evectmatrix X, real a, evectmatrix Y, - sqmatrix S, short sdagger) -{ - CHECK(S.p == 0 || S.p == Y.p, "arrays not conformant"); - evectmatrix_aXpbYS_sub(1.0, X, a, Y, S, 0, sdagger); +void evectmatrix_XpaYS(evectmatrix X, real a, evectmatrix Y, sqmatrix S, short sdagger) { + CHECK(S.p == 0 || S.p == Y.p, "arrays not conformant"); + evectmatrix_aXpbYS_sub(1.0, X, a, Y, S, 0, sdagger); } /* compute U = adjoint(X) * X, with S a scratch matrix. */ -void evectmatrix_XtX(sqmatrix U, evectmatrix X, sqmatrix S) -{ - CHECK(X.p == U.p && U.p <= S.alloc_p, "matrices not conformant"); - -/* - blasglue_gemm('C', 'N', X.p, X.p, X.n, - 1.0, X.data, X.p, X.data, X.p, 0.0, S.data, U.p); -*/ - - /* take advantage of the fact that U is Hermitian and only write - out the upper triangle of the matrix */ - memset(S.data, 0, sizeof(scalar) * (U.p * U.p)); - blasglue_herk('U', 'C', X.p, X.n, 1.0, X.data, X.p, 0.0, S.data, U.p); - evectmatrix_flops += X.N * X.c * X.p * (X.p - 1); - - /* Now, copy the conjugate of the upper half onto the lower half of S */ - { - int i, j; - - for (i = 0; i < U.p; ++i) - for (j = i + 1; j < U.p; ++j) { - ASSIGN_CONJ(S.data[j * U.p + i], S.data[i * U.p + j]); - } - } - - mpi_allreduce(S.data, U.data, U.p * U.p * SCALAR_NUMVALS, - real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm); +void evectmatrix_XtX(sqmatrix U, evectmatrix X, sqmatrix S) { + CHECK(X.p == U.p && U.p <= S.alloc_p, "matrices not conformant"); + + /* + blasglue_gemm('C', 'N', X.p, X.p, X.n, + 1.0, X.data, X.p, X.data, X.p, 0.0, S.data, U.p); + */ + + /* take advantage of the fact that U is Hermitian and only write + out the upper triangle of the matrix */ + memset(S.data, 0, sizeof(scalar) * (U.p * U.p)); + blasglue_herk('U', 'C', X.p, X.n, 1.0, X.data, X.p, 0.0, S.data, U.p); + evectmatrix_flops += X.N * X.c * X.p * (X.p - 1); + + /* Now, copy the conjugate of the upper half onto the lower half of S */ + { + int i, j; + + for (i = 0; i < U.p; ++i) + for (j = i + 1; j < U.p; ++j) { + ASSIGN_CONJ(S.data[j * U.p + i], S.data[i * U.p + j]); + } + } + + mpi_allreduce(S.data, U.data, U.p * U.p * SCALAR_NUMVALS, real, SCALAR_MPI_TYPE, MPI_SUM, + mpb_comm); } /* Dot p selected columns of X with q in Y, starting at ix and iy. - Stores the result in U, starting at column iu, with + Stores the result in U, starting at column iu, with S1 and S2 as scratch matrices. */ -void evectmatrix_XtY_slice2(sqmatrix U, evectmatrix X, evectmatrix Y, - int ix, int iy, int p, int q, int iu, - sqmatrix S1, sqmatrix S2) -{ - int i, j; - CHECK(ix + p <= X.p && iy + q <= Y.p && ix >= 0 && iy >= 0 && X.n == Y.n - && p == U.p && q <= p && p <= S1.alloc_p && p <= S2.alloc_p, "invalid arguments to XtY_slice2"); - - memset(S1.data, 0, sizeof(scalar) * (U.p * U.p)); - blasglue_gemm('C', 'N', p, q, X.n, - 1.0, X.data + ix, X.p, Y.data + iy, Y.p, 0.0, - S1.data, q); - evectmatrix_flops += X.N * X.c * q * (2*p); - - mpi_allreduce(S1.data, S2.data, p * q * SCALAR_NUMVALS, - real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm); - for (i = 0; i < p; ++i) - for (j = 0; j < q; ++j) - U.data[i*p + j + iu] = S2.data[i*q + j]; +void evectmatrix_XtY_slice2(sqmatrix U, evectmatrix X, evectmatrix Y, int ix, int iy, int p, int q, + int iu, sqmatrix S1, sqmatrix S2) { + int i, j; + CHECK(ix + p <= X.p && iy + q <= Y.p && ix >= 0 && iy >= 0 && X.n == Y.n && p == U.p && q <= p && + p <= S1.alloc_p && p <= S2.alloc_p, + "invalid arguments to XtY_slice2"); + + memset(S1.data, 0, sizeof(scalar) * (U.p * U.p)); + blasglue_gemm('C', 'N', p, q, X.n, 1.0, X.data + ix, X.p, Y.data + iy, Y.p, 0.0, S1.data, q); + evectmatrix_flops += X.N * X.c * q * (2 * p); + + mpi_allreduce(S1.data, S2.data, p * q * SCALAR_NUMVALS, real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm); + for (i = 0; i < p; ++i) + for (j = 0; j < q; ++j) + U.data[i * p + j + iu] = S2.data[i * q + j]; } /* Dot p selected columns of X with those in Y, starting at ix and iy. Stores the result in U, with S a scratch matrix. */ -void evectmatrix_XtY_slice(sqmatrix U, evectmatrix X, evectmatrix Y, - int ix, int iy, int p, sqmatrix S) -{ - evectmatrix_XtY_slice2(U, X, Y, ix, iy, p, p, 0, S, U); +void evectmatrix_XtY_slice(sqmatrix U, evectmatrix X, evectmatrix Y, int ix, int iy, int p, + sqmatrix S) { + evectmatrix_XtY_slice2(U, X, Y, ix, iy, p, p, 0, S, U); } /* compute U = adjoint(X) * Y, with S a scratch matrix. */ -void evectmatrix_XtY(sqmatrix U, evectmatrix X, evectmatrix Y, sqmatrix S) -{ - CHECK(X.p == Y.p, "matrices not conformant"); - - evectmatrix_XtY_slice(U, X, Y, 0, 0, X.p, S); +void evectmatrix_XtY(sqmatrix U, evectmatrix X, evectmatrix Y, sqmatrix S) { + CHECK(X.p == Y.p, "matrices not conformant"); + + evectmatrix_XtY_slice(U, X, Y, 0, 0, X.p, S); } /* Compute adjoint(X) * Y, storing the result in U at an offset Uoffset with the matrix (i.e. as a submatrix within U). S is a scratch matrix (at least Y.p by Y.p). */ -void evectmatrixXtY_sub(sqmatrix U, int Uoffset, evectmatrix X, evectmatrix Y, - sqmatrix S) -{ - int i; - - CHECK(X.p == Y.p && X.n == Y.n && U.p >= Y.p, "matrices not conformant"); - CHECK(Uoffset + (Y.p-1)*U.p + Y.p <= U.p*U.p, - "submatrix exceeds matrix bounds"); - CHECK(Y.p <= S.alloc_p, "scratch matrix too small"); - - memset(S.data, 0, sizeof(scalar) * (Y.p * Y.p)); - blasglue_gemm('C', 'N', X.p, X.p, X.n, - 1.0, X.data, X.p, Y.data, Y.p, 0.0, S.data, Y.p); - evectmatrix_flops += X.N * X.c * X.p * (2*X.p); - - for (i = 0; i < Y.p; ++i) { - mpi_allreduce(S.data + i*Y.p, U.data + Uoffset + i*U.p, - Y.p * SCALAR_NUMVALS, - real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm); - } +void evectmatrixXtY_sub(sqmatrix U, int Uoffset, evectmatrix X, evectmatrix Y, sqmatrix S) { + int i; + + CHECK(X.p == Y.p && X.n == Y.n && U.p >= Y.p, "matrices not conformant"); + CHECK(Uoffset + (Y.p - 1) * U.p + Y.p <= U.p * U.p, "submatrix exceeds matrix bounds"); + CHECK(Y.p <= S.alloc_p, "scratch matrix too small"); + + memset(S.data, 0, sizeof(scalar) * (Y.p * Y.p)); + blasglue_gemm('C', 'N', X.p, X.p, X.n, 1.0, X.data, X.p, Y.data, Y.p, 0.0, S.data, Y.p); + evectmatrix_flops += X.N * X.c * X.p * (2 * X.p); + + for (i = 0; i < Y.p; ++i) { + mpi_allreduce(S.data + i * Y.p, U.data + Uoffset + i * U.p, Y.p * SCALAR_NUMVALS, real, + SCALAR_MPI_TYPE, MPI_SUM, mpb_comm); + } } /* Compute only the diagonal elements of XtY, storing in diag (with scratch_diag a scratch array of the same size as diag). */ -void evectmatrix_XtY_diag(evectmatrix X, evectmatrix Y, scalar *diag, - scalar *scratch_diag) -{ - matrix_XtY_diag(X.data, Y.data, X.n, X.p, scratch_diag); - evectmatrix_flops += X.N * X.c * X.p * 2; - mpi_allreduce(scratch_diag, diag, X.p * SCALAR_NUMVALS, - real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm); +void evectmatrix_XtY_diag(evectmatrix X, evectmatrix Y, scalar *diag, scalar *scratch_diag) { + matrix_XtY_diag(X.data, Y.data, X.n, X.p, scratch_diag); + evectmatrix_flops += X.N * X.c * X.p * 2; + mpi_allreduce(scratch_diag, diag, X.p * SCALAR_NUMVALS, real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm); } /* As above, but only compute real parts of diagonal. */ -void evectmatrix_XtY_diag_real(evectmatrix X, evectmatrix Y, real *diag, - real *scratch_diag) -{ - matrix_XtY_diag_real(X.data, Y.data, X.n, X.p, scratch_diag); - evectmatrix_flops += X.N * X.c * X.p * (2*X.p); - mpi_allreduce(scratch_diag, diag, X.p, - real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm); +void evectmatrix_XtY_diag_real(evectmatrix X, evectmatrix Y, real *diag, real *scratch_diag) { + matrix_XtY_diag_real(X.data, Y.data, X.n, X.p, scratch_diag); + evectmatrix_flops += X.N * X.c * X.p * (2 * X.p); + mpi_allreduce(scratch_diag, diag, X.p, real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm); } /* As above, but compute only the diagonal elements of XtX. */ -void evectmatrix_XtX_diag_real(evectmatrix X, real *diag, real *scratch_diag) -{ - matrix_XtX_diag_real(X.data, X.n, X.p, scratch_diag); - evectmatrix_flops += X.N * X.c * X.p * (2*X.p); - mpi_allreduce(scratch_diag, diag, X.p, - real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm); +void evectmatrix_XtX_diag_real(evectmatrix X, real *diag, real *scratch_diag) { + matrix_XtX_diag_real(X.data, X.n, X.p, scratch_diag); + evectmatrix_flops += X.N * X.c * X.p * (2 * X.p); + mpi_allreduce(scratch_diag, diag, X.p, real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm); } /* compute trace(adjoint(X) * Y) */ -scalar evectmatrix_traceXtY(evectmatrix X, evectmatrix Y) -{ - scalar trace, trace_scratch; +scalar evectmatrix_traceXtY(evectmatrix X, evectmatrix Y) { + scalar trace, trace_scratch; + + CHECK(X.p == Y.p && X.n == Y.n, "matrices not conformant"); - CHECK(X.p == Y.p && X.n == Y.n, "matrices not conformant"); - - trace_scratch = blasglue_dotc(X.n * X.p, X.data, 1, Y.data, 1); - evectmatrix_flops += X.N * X.c * X.p * (2*X.p) + X.p; + trace_scratch = blasglue_dotc(X.n * X.p, X.data, 1, Y.data, 1); + evectmatrix_flops += X.N * X.c * X.p * (2 * X.p) + X.p; - mpi_allreduce(&trace_scratch, &trace, SCALAR_NUMVALS, - real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm); + mpi_allreduce(&trace_scratch, &trace, SCALAR_NUMVALS, real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm); - return trace; + return trace; } diff --git a/src/matrices/linmin.c b/src/matrices/linmin.c index ef685966..921a0b4e 100644 --- a/src/matrices/linmin.c +++ b/src/matrices/linmin.c @@ -29,82 +29,72 @@ #define double linmin_real -extern int dcsrch(double *stp, double *f, double *g, - double *ftol, double *gtol, double *xtol, - char *task, - double *stpmin, double *stpmax, int *isave, - double *dsave); +extern int dcsrch(double *stp, double *f, double *g, double *ftol, double *gtol, double *xtol, + char *task, double *stpmin, double *stpmax, int *isave, double *dsave); /* Our line minimization algorithm is designed only for f(x) for x >= 0. If the minimum lies for negative x, we use reverse_func below to transform f(x) into f(-x). */ typedef struct { - linmin_func f; - void *f_data; + linmin_func f; + void *f_data; } linmin_func_data; -double reverse_func(double x, double *deriv, void *data) -{ - linmin_func_data *d = (linmin_func_data *) data; - double val; - val = d->f(-x, deriv, d->f_data); - *deriv = -*deriv; - return val; +double reverse_func(double x, double *deriv, void *data) { + linmin_func_data *d = (linmin_func_data *)data; + double val; + val = d->f(-x, deriv, d->f_data); + *deriv = -*deriv; + return val; } -double linmin(double *converged_f, double *converged_df, - double x_guess, double f_0, double df_0, - double f_tol, double df_tol, double x_tol, - double x_min, double x_max, - linmin_func f, void *f_data, int verbose) -{ - if (df_0 > 0) { /* minimum lies for negative x; transform to f(-x) */ - linmin_func_data d; - double x; - d.f = f; - d.f_data = f_data; - x = linmin(converged_f, converged_df, - -x_guess, f_0, -df_0, f_tol, df_tol, x_tol, - -x_min, -x_max, reverse_func, &d, verbose); - *converged_df = -*converged_df; - return(-x); - } - else if (df_0 == 0) { /* already at minimum! */ - *converged_f = f_0; - *converged_df = df_0; - return 0; - } - else { - char task[300] = "START"; - int isave[2]; - double dsave[13], x, f_x, df_x; - int iters = 0; +double linmin(double *converged_f, double *converged_df, double x_guess, double f_0, double df_0, + double f_tol, double df_tol, double x_tol, double x_min, double x_max, linmin_func f, + void *f_data, int verbose) { + if (df_0 > 0) { /* minimum lies for negative x; transform to f(-x) */ + linmin_func_data d; + double x; + d.f = f; + d.f_data = f_data; + x = linmin(converged_f, converged_df, -x_guess, f_0, -df_0, f_tol, df_tol, x_tol, -x_min, + -x_max, reverse_func, &d, verbose); + *converged_df = -*converged_df; + return (-x); + } + else if (df_0 == 0) { /* already at minimum! */ + *converged_f = f_0; + *converged_df = df_0; + return 0; + } + else { + char task[300] = "START"; + int isave[2]; + double dsave[13], x, f_x, df_x; + int iters = 0; - x = x_guess; - f_x = f_0; df_x = df_0; /* initially, pass in f and df at x=0 */ - dcsrch(&x, &f_x, &df_x, &f_tol, &df_tol, &x_tol, - task, &x_min, &x_max, isave, dsave); + x = x_guess; + f_x = f_0; + df_x = df_0; /* initially, pass in f and df at x=0 */ + dcsrch(&x, &f_x, &df_x, &f_tol, &df_tol, &x_tol, task, &x_min, &x_max, isave, dsave); - while (*task == 'F') { - f_x = f(x, &df_x, f_data); - mpi_assert_equal(x); - mpi_assert_equal(f_x); - ++iters; - dcsrch(&x, &f_x, &df_x, &f_tol, &df_tol, &x_tol, - task, &x_min, &x_max, isave, dsave); - } + while (*task == 'F') { + f_x = f(x, &df_x, f_data); + mpi_assert_equal(x); + mpi_assert_equal(f_x); + ++iters; + dcsrch(&x, &f_x, &df_x, &f_tol, &df_tol, &x_tol, task, &x_min, &x_max, isave, dsave); + } - if (*task != 'C') { /* not converged; warning or error */ - if ((verbose || mpb_verbosity >= 2 || *task == 'E') && mpb_verbosity >= 2) - mpi_one_fprintf(stderr, "linmin: %s\n", task); - CHECK(*task != 'E', "linmin failure"); - } + if (*task != 'C') { /* not converged; warning or error */ + if ((verbose || mpb_verbosity >= 2 || *task == 'E') && mpb_verbosity >= 2) + mpi_one_fprintf(stderr, "linmin: %s\n", task); + CHECK(*task != 'E', "linmin failure"); + } - if (verbose || mpb_verbosity >= 2) - mpi_one_printf(" linmin: converged after %d iterations.\n", - iters); + if (verbose || mpb_verbosity >= 2) + mpi_one_printf(" linmin: converged after %d iterations.\n", iters); - *converged_f = f_x; - *converged_df = df_x; - return x; - } + *converged_f = f_x; + *converged_df = df_x; + return x; + } } diff --git a/src/matrices/linmin.h b/src/matrices/linmin.h index fd25e087..44cf0cd9 100644 --- a/src/matrices/linmin.h +++ b/src/matrices/linmin.h @@ -24,12 +24,11 @@ typedef long double linmin_real; typedef double linmin_real; #endif -typedef linmin_real (*linmin_func) (linmin_real x, linmin_real *deriv, void *data); +typedef linmin_real (*linmin_func)(linmin_real x, linmin_real *deriv, void *data); -extern linmin_real linmin(linmin_real *converged_f, linmin_real *converged_df, - linmin_real x_guess, linmin_real f_0, linmin_real df_0, - linmin_real f_tol, linmin_real df_tol, linmin_real x_tol, - linmin_real x_min, linmin_real x_max, - linmin_func f, void *f_data, int verbose); +extern linmin_real linmin(linmin_real *converged_f, linmin_real *converged_df, linmin_real x_guess, + linmin_real f_0, linmin_real df_0, linmin_real f_tol, linmin_real df_tol, + linmin_real x_tol, linmin_real x_min, linmin_real x_max, linmin_func f, + void *f_data, int verbose); #endif /* LINMIN_H */ diff --git a/src/matrices/matrices.c b/src/matrices/matrices.c index f564426a..f7592831 100644 --- a/src/matrices/matrices.c +++ b/src/matrices/matrices.c @@ -25,183 +25,152 @@ /* Basic operations: allocation, deallocation, etcetera. */ -evectmatrix create_evectmatrix(int N, int c, int p, - int localN, int Nstart, int allocN) -{ - evectmatrix X; - - CHECK(localN <= N && allocN >= localN && Nstart < N, - "invalid N arguments"); - - X.N = N; - X.localN = localN; - X.Nstart = Nstart; - X.allocN = allocN; - X.c = c; - - X.n = localN * c; - X.alloc_p = X.p = p; - - if (allocN > 0) { - CHK_MALLOC(X.data, scalar, allocN * c * p); - } - else - X.data = NULL; - - return X; -} +evectmatrix create_evectmatrix(int N, int c, int p, int localN, int Nstart, int allocN) { + evectmatrix X; -void destroy_evectmatrix(evectmatrix X) -{ - free(X.data); -} + CHECK(localN <= N && allocN >= localN && Nstart < N, "invalid N arguments"); + + X.N = N; + X.localN = localN; + X.Nstart = Nstart; + X.allocN = allocN; + X.c = c; -sqmatrix create_sqmatrix(int p) -{ - sqmatrix X; - - X.alloc_p = X.p = p; - if (p > 0) { - CHK_MALLOC(X.data, scalar, p * p); - } - else - X.data = (scalar*) NULL; - return X; + X.n = localN * c; + X.alloc_p = X.p = p; + + if (allocN > 0) { CHK_MALLOC(X.data, scalar, allocN * c * p); } + else + X.data = NULL; + + return X; } -void destroy_sqmatrix(sqmatrix X) -{ - free(X.data); +void destroy_evectmatrix(evectmatrix X) { free(X.data); } + +sqmatrix create_sqmatrix(int p) { + sqmatrix X; + + X.alloc_p = X.p = p; + if (p > 0) { CHK_MALLOC(X.data, scalar, p * p); } + else + X.data = (scalar *)NULL; + return X; } +void destroy_sqmatrix(sqmatrix X) { free(X.data); } + /***********************************************************************/ /* a few general matrix operations for diagonal matrices; these will be used both by evectmatrix and sqmatrix routines: */ /* compute diag = diagonal elements of Xt * Y, where X and Y are n x p. */ -void matrix_XtY_diag(scalar *X, scalar *Y, int n, int p, scalar *diag) -{ - int i, j; - - for (j = 0; j < p; ++j) { - ASSIGN_ZERO(diag[j]); - } - - for (i = 0; i < n; ++i) - for (j = 0; j < p; ++j) { - ACCUMULATE_SUM_CONJ_MULT(diag[j], X[i*p+j], Y[i*p+j]); - } +void matrix_XtY_diag(scalar *X, scalar *Y, int n, int p, scalar *diag) { + int i, j; + + for (j = 0; j < p; ++j) { + ASSIGN_ZERO(diag[j]); + } + + for (i = 0; i < n; ++i) + for (j = 0; j < p; ++j) { + ACCUMULATE_SUM_CONJ_MULT(diag[j], X[i * p + j], Y[i * p + j]); + } } /* compute diag = diagonal elements of Re[Xt * Y], where X and Y are n x p. */ -void matrix_XtY_diag_real(scalar *X, scalar *Y, int n, int p, real *diag) -{ - int i, j; - - for (j = 0; j < p; ++j) { - diag[j] = 0; - } - - for (i = 0; i < n; ++i) - for (j = 0; j < p; ++j) { - diag[j] += (SCALAR_RE(X[i*p+j]) * SCALAR_RE(Y[i*p+j]) + - SCALAR_IM(X[i*p+j]) * SCALAR_IM(Y[i*p+j])); - } +void matrix_XtY_diag_real(scalar *X, scalar *Y, int n, int p, real *diag) { + int i, j; + + for (j = 0; j < p; ++j) { + diag[j] = 0; + } + + for (i = 0; i < n; ++i) + for (j = 0; j < p; ++j) { + diag[j] += (SCALAR_RE(X[i * p + j]) * SCALAR_RE(Y[i * p + j]) + + SCALAR_IM(X[i * p + j]) * SCALAR_IM(Y[i * p + j])); + } } /* compute diag = diagonal elements of Xt * X, where X is n x p. */ -void matrix_XtX_diag_real(scalar *X, int n, int p, real *diag) -{ - int i, j; - - for (j = 0; j < p; ++j) { - diag[j] = 0; - } - - for (i = 0; i < n; ++i) - for (j = 0; j < p; ++j) { - ACCUMULATE_SUM_SQ(diag[j], X[i*p+j]); - } +void matrix_XtX_diag_real(scalar *X, int n, int p, real *diag) { + int i, j; + + for (j = 0; j < p; ++j) { + diag[j] = 0; + } + + for (i = 0; i < n; ++i) + for (j = 0; j < p; ++j) { + ACCUMULATE_SUM_SQ(diag[j], X[i * p + j]); + } } /* compute X += a * Y * diag(diag), where X and Y are n x p */ -void matrix_XpaY_diag(scalar *X, real a, scalar *Y, - scalar *diag, int n, int p) -{ - int i, j; - - for (i = 0; i < n; ++i) { - for (j = 0; j < p; ++j) { - scalar c; - ASSIGN_MULT(c, Y[i*p+j], diag[j]); - ASSIGN_SCALAR(X[i*p+j], - SCALAR_RE(X[i*p+j]) + a * SCALAR_RE(c), - SCALAR_IM(X[i*p+j]) + a * SCALAR_IM(c)); - } - } +void matrix_XpaY_diag(scalar *X, real a, scalar *Y, scalar *diag, int n, int p) { + int i, j; + + for (i = 0; i < n; ++i) { + for (j = 0; j < p; ++j) { + scalar c; + ASSIGN_MULT(c, Y[i * p + j], diag[j]); + ASSIGN_SCALAR(X[i * p + j], SCALAR_RE(X[i * p + j]) + a * SCALAR_RE(c), + SCALAR_IM(X[i * p + j]) + a * SCALAR_IM(c)); + } + } } /* compute X += a * Y * diag(diag), where X and Y are n x p and diag is real */ -void matrix_XpaY_diag_real(scalar *X, real a, scalar *Y, - real *diag, int n, int p) -{ - int i, j; - - for (i = 0; i < n; ++i) { - for (j = 0; j < p; ++j) { - real d = a * diag[j]; - ASSIGN_SCALAR(X[i*p+j], - SCALAR_RE(X[i*p+j]) + d * SCALAR_RE(Y[i*p+j]), - SCALAR_IM(X[i*p+j]) + d * SCALAR_IM(Y[i*p+j])); - } - } +void matrix_XpaY_diag_real(scalar *X, real a, scalar *Y, real *diag, int n, int p) { + int i, j; + + for (i = 0; i < n; ++i) { + for (j = 0; j < p; ++j) { + real d = a * diag[j]; + ASSIGN_SCALAR(X[i * p + j], SCALAR_RE(X[i * p + j]) + d * SCALAR_RE(Y[i * p + j]), + SCALAR_IM(X[i * p + j]) + d * SCALAR_IM(Y[i * p + j])); + } + } } -/* compute X = X * diag1 + Y * diag2, where X and Y are n x p and +/* compute X = X * diag1 + Y * diag2, where X and Y are n x p and diag1 and diag2 are real diagonal matrices */ -void matrix_X_diag_real_pY_diag_real(scalar *X, real *diag1, - scalar *Y, real *diag2, int n, int p) -{ - int i, j; - - for (i = 0; i < n; ++i) { - for (j = 0; j < p; ++j) { - real d1 = diag1[j], d2 = diag2[j]; - ASSIGN_SCALAR(X[i*p+j], - d1 * SCALAR_RE(X[i*p+j]) + - d2 * SCALAR_RE(Y[i*p+j]), - d1 * SCALAR_IM(X[i*p+j]) + - d2 * SCALAR_IM(Y[i*p+j])); - } - } +void matrix_X_diag_real_pY_diag_real(scalar *X, real *diag1, scalar *Y, real *diag2, int n, int p) { + int i, j; + + for (i = 0; i < n; ++i) { + for (j = 0; j < p; ++j) { + real d1 = diag1[j], d2 = diag2[j]; + ASSIGN_SCALAR(X[i * p + j], d1 * SCALAR_RE(X[i * p + j]) + d2 * SCALAR_RE(Y[i * p + j]), + d1 * SCALAR_IM(X[i * p + j]) + d2 * SCALAR_IM(Y[i * p + j])); + } + } } /* compute Re [ trace A * diag(diag) ], where A is p by p. */ -real matrix_re_trace_A_diag_real(scalar *A, real *diag, int p) -{ - real trace = 0.0; - int i; - for (i = 0; i < p; ++i) - trace += SCALAR_RE(A[i*(p+1)]) * diag[i]; - return trace; +real matrix_re_trace_A_diag_real(scalar *A, real *diag, int p) { + real trace = 0.0; + int i; + for (i = 0; i < p; ++i) + trace += SCALAR_RE(A[i * (p + 1)]) * diag[i]; + return trace; } -scalar matrix_diag_trace(scalar *diag, int p) -{ - scalar trace = SCALAR_INIT_ZERO; - int i; - for (i = 0; i < p; ++i) { - ACCUMULATE_SUM(trace, diag[i]); - } - return trace; +scalar matrix_diag_trace(scalar *diag, int p) { + scalar trace = SCALAR_INIT_ZERO; + int i; + for (i = 0; i < p; ++i) { + ACCUMULATE_SUM(trace, diag[i]); + } + return trace; } -real matrix_diag_real_trace(real *diag, int p) -{ - real trace = 0.0; - int i; - for (i = 0; i < p; ++i) - trace += diag[i]; - return trace; +real matrix_diag_real_trace(real *diag, int p) { + real trace = 0.0; + int i; + for (i = 0; i < p; ++i) + trace += diag[i]; + return trace; } diff --git a/src/matrices/matrices.h b/src/matrices/matrices.h index 460275b8..e4319c2d 100644 --- a/src/matrices/matrices.h +++ b/src/matrices/matrices.h @@ -25,15 +25,15 @@ extern "C" { #endif /* __cplusplus */ typedef struct { - int N, localN, Nstart, allocN; - int c; - int n, p, alloc_p; - scalar *data; + int N, localN, Nstart, allocN; + int c; + int n, p, alloc_p; + scalar *data; } evectmatrix; typedef struct { - int p, alloc_p; - scalar *data; + int p, alloc_p; + scalar *data; } sqmatrix; /* try to keep track of flops, at least from evectmatrix multiplications */ @@ -41,8 +41,7 @@ extern double evectmatrix_flops; /* general creation/destruction operations: */ -extern evectmatrix create_evectmatrix(int N, int c, int p, - int localN, int Nstart, int allocN); +extern evectmatrix create_evectmatrix(int N, int c, int p, int localN, int Nstart, int allocN); extern void destroy_evectmatrix(evectmatrix X); extern sqmatrix create_sqmatrix(int p); extern void destroy_sqmatrix(sqmatrix X); @@ -50,16 +49,12 @@ extern void destroy_sqmatrix(sqmatrix X); /* diagonal matrix utils: */ extern void matrix_XtY_diag(scalar *X, scalar *Y, int n, int p, scalar *diag); -extern void matrix_XtY_diag_real(scalar *X, scalar *Y, int n, int p, - real *diag); +extern void matrix_XtY_diag_real(scalar *X, scalar *Y, int n, int p, real *diag); extern void matrix_XtX_diag_real(scalar *X, int n, int p, real *diag); -extern void matrix_XpaY_diag(scalar *X, real a, scalar *Y, - scalar *diag, int n, int p); -extern void matrix_XpaY_diag_real(scalar *X, real a, scalar *Y, - real *diag, int n, int p); -extern void matrix_X_diag_real_pY_diag_real(scalar *X, real *diag1, - scalar *Y, real *diag2, - int n, int p); +extern void matrix_XpaY_diag(scalar *X, real a, scalar *Y, scalar *diag, int n, int p); +extern void matrix_XpaY_diag_real(scalar *X, real a, scalar *Y, real *diag, int n, int p); +extern void matrix_X_diag_real_pY_diag_real(scalar *X, real *diag1, scalar *Y, real *diag2, int n, + int p); extern real matrix_re_trace_A_diag_real(scalar *A, real *diag, int p); extern scalar matrix_diag_trace(scalar *diag, int p); extern real matrix_diag_real_trace(real *diag, int p); @@ -68,32 +63,22 @@ extern real matrix_diag_real_trace(real *diag, int p); extern void evectmatrix_resize(evectmatrix *A, int p, short preserve_data); extern void evectmatrix_copy(evectmatrix X, evectmatrix Y); -extern void evectmatrix_copy_slice(evectmatrix X, evectmatrix Y, - int ix, int iy, int p); +extern void evectmatrix_copy_slice(evectmatrix X, evectmatrix Y, int ix, int iy, int p); extern void evectmatrix_aXpbY(real a, evectmatrix X, real b, evectmatrix Y); -extern void evectmatrix_aXpbYS_sub(real a, evectmatrix X, - real b, evectmatrix Y, - sqmatrix S, int Soffset, short sdagger); -extern void evectmatrix_XeYS(evectmatrix X, evectmatrix Y, - sqmatrix S, short sherm); -extern void evectmatrix_XpaYS(evectmatrix X, real a, evectmatrix Y, - sqmatrix S, short sdagger); +extern void evectmatrix_aXpbYS_sub(real a, evectmatrix X, real b, evectmatrix Y, sqmatrix S, + int Soffset, short sdagger); +extern void evectmatrix_XeYS(evectmatrix X, evectmatrix Y, sqmatrix S, short sherm); +extern void evectmatrix_XpaYS(evectmatrix X, real a, evectmatrix Y, sqmatrix S, short sdagger); extern void evectmatrix_XtX(sqmatrix U, evectmatrix X, sqmatrix S); -extern void evectmatrix_XtY(sqmatrix U, evectmatrix X, evectmatrix Y, - sqmatrix S); -extern void evectmatrix_XtY_slice(sqmatrix U, evectmatrix X, evectmatrix Y, - int ix, int iy, int p, sqmatrix S); -extern void evectmatrix_XtY_slice2(sqmatrix U, evectmatrix X, evectmatrix Y, - int ix, int iy, int p, int q, int iu, - sqmatrix S1, sqmatrix S2); -extern void evectmatrixXtY_sub(sqmatrix U, int Uoffset, - evectmatrix X, evectmatrix Y, sqmatrix S); -extern void evectmatrix_XtY_diag(evectmatrix X, evectmatrix Y, scalar *diag, - scalar *scratch_diag); -extern void evectmatrix_XtY_diag_real(evectmatrix X, evectmatrix Y, - real *diag, real *scratch_diag); -extern void evectmatrix_XtX_diag_real(evectmatrix X, real *diag, - real *scratch_diag); +extern void evectmatrix_XtY(sqmatrix U, evectmatrix X, evectmatrix Y, sqmatrix S); +extern void evectmatrix_XtY_slice(sqmatrix U, evectmatrix X, evectmatrix Y, int ix, int iy, int p, + sqmatrix S); +extern void evectmatrix_XtY_slice2(sqmatrix U, evectmatrix X, evectmatrix Y, int ix, int iy, int p, + int q, int iu, sqmatrix S1, sqmatrix S2); +extern void evectmatrixXtY_sub(sqmatrix U, int Uoffset, evectmatrix X, evectmatrix Y, sqmatrix S); +extern void evectmatrix_XtY_diag(evectmatrix X, evectmatrix Y, scalar *diag, scalar *scratch_diag); +extern void evectmatrix_XtY_diag_real(evectmatrix X, evectmatrix Y, real *diag, real *scratch_diag); +extern void evectmatrix_XtX_diag_real(evectmatrix X, real *diag, real *scratch_diag); extern scalar evectmatrix_traceXtY(evectmatrix X, evectmatrix Y); /* sqmatrix operations, defined in sqmatrix.c: */ @@ -105,21 +90,19 @@ extern void sqmatrix_copy_upper2full(sqmatrix F, sqmatrix U); extern void sqmatrix_symmetrize(sqmatrix Asym, sqmatrix A); extern scalar sqmatrix_trace(sqmatrix U); extern scalar sqmatrix_traceAtB(sqmatrix A, sqmatrix B); -extern void sqmatrix_AeBC(sqmatrix A, sqmatrix B, short bdagger, - sqmatrix C, short cdagger); -extern void sqmatrix_ApaBC(sqmatrix A, real a, sqmatrix B, short bdagger, - sqmatrix C, short cdagger); +extern void sqmatrix_AeBC(sqmatrix A, sqmatrix B, short bdagger, sqmatrix C, short cdagger); +extern void sqmatrix_ApaBC(sqmatrix A, real a, sqmatrix B, short bdagger, sqmatrix C, + short cdagger); extern void sqmatrix_ApaB(sqmatrix A, real a, sqmatrix B); extern void sqmatrix_aApbB(real a, sqmatrix A, real b, sqmatrix B); -extern int sqmatrix_invert(sqmatrix U, short positive_definite, - sqmatrix Work); +extern int sqmatrix_invert(sqmatrix U, short positive_definite, sqmatrix Work); extern void sqmatrix_eigensolve(sqmatrix U, real *eigenvals, sqmatrix W); extern void sqmatrix_gen_eigensolve(sqmatrix U, sqmatrix B, real *eigenvals, sqmatrix W); extern void sqmatrix_eigenvalues(sqmatrix A, scalar_complex *eigenvals); extern void sqmatrix_sqrt(sqmatrix Usqrt, sqmatrix U, sqmatrix W); #ifdef __cplusplus -} /* extern "C" */ +} /* extern "C" */ #endif /* __cplusplus */ #endif /* MATRICES_H */ diff --git a/src/matrices/minpack2-linmin.c b/src/matrices/minpack2-linmin.c index b3c129f6..a276e06d 100644 --- a/src/matrices/minpack2-linmin.c +++ b/src/matrices/minpack2-linmin.c @@ -23,15 +23,15 @@ pages and ftp sites for MINPACK-2 can be found at: http://www.mcs.anl.gov/~more/minpack-2/minpack-2.html - ftp://info.mcs.anl.gov/pub/MINPACK-2/ - http://www-fp.mcs.anl.gov/otc/minpack/summary.html + ftp://info.mcs.anl.gov/pub/MINPACK-2/ + http://www-fp.mcs.anl.gov/otc/minpack/summary.html It implements the line search algorithm described in: Jorge J. More and David J. Thuente, "Line search algorithms with guaranteed sufficient decrease," ACM Trans. on Mathematical Software, vol. 20, no. 3, pp. 286-307 (September 1994). - + The original code was under the copyright and license listed below, but Jorge J. More' graciously granted me permission to distribute under the terms of the GNU General Public License. @@ -46,24 +46,23 @@ * as to the suitability and operability of this software for any * purpose. It is provided "as is" without express or implied * warranty. - * + * * Use of this software for commercial purposes is expressly * prohibited without contacting. - * + * * Jorge J. More' * Mathematics and Computer Science Division * Argonne National Laboratory * 9700 S. Cass Ave. * Argonne, Illinois 60439-4844 * e-mail: more@mcs.anl.gov - * + * * Argonne National Laboratory with facilities in the states of * Illinois and Idaho, is owned by The United States Government, and * operated by the University of Chicago under provision of a contract * with the Department of Energy. */ - /* minpack2-linmin.f -- translated by f2c (version 19991025). C code cleaned up by Steven G. Johnson . */ @@ -86,26 +85,25 @@ typedef int ftnlen; #ifndef HAVE_STRNCMP /* provide a strncmp replacement if the system does not provide one: */ -static int strncmp(const char *s1, const char *s2, size_t n) -{ - size_t i; - for (i = 0; i < n && s1[i] && s2[i] && s1[i] == s2[i]; ++i) - ; - if (i >= n) - return 0; - else - return (s1[i] - s2[i]); +static int strncmp(const char *s1, const char *s2, size_t n) { + size_t i; + for (i = 0; i < n && s1[i] && s2[i] && s1[i] == s2[i]; ++i) + ; + if (i >= n) + return 0; + else + return (s1[i] - s2[i]); } #endif /* ! HAVE_STRNCMP */ -#define MIN(a,b) ((a) < (b) ? (a) : (b)) -#define MAX(a,b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) #if defined(SCALAR_LONG_DOUBLE_PREC) -# define ABS(x) fabsl(x) -# define sqrt sqrtl +#define ABS(x) fabsl(x) +#define sqrt sqrtl #else -# define ABS(x) fabs(x) +#define ABS(x) fabs(x) #endif #define s_cmp(s1, s2, len1, len2) strncmp(s1, s2, MIN(len1, len2)) #define s_copy(s1, s2, len1, len2) strcpy(s1, s2) @@ -114,623 +112,579 @@ static int strncmp(const char *s1, const char *s2, size_t n) /* cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc */ /* cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc */ -/* Subroutine */ int dcsrch(doublereal *stp, doublereal *f, doublereal *g, - doublereal *ftol, doublereal *gtol, doublereal *xtol, char *task, - doublereal *stpmin, doublereal *stpmax, integer *isave, doublereal * - dsave) -{ - /* System generated locals */ - doublereal d__1; - - /* Builtin functions */ - - /* Local variables */ - integer stage; - doublereal finit, ginit, width, ftest, gtest, stmin, stmax, width1, fm, - gm, fx, fy, gx, gy; - logical brackt; - extern /* Subroutine */ int dcstep(doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, logical *, doublereal *, - doublereal *); - doublereal fxm, fym, gxm, gym, stx, sty; - -/* ********** */ +/* Subroutine */ int dcsrch(doublereal *stp, doublereal *f, doublereal *g, doublereal *ftol, + doublereal *gtol, doublereal *xtol, char *task, doublereal *stpmin, + doublereal *stpmax, integer *isave, doublereal *dsave) { + /* System generated locals */ + doublereal d__1; -/* Subroutine dcsrch */ - -/* This subroutine finds a step that satisfies a sufficient */ -/* decrease condition and a curvature condition. */ - -/* Each call of the subroutine updates an interval with */ -/* endpoints stx and sty. The interval is initially chosen */ -/* so that it contains a minimizer of the modified function */ - -/* psi(stp) = f(stp) - f(0) - ftol*stp*f'(0). */ + /* Builtin functions */ + + /* Local variables */ + integer stage; + doublereal finit, ginit, width, ftest, gtest, stmin, stmax, width1, fm, gm, fx, fy, gx, gy; + logical brackt; + extern /* Subroutine */ int dcstep(doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, logical *, doublereal *, doublereal *); + doublereal fxm, fym, gxm, gym, stx, sty; -/* If psi(stp) <= 0 and f'(stp) >= 0 for some step, then the */ -/* interval is chosen so that it contains a minimizer of f. */ + /* ********** */ -/* The algorithm is designed to find a step that satisfies */ -/* the sufficient decrease condition */ - -/* f(stp) <= f(0) + ftol*stp*f'(0), */ + /* Subroutine dcsrch */ -/* and the curvature condition */ + /* This subroutine finds a step that satisfies a sufficient */ + /* decrease condition and a curvature condition. */ -/* ABS(f'(stp)) <= gtol*ABS(f'(0)). */ + /* Each call of the subroutine updates an interval with */ + /* endpoints stx and sty. The interval is initially chosen */ + /* so that it contains a minimizer of the modified function */ -/* If ftol is less than gtol and if, for example, the function */ -/* is bounded below, then there is always a step which satisfies */ -/* both conditions. */ + /* psi(stp) = f(stp) - f(0) - ftol*stp*f'(0). */ -/* If no step can be found that satisfies both conditions, then */ -/* the algorithm stops with a warning. In this case stp only */ -/* satisfies the sufficient decrease condition. */ + /* If psi(stp) <= 0 and f'(stp) >= 0 for some step, then the */ + /* interval is chosen so that it contains a minimizer of f. */ -/* A typical invocation of dcsrch has the following outline: */ + /* The algorithm is designed to find a step that satisfies */ + /* the sufficient decrease condition */ -/* Evaluate the function at stp = 0.0d0; store in f. */ -/* Evaluate the gradient at stp = 0.0d0; store in g. */ -/* Choose a starting step stp. */ + /* f(stp) <= f(0) + ftol*stp*f'(0), */ -/* task = 'START' */ -/* 10 continue */ -/* call dcsrch(stp,f,g,ftol,gtol,xtol,task,stpmin,stpmax, */ -/* + isave,dsave) */ -/* if (task .eq. 'FG') then */ -/* Evaluate the function and the gradient at stp */ -/* go to 10 */ -/* end if */ + /* and the curvature condition */ -/* NOTE: The user must not alter work arrays between calls. */ + /* ABS(f'(stp)) <= gtol*ABS(f'(0)). */ -/* The subroutine statement is */ - -/* subroutine dcsrch(f,g,stp,ftol,gtol,xtol,stpmin,stpmax, */ -/* task,isave,dsave) */ -/* where */ - -/* stp is a double precision variable. */ -/* On entry stp is the current estimate of a satisfactory */ -/* step. On initial entry, a positive initial estimate */ -/* must be provided. */ -/* On exit stp is the current estimate of a satisfactory step */ -/* if task = 'FG'. If task = 'CONV' then stp satisfies */ -/* the sufficient decrease and curvature condition. */ - -/* f is a double precision variable. */ -/* On initial entry f is the value of the function at 0. */ -/* On subsequent entries f is the value of the */ -/* function at stp. */ -/* On exit f is the value of the function at stp. */ - -/* g is a double precision variable. */ -/* On initial entry g is the derivative of the function at 0. */ -/* On subsequent entries g is the derivative of the */ -/* function at stp. */ -/* On exit g is the derivative of the function at stp. */ - -/* ftol is a double precision variable. */ -/* On entry ftol specifies a nonnegative tolerance for the */ -/* sufficient decrease condition. */ -/* On exit ftol is unchanged. */ - -/* gtol is a double precision variable. */ -/* On entry gtol specifies a nonnegative tolerance for the */ -/* curvature condition. */ -/* On exit gtol is unchanged. */ - -/* xtol is a double precision variable. */ -/* On entry xtol specifies a nonnegative relative tolerance */ -/* for an acceptable step. The subroutine exits with a */ -/* warning if the relative difference between sty and stx */ -/* is less than xtol. */ -/* On exit xtol is unchanged. */ - -/* task is a character variable of length at least 60. */ -/* On initial entry task must be set to 'START'. */ -/* On exit task indicates the required action: */ - -/* If task(1:2) = 'FG' then evaluate the function and */ -/* derivative at stp and call dcsrch again. */ - -/* If task(1:4) = 'CONV' then the search is successful. */ - -/* If task(1:4) = 'WARN' then the subroutine is not able */ -/* to satisfy the convergence conditions. The exit value of */ -/* stp contains the best point found during the search. */ - -/* If task(1:5) = 'ERROR' then there is an error in the */ -/* input arguments. */ - -/* On exit with convergence, a warning or an error, the */ -/* variable task contains additional information. */ - -/* stpmin is a double precision variable. */ -/* On entry stpmin is a nonnegative lower bound for the step. */ -/* On exit stpmin is unchanged. */ - -/* stpmax is a double precision variable. */ -/* On entry stpmax is a nonnegative upper bound for the step. */ -/* On exit stpmax is unchanged. */ - -/* isave is an integer work array of dimension 2. */ - -/* dsave is a double precision work array of dimension 13. */ - -/* Subprograms called */ - -/* MINPACK-2 ... dcstep */ - -/* MINPACK-1 Project. June 1983. */ -/* Argonne National Laboratory. */ -/* Jorge J. More' and David J. Thuente. */ - -/* MINPACK-2 Project. November 1993. */ -/* Argonne National Laboratory and University of Minnesota. */ -/* Brett M. Averick, Richard G. Carter, and Jorge J. More'. */ - -/* ********** */ -/* Initialization block. */ - /* Parameter adjustments */ - --dsave; - --isave; - - /* Function Body */ - if (s_cmp(task, "START", (ftnlen)5, (ftnlen)5) == 0) { -/* Check the input arguments for errors. */ - if (*stp < *stpmin) { - s_copy(task, "ERROR: STP .LT. STPMIN", task_len, (ftnlen)22); - } - if (*stp > *stpmax) { - s_copy(task, "ERROR: STP .GT. STPMAX", task_len, (ftnlen)22); - } - if (*g >= 0.) { - s_copy(task, "ERROR: INITIAL G .GE. ZERO", task_len, (ftnlen)26); - } - if (*ftol < 0.) { - s_copy(task, "ERROR: FTOL .LT. ZERO", task_len, (ftnlen)21); - } - if (*gtol < 0.) { - s_copy(task, "ERROR: GTOL .LT. ZERO", task_len, (ftnlen)21); - } - if (*xtol < 0.) { - s_copy(task, "ERROR: XTOL .LT. ZERO", task_len, (ftnlen)21); - } - if (*stpmin < 0.) { - s_copy(task, "ERROR: STPMIN .LT. ZERO", task_len, (ftnlen)23); - } - if (*stpmax < *stpmin) { - s_copy(task, "ERROR: STPMAX .LT. STPMIN", task_len, (ftnlen)25); - } -/* Exit if there are errors on input. */ - if (s_cmp(task, "ERROR", (ftnlen)5, (ftnlen)5) == 0) { - return 0; - } -/* Initialize local variables. */ - brackt = FALSE_; - stage = 1; - finit = *f; - ginit = *g; - gtest = *ftol * ginit; - width = *stpmax - *stpmin; - width1 = width / .5; -/* The variables stx, fx, gx contain the values of the step, */ -/* function, and derivative at the best step. */ -/* The variables sty, fy, gy contain the value of the step, */ -/* function, and derivative at sty. */ -/* The variables stp, f, g contain the values of the step, */ -/* function, and derivative at stp. */ - stx = 0.; - fx = finit; - gx = ginit; - sty = 0.; - fy = finit; - gy = ginit; - stmin = 0.; - stmax = *stp + *stp * 4.; - s_copy(task, "FG", task_len, (ftnlen)2); - goto L10; - } else { -/* Restore local variables. */ - if (isave[1] == 1) { - brackt = TRUE_; - } else { - brackt = FALSE_; - } - stage = isave[2]; - ginit = dsave[1]; - gtest = dsave[2]; - gx = dsave[3]; - gy = dsave[4]; - finit = dsave[5]; - fx = dsave[6]; - fy = dsave[7]; - stx = dsave[8]; - sty = dsave[9]; - stmin = dsave[10]; - stmax = dsave[11]; - width = dsave[12]; - width1 = dsave[13]; - } -/* If psi(stp) <= 0 and f'(stp) >= 0 for some step, then the */ -/* algorithm enters the second stage. */ - ftest = finit + *stp * gtest; - if (stage == 1 && *f <= ftest && *g >= 0.) { - stage = 2; - } -/* Test for warnings. */ - if (brackt && (*stp <= stmin || *stp >= stmax)) { - s_copy(task, "WARNING: ROUNDING ERRORS PREVENT PROGRESS", task_len, ( - ftnlen)41); - } - if (brackt && stmax - stmin <= *xtol * stmax) { - s_copy(task, "WARNING: XTOL TEST SATISFIED", task_len, (ftnlen)28); - } - if (*stp == *stpmax && *f <= ftest && *g <= gtest) { - s_copy(task, "WARNING: STP = STPMAX", task_len, (ftnlen)21); + /* If ftol is less than gtol and if, for example, the function */ + /* is bounded below, then there is always a step which satisfies */ + /* both conditions. */ + + /* If no step can be found that satisfies both conditions, then */ + /* the algorithm stops with a warning. In this case stp only */ + /* satisfies the sufficient decrease condition. */ + + /* A typical invocation of dcsrch has the following outline: */ + + /* Evaluate the function at stp = 0.0d0; store in f. */ + /* Evaluate the gradient at stp = 0.0d0; store in g. */ + /* Choose a starting step stp. */ + + /* task = 'START' */ + /* 10 continue */ + /* call dcsrch(stp,f,g,ftol,gtol,xtol,task,stpmin,stpmax, */ + /* + isave,dsave) */ + /* if (task .eq. 'FG') then */ + /* Evaluate the function and the gradient at stp */ + /* go to 10 */ + /* end if */ + + /* NOTE: The user must not alter work arrays between calls. */ + + /* The subroutine statement is */ + + /* subroutine dcsrch(f,g,stp,ftol,gtol,xtol,stpmin,stpmax, */ + /* task,isave,dsave) */ + /* where */ + + /* stp is a double precision variable. */ + /* On entry stp is the current estimate of a satisfactory */ + /* step. On initial entry, a positive initial estimate */ + /* must be provided. */ + /* On exit stp is the current estimate of a satisfactory step */ + /* if task = 'FG'. If task = 'CONV' then stp satisfies */ + /* the sufficient decrease and curvature condition. */ + + /* f is a double precision variable. */ + /* On initial entry f is the value of the function at 0. */ + /* On subsequent entries f is the value of the */ + /* function at stp. */ + /* On exit f is the value of the function at stp. */ + + /* g is a double precision variable. */ + /* On initial entry g is the derivative of the function at 0. */ + /* On subsequent entries g is the derivative of the */ + /* function at stp. */ + /* On exit g is the derivative of the function at stp. */ + + /* ftol is a double precision variable. */ + /* On entry ftol specifies a nonnegative tolerance for the */ + /* sufficient decrease condition. */ + /* On exit ftol is unchanged. */ + + /* gtol is a double precision variable. */ + /* On entry gtol specifies a nonnegative tolerance for the */ + /* curvature condition. */ + /* On exit gtol is unchanged. */ + + /* xtol is a double precision variable. */ + /* On entry xtol specifies a nonnegative relative tolerance */ + /* for an acceptable step. The subroutine exits with a */ + /* warning if the relative difference between sty and stx */ + /* is less than xtol. */ + /* On exit xtol is unchanged. */ + + /* task is a character variable of length at least 60. */ + /* On initial entry task must be set to 'START'. */ + /* On exit task indicates the required action: */ + + /* If task(1:2) = 'FG' then evaluate the function and */ + /* derivative at stp and call dcsrch again. */ + + /* If task(1:4) = 'CONV' then the search is successful. */ + + /* If task(1:4) = 'WARN' then the subroutine is not able */ + /* to satisfy the convergence conditions. The exit value of */ + /* stp contains the best point found during the search. */ + + /* If task(1:5) = 'ERROR' then there is an error in the */ + /* input arguments. */ + + /* On exit with convergence, a warning or an error, the */ + /* variable task contains additional information. */ + + /* stpmin is a double precision variable. */ + /* On entry stpmin is a nonnegative lower bound for the step. */ + /* On exit stpmin is unchanged. */ + + /* stpmax is a double precision variable. */ + /* On entry stpmax is a nonnegative upper bound for the step. */ + /* On exit stpmax is unchanged. */ + + /* isave is an integer work array of dimension 2. */ + + /* dsave is a double precision work array of dimension 13. */ + + /* Subprograms called */ + + /* MINPACK-2 ... dcstep */ + + /* MINPACK-1 Project. June 1983. */ + /* Argonne National Laboratory. */ + /* Jorge J. More' and David J. Thuente. */ + + /* MINPACK-2 Project. November 1993. */ + /* Argonne National Laboratory and University of Minnesota. */ + /* Brett M. Averick, Richard G. Carter, and Jorge J. More'. */ + + /* ********** */ + /* Initialization block. */ + /* Parameter adjustments */ + --dsave; + --isave; + + /* Function Body */ + if (s_cmp(task, "START", (ftnlen)5, (ftnlen)5) == 0) { + /* Check the input arguments for errors. */ + if (*stp < *stpmin) { s_copy(task, "ERROR: STP .LT. STPMIN", task_len, (ftnlen)22); } + if (*stp > *stpmax) { s_copy(task, "ERROR: STP .GT. STPMAX", task_len, (ftnlen)22); } + if (*g >= 0.) { s_copy(task, "ERROR: INITIAL G .GE. ZERO", task_len, (ftnlen)26); } + if (*ftol < 0.) { s_copy(task, "ERROR: FTOL .LT. ZERO", task_len, (ftnlen)21); } + if (*gtol < 0.) { s_copy(task, "ERROR: GTOL .LT. ZERO", task_len, (ftnlen)21); } + if (*xtol < 0.) { s_copy(task, "ERROR: XTOL .LT. ZERO", task_len, (ftnlen)21); } + if (*stpmin < 0.) { s_copy(task, "ERROR: STPMIN .LT. ZERO", task_len, (ftnlen)23); } + if (*stpmax < *stpmin) { s_copy(task, "ERROR: STPMAX .LT. STPMIN", task_len, (ftnlen)25); } + /* Exit if there are errors on input. */ + if (s_cmp(task, "ERROR", (ftnlen)5, (ftnlen)5) == 0) { return 0; } + /* Initialize local variables. */ + brackt = FALSE_; + stage = 1; + finit = *f; + ginit = *g; + gtest = *ftol * ginit; + width = *stpmax - *stpmin; + width1 = width / .5; + /* The variables stx, fx, gx contain the values of the step, */ + /* function, and derivative at the best step. */ + /* The variables sty, fy, gy contain the value of the step, */ + /* function, and derivative at sty. */ + /* The variables stp, f, g contain the values of the step, */ + /* function, and derivative at stp. */ + stx = 0.; + fx = finit; + gx = ginit; + sty = 0.; + fy = finit; + gy = ginit; + stmin = 0.; + stmax = *stp + *stp * 4.; + s_copy(task, "FG", task_len, (ftnlen)2); + goto L10; + } + else { + /* Restore local variables. */ + if (isave[1] == 1) { brackt = TRUE_; } + else { + brackt = FALSE_; } - if (*stp == *stpmin && (*f > ftest || *g >= gtest)) { - s_copy(task, "WARNING: STP = STPMIN", task_len, (ftnlen)21); + stage = isave[2]; + ginit = dsave[1]; + gtest = dsave[2]; + gx = dsave[3]; + gy = dsave[4]; + finit = dsave[5]; + fx = dsave[6]; + fy = dsave[7]; + stx = dsave[8]; + sty = dsave[9]; + stmin = dsave[10]; + stmax = dsave[11]; + width = dsave[12]; + width1 = dsave[13]; + } + /* If psi(stp) <= 0 and f'(stp) >= 0 for some step, then the */ + /* algorithm enters the second stage. */ + ftest = finit + *stp * gtest; + if (stage == 1 && *f <= ftest && *g >= 0.) { stage = 2; } + /* Test for warnings. */ + if (brackt && (*stp <= stmin || *stp >= stmax)) { + s_copy(task, "WARNING: ROUNDING ERRORS PREVENT PROGRESS", task_len, (ftnlen)41); + } + if (brackt && stmax - stmin <= *xtol * stmax) { + s_copy(task, "WARNING: XTOL TEST SATISFIED", task_len, (ftnlen)28); + } + if (*stp == *stpmax && *f <= ftest && *g <= gtest) { + s_copy(task, "WARNING: STP = STPMAX", task_len, (ftnlen)21); + } + if (*stp == *stpmin && (*f > ftest || *g >= gtest)) { + s_copy(task, "WARNING: STP = STPMIN", task_len, (ftnlen)21); + } + /* Test for convergence. */ + if (*f <= ftest && ABS(*g) <= *gtol * (-ginit)) { + s_copy(task, "CONVERGENCE", task_len, (ftnlen)11); + } + /* Test for termination. */ + if (s_cmp(task, "WARN", (ftnlen)4, (ftnlen)4) == 0 || + s_cmp(task, "CONV", (ftnlen)4, (ftnlen)4) == 0) { + goto L10; + } + /* A modified function is used to predict the step during the */ + /* first stage if a lower function value has been obtained but */ + /* the decrease is not sufficient. */ + if (stage == 1 && *f <= fx && *f > ftest) { + /* Define the modified function and derivative values. */ + fm = *f - *stp * gtest; + fxm = fx - stx * gtest; + fym = fy - sty * gtest; + gm = *g - gtest; + gxm = gx - gtest; + gym = gy - gtest; + /* Call dcstep to update stx, sty, and to compute the new step. */ + dcstep(&stx, &fxm, &gxm, &sty, &fym, &gym, stp, &fm, &gm, &brackt, &stmin, &stmax); + /* Reset the function and derivative values for f. */ + fx = fxm + stx * gtest; + fy = fym + sty * gtest; + gx = gxm + gtest; + gy = gym + gtest; + } + else { + /* Call dcstep to update stx, sty, and to compute the new step. */ + dcstep(&stx, &fx, &gx, &sty, &fy, &gy, stp, f, g, &brackt, &stmin, &stmax); + } + /* Decide if a bisection step is needed. */ + if (brackt) { + if ((d__1 = sty - stx, ABS(d__1)) >= width1 * .66) { *stp = stx + (sty - stx) * .5; } + width1 = width; + width = (d__1 = sty - stx, ABS(d__1)); + } + /* Set the minimum and maximum steps allowed for stp. */ + if (brackt) { + stmin = MIN(stx, sty); + stmax = MAX(stx, sty); + } + else { + stmin = *stp + (*stp - stx) * 1.1; + stmax = *stp + (*stp - stx) * 4.; + } + /* Force the step to be within the bounds stpmax and stpmin. */ + *stp = MAX(*stp, *stpmin); + *stp = MIN(*stp, *stpmax); + /* If further progress is not possible, let stp be the best */ + /* point obtained during the search. */ + if ((brackt && (*stp <= stmin || *stp >= stmax)) || (brackt && stmax - stmin <= *xtol * stmax)) { + *stp = stx; + } + /* Obtain another function and derivative. */ + s_copy(task, "FG", task_len, (ftnlen)2); +L10: + /* Save local variables. */ + if (brackt) { isave[1] = 1; } + else { + isave[1] = 0; + } + isave[2] = stage; + dsave[1] = ginit; + dsave[2] = gtest; + dsave[3] = gx; + dsave[4] = gy; + dsave[5] = finit; + dsave[6] = fx; + dsave[7] = fy; + dsave[8] = stx; + dsave[9] = sty; + dsave[10] = stmin; + dsave[11] = stmax; + dsave[12] = width; + dsave[13] = width1; + return 0; +} /* dcsrch */ + +/* ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc */ +/* ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc */ +/* Subroutine */ int dcstep(doublereal *stx, doublereal *fx, doublereal *dx, doublereal *sty, + doublereal *fy, doublereal *dy, doublereal *stp, doublereal *fp, + doublereal *dp, logical *brackt, doublereal *stpmin, + doublereal *stpmax) { + /* System generated locals */ + doublereal d__1, d__2, d__3; + + /* Local variables */ + doublereal sgnd, stpc, stpf, stpq, p, q, gamma, r__, s, theta; + + /* ********** */ + + /* Subroutine dcstep */ + + /* This subroutine computes a safeguarded step for a search */ + /* procedure and updates an interval that contains a step that */ + /* satisfies a sufficient decrease and a curvature condition. */ + + /* The parameter stx contains the step with the least function */ + /* value. If brackt is set to .true. then a minimizer has */ + /* been bracketed in an interval with endpoints stx and sty. */ + /* The parameter stp contains the current step. */ + /* The subroutine assumes that if brackt is set to .true. then */ + + /* MIN(stx,sty) < stp < MAX(stx,sty), */ + + /* and that the derivative at stx is negative in the direction */ + /* of the step. */ + + /* The subroutine statement is */ + + /* subroutine dcstep(stx,fx,dx,sty,fy,dy,stp,fp,dp,brackt, */ + /* stpmin,stpmax) */ + + /* where */ + + /* stx is a double precision variable. */ + /* On entry stx is the best step obtained so far and is an */ + /* endpoint of the interval that contains the minimizer. */ + /* On exit stx is the updated best step. */ + + /* fx is a double precision variable. */ + /* On entry fx is the function at stx. */ + /* On exit fx is the function at stx. */ + + /* dx is a double precision variable. */ + /* On entry dx is the derivative of the function at */ + /* stx. The derivative must be negative in the direction of */ + /* the step, that is, dx and stp - stx must have opposite */ + /* signs. */ + /* On exit dx is the derivative of the function at stx. */ + + /* sty is a double precision variable. */ + /* On entry sty is the second endpoint of the interval that */ + /* contains the minimizer. */ + /* On exit sty is the updated endpoint of the interval that */ + /* contains the minimizer. */ + + /* fy is a double precision variable. */ + /* On entry fy is the function at sty. */ + /* On exit fy is the function at sty. */ + + /* dy is a double precision variable. */ + /* On entry dy is the derivative of the function at sty. */ + /* On exit dy is the derivative of the function at the exit sty. */ + + /* stp is a double precision variable. */ + /* On entry stp is the current step. If brackt is set to .true. */ + /* then on input stp must be between stx and sty. */ + /* On exit stp is a new trial step. */ + + /* fp is a double precision variable. */ + /* On entry fp is the function at stp */ + /* On exit fp is unchanged. */ + + /* dp is a double precision variable. */ + /* On entry dp is the the derivative of the function at stp. */ + /* On exit dp is unchanged. */ + + /* brackt is an logical variable. */ + /* On entry brackt specifies if a minimizer has been bracketed. */ + /* Initially brackt must be set to .false. */ + /* On exit brackt specifies if a minimizer has been bracketed. */ + /* When a minimizer is bracketed brackt is set to .true. */ + + /* stpmin is a double precision variable. */ + /* On entry stpmin is a lower bound for the step. */ + /* On exit stpmin is unchanged. */ + + /* stpmax is a double precision variable. */ + /* On entry stpmax is an upper bound for the step. */ + /* On exit stpmax is unchanged. */ + + /* MINPACK-1 Project. June 1983 */ + /* Argonne National Laboratory. */ + /* Jorge J. More' and David J. Thuente. */ + + /* MINPACK-2 Project. November 1993. */ + /* Argonne National Laboratory and University of Minnesota. */ + /* Brett M. Averick and Jorge J. More'. */ + + /* ********** */ + sgnd = *dp * (*dx / ABS(*dx)); + /* First case: A higher function value. The minimum is bracketed. */ + /* If the cubic step is closer to stx than the quadratic step, the */ + /* cubic step is taken, otherwise the average of the cubic and */ + /* quadratic steps is taken. */ + if (*fp > *fx) { + theta = (*fx - *fp) * 3. / (*stp - *stx) + *dx + *dp; + /* Computing MAX */ + d__1 = ABS(theta), d__2 = ABS(*dx), d__1 = MAX(d__1, d__2), d__2 = ABS(*dp); + s = MAX(d__1, d__2); + /* Computing 2nd power */ + d__1 = theta / s; + gamma = s * sqrt(d__1 * d__1 - *dx / s * (*dp / s)); + if (*stp < *stx) { gamma = -gamma; } + p = gamma - *dx + theta; + q = gamma - *dx + gamma + *dp; + r__ = p / q; + stpc = *stx + r__ * (*stp - *stx); + stpq = *stx + *dx / ((*fx - *fp) / (*stp - *stx) + *dx) / 2. * (*stp - *stx); + if ((d__1 = stpc - *stx, ABS(d__1)) < (d__2 = stpq - *stx, ABS(d__2))) { stpf = stpc; } + else { + stpf = stpc + (stpq - stpc) / 2.; } -/* Test for convergence. */ - if (*f <= ftest && ABS(*g) <= *gtol * (-ginit)) { - s_copy(task, "CONVERGENCE", task_len, (ftnlen)11); + *brackt = TRUE_; + /* Second case: A lower function value and derivatives of opposite */ + /* sign. The minimum is bracketed. If the cubic step is farther from */ + /* stp than the secant step, the cubic step is taken, otherwise the */ + /* secant step is taken. */ + } + else if (sgnd < 0.) { + theta = (*fx - *fp) * 3. / (*stp - *stx) + *dx + *dp; + /* Computing MAX */ + d__1 = ABS(theta), d__2 = ABS(*dx), d__1 = MAX(d__1, d__2), d__2 = ABS(*dp); + s = MAX(d__1, d__2); + /* Computing 2nd power */ + d__1 = theta / s; + gamma = s * sqrt(d__1 * d__1 - *dx / s * (*dp / s)); + if (*stp > *stx) { gamma = -gamma; } + p = gamma - *dp + theta; + q = gamma - *dp + gamma + *dx; + r__ = p / q; + stpc = *stp + r__ * (*stx - *stp); + stpq = *stp + *dp / (*dp - *dx) * (*stx - *stp); + if ((d__1 = stpc - *stp, ABS(d__1)) > (d__2 = stpq - *stp, ABS(d__2))) { stpf = stpc; } + else { + stpf = stpq; } -/* Test for termination. */ - if (s_cmp(task, "WARN", (ftnlen)4, (ftnlen)4) == 0 || s_cmp(task, "CONV", - (ftnlen)4, (ftnlen)4) == 0) { - goto L10; + *brackt = TRUE_; + /* Third case: A lower function value, derivatives of the same sign, */ + /* and the magnitude of the derivative decreases. */ + } + else if (ABS(*dp) < ABS(*dx)) { + /* The cubic step is computed only if the cubic tends to infinity */ + /* in the direction of the step or if the minimum of the cubic */ + /* is beyond stp. Otherwise the cubic step is defined to be the */ + /* secant step. */ + theta = (*fx - *fp) * 3. / (*stp - *stx) + *dx + *dp; + /* Computing MAX */ + d__1 = ABS(theta), d__2 = ABS(*dx), d__1 = MAX(d__1, d__2), d__2 = ABS(*dp); + s = MAX(d__1, d__2); + /* The case gamma = 0 only arises if the cubic does not tend */ + /* to infinity in the direction of the step. */ + /* Computing MAX */ + /* Computing 2nd power */ + d__3 = theta / s; + d__1 = 0., d__2 = d__3 * d__3 - *dx / s * (*dp / s); + gamma = s * sqrt((MAX(d__1, d__2))); + if (*stp > *stx) { gamma = -gamma; } + p = gamma - *dp + theta; + q = gamma + (*dx - *dp) + gamma; + r__ = p / q; + if (r__ < 0. && gamma != 0.) { stpc = *stp + r__ * (*stx - *stp); } + else if (*stp > *stx) { + stpc = *stpmax; } -/* A modified function is used to predict the step during the */ -/* first stage if a lower function value has been obtained but */ -/* the decrease is not sufficient. */ - if (stage == 1 && *f <= fx && *f > ftest) { -/* Define the modified function and derivative values. */ - fm = *f - *stp * gtest; - fxm = fx - stx * gtest; - fym = fy - sty * gtest; - gm = *g - gtest; - gxm = gx - gtest; - gym = gy - gtest; -/* Call dcstep to update stx, sty, and to compute the new step. */ - dcstep(&stx, &fxm, &gxm, &sty, &fym, &gym, stp, &fm, &gm, &brackt, & - stmin, &stmax); -/* Reset the function and derivative values for f. */ - fx = fxm + stx * gtest; - fy = fym + sty * gtest; - gx = gxm + gtest; - gy = gym + gtest; - } else { -/* Call dcstep to update stx, sty, and to compute the new step. */ - dcstep(&stx, &fx, &gx, &sty, &fy, &gy, stp, f, g, &brackt, &stmin, & - stmax); + else { + stpc = *stpmin; } -/* Decide if a bisection step is needed. */ - if (brackt) { - if ((d__1 = sty - stx, ABS(d__1)) >= width1 * .66) { - *stp = stx + (sty - stx) * .5; - } - width1 = width; - width = (d__1 = sty - stx, ABS(d__1)); + stpq = *stp + *dp / (*dp - *dx) * (*stx - *stp); + if (*brackt) { + /* A minimizer has been bracketed. If the cubic step is */ + /* closer to stp than the secant step, the cubic step is */ + /* taken, otherwise the secant step is taken. */ + if ((d__1 = stpc - *stp, ABS(d__1)) < (d__2 = stpq - *stp, ABS(d__2))) { stpf = stpc; } + else { + stpf = stpq; + } + if (*stp > *stx) { + /* Computing MIN */ + d__1 = *stp + (*sty - *stp) * .66; + stpf = MIN(d__1, stpf); + } + else { + /* Computing MAX */ + d__1 = *stp + (*sty - *stp) * .66; + stpf = MAX(d__1, stpf); + } } -/* Set the minimum and maximum steps allowed for stp. */ - if (brackt) { - stmin = MIN(stx,sty); - stmax = MAX(stx,sty); - } else { - stmin = *stp + (*stp - stx) * 1.1; - stmax = *stp + (*stp - stx) * 4.; + else { + /* A minimizer has not been bracketed. If the cubic step is */ + /* farther from stp than the secant step, the cubic step is */ + /* taken, otherwise the secant step is taken. */ + if ((d__1 = stpc - *stp, ABS(d__1)) > (d__2 = stpq - *stp, ABS(d__2))) { stpf = stpc; } + else { + stpf = stpq; + } + stpf = MIN(*stpmax, stpf); + stpf = MAX(*stpmin, stpf); } -/* Force the step to be within the bounds stpmax and stpmin. */ - *stp = MAX(*stp,*stpmin); - *stp = MIN(*stp,*stpmax); -/* If further progress is not possible, let stp be the best */ -/* point obtained during the search. */ - if ((brackt && (*stp <= stmin || *stp >= stmax)) || - (brackt && stmax - stmin <= *xtol * stmax)) { - *stp = stx; + /* Fourth case: A lower function value, derivatives of the same sign, */ + /* and the magnitude of the derivative does not decrease. If the */ + /* minimum is not bracketed, the step is either stpmin or stpmax, */ + /* otherwise the cubic step is taken. */ + } + else { + if (*brackt) { + theta = (*fp - *fy) * 3. / (*sty - *stp) + *dy + *dp; + /* Computing MAX */ + d__1 = ABS(theta), d__2 = ABS(*dy), d__1 = MAX(d__1, d__2), d__2 = ABS(*dp); + s = MAX(d__1, d__2); + /* Computing 2nd power */ + d__1 = theta / s; + gamma = s * sqrt(d__1 * d__1 - *dy / s * (*dp / s)); + if (*stp > *sty) { gamma = -gamma; } + p = gamma - *dp + theta; + q = gamma - *dp + gamma + *dy; + r__ = p / q; + stpc = *stp + r__ * (*sty - *stp); + stpf = stpc; } -/* Obtain another function and derivative. */ - s_copy(task, "FG", task_len, (ftnlen)2); -L10: -/* Save local variables. */ - if (brackt) { - isave[1] = 1; - } else { - isave[1] = 0; + else if (*stp > *stx) { + stpf = *stpmax; } - isave[2] = stage; - dsave[1] = ginit; - dsave[2] = gtest; - dsave[3] = gx; - dsave[4] = gy; - dsave[5] = finit; - dsave[6] = fx; - dsave[7] = fy; - dsave[8] = stx; - dsave[9] = sty; - dsave[10] = stmin; - dsave[11] = stmax; - dsave[12] = width; - dsave[13] = width1; - return 0; -} /* dcsrch */ - -/* ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc */ -/* ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc */ -/* Subroutine */ int dcstep(doublereal *stx, doublereal *fx, doublereal *dx, - doublereal *sty, doublereal *fy, doublereal *dy, doublereal *stp, - doublereal *fp, doublereal *dp, logical *brackt, doublereal *stpmin, - doublereal *stpmax) -{ - /* System generated locals */ - doublereal d__1, d__2, d__3; - - /* Local variables */ - doublereal sgnd, stpc, stpf, stpq, p, q, gamma, r__, s, theta; - -/* ********** */ - -/* Subroutine dcstep */ - -/* This subroutine computes a safeguarded step for a search */ -/* procedure and updates an interval that contains a step that */ -/* satisfies a sufficient decrease and a curvature condition. */ - -/* The parameter stx contains the step with the least function */ -/* value. If brackt is set to .true. then a minimizer has */ -/* been bracketed in an interval with endpoints stx and sty. */ -/* The parameter stp contains the current step. */ -/* The subroutine assumes that if brackt is set to .true. then */ - -/* MIN(stx,sty) < stp < MAX(stx,sty), */ - -/* and that the derivative at stx is negative in the direction */ -/* of the step. */ - -/* The subroutine statement is */ - -/* subroutine dcstep(stx,fx,dx,sty,fy,dy,stp,fp,dp,brackt, */ -/* stpmin,stpmax) */ - -/* where */ - -/* stx is a double precision variable. */ -/* On entry stx is the best step obtained so far and is an */ -/* endpoint of the interval that contains the minimizer. */ -/* On exit stx is the updated best step. */ - -/* fx is a double precision variable. */ -/* On entry fx is the function at stx. */ -/* On exit fx is the function at stx. */ - -/* dx is a double precision variable. */ -/* On entry dx is the derivative of the function at */ -/* stx. The derivative must be negative in the direction of */ -/* the step, that is, dx and stp - stx must have opposite */ -/* signs. */ -/* On exit dx is the derivative of the function at stx. */ - -/* sty is a double precision variable. */ -/* On entry sty is the second endpoint of the interval that */ -/* contains the minimizer. */ -/* On exit sty is the updated endpoint of the interval that */ -/* contains the minimizer. */ - -/* fy is a double precision variable. */ -/* On entry fy is the function at sty. */ -/* On exit fy is the function at sty. */ - -/* dy is a double precision variable. */ -/* On entry dy is the derivative of the function at sty. */ -/* On exit dy is the derivative of the function at the exit sty. */ - -/* stp is a double precision variable. */ -/* On entry stp is the current step. If brackt is set to .true. */ -/* then on input stp must be between stx and sty. */ -/* On exit stp is a new trial step. */ - -/* fp is a double precision variable. */ -/* On entry fp is the function at stp */ -/* On exit fp is unchanged. */ - -/* dp is a double precision variable. */ -/* On entry dp is the the derivative of the function at stp. */ -/* On exit dp is unchanged. */ - -/* brackt is an logical variable. */ -/* On entry brackt specifies if a minimizer has been bracketed. */ -/* Initially brackt must be set to .false. */ -/* On exit brackt specifies if a minimizer has been bracketed. */ -/* When a minimizer is bracketed brackt is set to .true. */ - -/* stpmin is a double precision variable. */ -/* On entry stpmin is a lower bound for the step. */ -/* On exit stpmin is unchanged. */ - -/* stpmax is a double precision variable. */ -/* On entry stpmax is an upper bound for the step. */ -/* On exit stpmax is unchanged. */ - -/* MINPACK-1 Project. June 1983 */ -/* Argonne National Laboratory. */ -/* Jorge J. More' and David J. Thuente. */ - -/* MINPACK-2 Project. November 1993. */ -/* Argonne National Laboratory and University of Minnesota. */ -/* Brett M. Averick and Jorge J. More'. */ - -/* ********** */ - sgnd = *dp * (*dx / ABS(*dx)); -/* First case: A higher function value. The minimum is bracketed. */ -/* If the cubic step is closer to stx than the quadratic step, the */ -/* cubic step is taken, otherwise the average of the cubic and */ -/* quadratic steps is taken. */ - if (*fp > *fx) { - theta = (*fx - *fp) * 3. / (*stp - *stx) + *dx + *dp; -/* Computing MAX */ - d__1 = ABS(theta), d__2 = ABS(*dx), d__1 = MAX(d__1,d__2), d__2 = ABS( - *dp); - s = MAX(d__1,d__2); -/* Computing 2nd power */ - d__1 = theta / s; - gamma = s * sqrt(d__1 * d__1 - *dx / s * (*dp / s)); - if (*stp < *stx) { - gamma = -gamma; - } - p = gamma - *dx + theta; - q = gamma - *dx + gamma + *dp; - r__ = p / q; - stpc = *stx + r__ * (*stp - *stx); - stpq = *stx + *dx / ((*fx - *fp) / (*stp - *stx) + *dx) / 2. * (*stp - - *stx); - if ((d__1 = stpc - *stx, ABS(d__1)) < (d__2 = stpq - *stx, ABS(d__2))) - { - stpf = stpc; - } else { - stpf = stpc + (stpq - stpc) / 2.; - } - *brackt = TRUE_; -/* Second case: A lower function value and derivatives of opposite */ -/* sign. The minimum is bracketed. If the cubic step is farther from */ -/* stp than the secant step, the cubic step is taken, otherwise the */ -/* secant step is taken. */ - } else if (sgnd < 0.) { - theta = (*fx - *fp) * 3. / (*stp - *stx) + *dx + *dp; -/* Computing MAX */ - d__1 = ABS(theta), d__2 = ABS(*dx), d__1 = MAX(d__1,d__2), d__2 = ABS( - *dp); - s = MAX(d__1,d__2); -/* Computing 2nd power */ - d__1 = theta / s; - gamma = s * sqrt(d__1 * d__1 - *dx / s * (*dp / s)); - if (*stp > *stx) { - gamma = -gamma; - } - p = gamma - *dp + theta; - q = gamma - *dp + gamma + *dx; - r__ = p / q; - stpc = *stp + r__ * (*stx - *stp); - stpq = *stp + *dp / (*dp - *dx) * (*stx - *stp); - if ((d__1 = stpc - *stp, ABS(d__1)) > (d__2 = stpq - *stp, ABS(d__2))) - { - stpf = stpc; - } else { - stpf = stpq; - } - *brackt = TRUE_; -/* Third case: A lower function value, derivatives of the same sign, */ -/* and the magnitude of the derivative decreases. */ - } else if (ABS(*dp) < ABS(*dx)) { -/* The cubic step is computed only if the cubic tends to infinity */ -/* in the direction of the step or if the minimum of the cubic */ -/* is beyond stp. Otherwise the cubic step is defined to be the */ -/* secant step. */ - theta = (*fx - *fp) * 3. / (*stp - *stx) + *dx + *dp; -/* Computing MAX */ - d__1 = ABS(theta), d__2 = ABS(*dx), d__1 = MAX(d__1,d__2), d__2 = ABS( - *dp); - s = MAX(d__1,d__2); -/* The case gamma = 0 only arises if the cubic does not tend */ -/* to infinity in the direction of the step. */ -/* Computing MAX */ -/* Computing 2nd power */ - d__3 = theta / s; - d__1 = 0., d__2 = d__3 * d__3 - *dx / s * (*dp / s); - gamma = s * sqrt((MAX(d__1,d__2))); - if (*stp > *stx) { - gamma = -gamma; - } - p = gamma - *dp + theta; - q = gamma + (*dx - *dp) + gamma; - r__ = p / q; - if (r__ < 0. && gamma != 0.) { - stpc = *stp + r__ * (*stx - *stp); - } else if (*stp > *stx) { - stpc = *stpmax; - } else { - stpc = *stpmin; - } - stpq = *stp + *dp / (*dp - *dx) * (*stx - *stp); - if (*brackt) { -/* A minimizer has been bracketed. If the cubic step is */ -/* closer to stp than the secant step, the cubic step is */ -/* taken, otherwise the secant step is taken. */ - if ((d__1 = stpc - *stp, ABS(d__1)) < (d__2 = stpq - *stp, ABS( - d__2))) { - stpf = stpc; - } else { - stpf = stpq; - } - if (*stp > *stx) { -/* Computing MIN */ - d__1 = *stp + (*sty - *stp) * .66; - stpf = MIN(d__1,stpf); - } else { -/* Computing MAX */ - d__1 = *stp + (*sty - *stp) * .66; - stpf = MAX(d__1,stpf); - } - } else { -/* A minimizer has not been bracketed. If the cubic step is */ -/* farther from stp than the secant step, the cubic step is */ -/* taken, otherwise the secant step is taken. */ - if ((d__1 = stpc - *stp, ABS(d__1)) > (d__2 = stpq - *stp, ABS( - d__2))) { - stpf = stpc; - } else { - stpf = stpq; - } - stpf = MIN(*stpmax,stpf); - stpf = MAX(*stpmin,stpf); - } -/* Fourth case: A lower function value, derivatives of the same sign, */ -/* and the magnitude of the derivative does not decrease. If the */ -/* minimum is not bracketed, the step is either stpmin or stpmax, */ -/* otherwise the cubic step is taken. */ - } else { - if (*brackt) { - theta = (*fp - *fy) * 3. / (*sty - *stp) + *dy + *dp; -/* Computing MAX */ - d__1 = ABS(theta), d__2 = ABS(*dy), d__1 = MAX(d__1,d__2), d__2 = - ABS(*dp); - s = MAX(d__1,d__2); -/* Computing 2nd power */ - d__1 = theta / s; - gamma = s * sqrt(d__1 * d__1 - *dy / s * (*dp / s)); - if (*stp > *sty) { - gamma = -gamma; - } - p = gamma - *dp + theta; - q = gamma - *dp + gamma + *dy; - r__ = p / q; - stpc = *stp + r__ * (*sty - *stp); - stpf = stpc; - } else if (*stp > *stx) { - stpf = *stpmax; - } else { - stpf = *stpmin; - } + else { + stpf = *stpmin; } -/* Update the interval which contains a minimizer. */ - if (*fp > *fx) { - *sty = *stp; - *fy = *fp; - *dy = *dp; - } else { - if (sgnd < 0.) { - *sty = *stx; - *fy = *fx; - *dy = *dx; - } - *stx = *stp; - *fx = *fp; - *dx = *dp; + } + /* Update the interval which contains a minimizer. */ + if (*fp > *fx) { + *sty = *stp; + *fy = *fp; + *dy = *dp; + } + else { + if (sgnd < 0.) { + *sty = *stx; + *fy = *fx; + *dy = *dx; } -/* Compute the new step. */ - *stp = stpf; - return 0; + *stx = *stp; + *fx = *fp; + *dx = *dp; + } + /* Compute the new step. */ + *stp = stpf; + return 0; } /* dcstep */ - diff --git a/src/matrices/scalar.h b/src/matrices/scalar.h index dbe21d06..56f3dc3e 100644 --- a/src/matrices/scalar.h +++ b/src/matrices/scalar.h @@ -35,19 +35,32 @@ typedef double real; /********************* complex types and operations **********************/ typedef struct { - real re, im; + real re, im; } scalar_complex; #define CSCALAR_NUMVALS (2) -#define CSCALAR_INIT_ZERO { 0.0, 0.0 } +#define CSCALAR_INIT_ZERO \ + { 0.0, 0.0 } #define CSCALAR_RE(a) ((a).re) #define CSCALAR_IM(a) ((a).im) -#define CASSIGN_SCALAR(a, real, imag) { (a).re = (real); (a).im = (imag); } -#define CACCUMULATE_SCALAR(a, real, imag) { (a).re +=(real); (a).im +=(imag); } -#define CACCUMULATE_DIFF_SCALAR(a, real, imag) { (a).re -=(real); (a).im -=(imag); } +#define CASSIGN_SCALAR(a, real, imag) \ + { \ + (a).re = (real); \ + (a).im = (imag); \ + } +#define CACCUMULATE_SCALAR(a, real, imag) \ + { \ + (a).re += (real); \ + (a).im += (imag); \ + } +#define CACCUMULATE_DIFF_SCALAR(a, real, imag) \ + { \ + (a).re -= (real); \ + (a).im -= (imag); \ + } #define CASSIGN_ZERO(a) CASSIGN_SCALAR(a, 0.0, 0.0); #define CASSIGN_REAL(a, c) CASSIGN_SCALAR(a, c, 0.0) @@ -56,21 +69,24 @@ typedef struct { #define CSCALAR_NORMSQR(a) ((a).re * (a).re + (a).im * (a).im) /* a = b * c */ -#define CASSIGN_MULT(a, b, c) { \ - real bbbb_re = (b).re, bbbb_im = (b).im; \ - real cccc_re = (c).re, cccc_im = (c).im; \ - CASSIGN_SCALAR(a, bbbb_re * cccc_re - bbbb_im * cccc_im, \ - bbbb_re * cccc_im + bbbb_im * cccc_re); \ -} +#define CASSIGN_MULT(a, b, c) \ + { \ + real bbbb_re = (b).re, bbbb_im = (b).im; \ + real cccc_re = (c).re, cccc_im = (c).im; \ + CASSIGN_SCALAR(a, bbbb_re *cccc_re - bbbb_im * cccc_im, \ + bbbb_re * cccc_im + bbbb_im * cccc_re); \ + } /* a = b / c = b * conj(c) / |c|^2 */ -#define CASSIGN_DIV(a, b, c) { \ - scalar_complex aaaa_tmp; real aaaa_tmp_norm; \ - CASSIGN_CONJ(aaaa_tmp, c); \ - aaaa_tmp_norm = 1.0 / CSCALAR_NORMSQR(aaaa_tmp); \ - CASSIGN_MULT(aaaa_tmp, b, aaaa_tmp); \ - CASSIGN_SCALAR(a, aaaa_tmp.re*aaaa_tmp_norm, aaaa_tmp.im*aaaa_tmp_norm); \ -} +#define CASSIGN_DIV(a, b, c) \ + { \ + scalar_complex aaaa_tmp; \ + real aaaa_tmp_norm; \ + CASSIGN_CONJ(aaaa_tmp, c); \ + aaaa_tmp_norm = 1.0 / CSCALAR_NORMSQR(aaaa_tmp); \ + CASSIGN_MULT(aaaa_tmp, b, aaaa_tmp); \ + CASSIGN_SCALAR(a, aaaa_tmp.re *aaaa_tmp_norm, aaaa_tmp.im *aaaa_tmp_norm); \ + } /* Re (b * c) */ #define CSCALAR_MULT_RE(b, c) ((b).re * (c).re - (b).im * (c).im) @@ -85,43 +101,48 @@ typedef struct { #define CSCALAR_MULT_CONJ_IM(b, c) ((b).im * (c).re - (b).re * (c).im) /* a = Re (b * c) */ -#define CASSIGN_MULT_RE(a, b, c) { \ - real bbbb_re = (b).re, bbbb_im = (b).im; \ - real cccc_re = (c).re, cccc_im = (c).im; \ - (a) = bbbb_re * cccc_re - bbbb_im * cccc_im; \ -} +#define CASSIGN_MULT_RE(a, b, c) \ + { \ + real bbbb_re = (b).re, bbbb_im = (b).im; \ + real cccc_re = (c).re, cccc_im = (c).im; \ + (a) = bbbb_re * cccc_re - bbbb_im * cccc_im; \ + } /* a = Im (b * c) */ -#define CASSIGN_MULT_IM(a, b, c) { \ - real bbbb_re = (b).re, bbbb_im = (b).im; \ - real cccc_re = (c).re, cccc_im = (c).im; \ - (a) = bbbb_re * cccc_im + bbbb_im * cccc_re; \ -} +#define CASSIGN_MULT_IM(a, b, c) \ + { \ + real bbbb_re = (b).re, bbbb_im = (b).im; \ + real cccc_re = (c).re, cccc_im = (c).im; \ + (a) = bbbb_re * cccc_im + bbbb_im * cccc_re; \ + } /* a += b * c */ -#define CACCUMULATE_SUM_MULT(a, b, c) { \ - real bbbb_re = (b).re, bbbb_im = (b).im; \ - real cccc_re = (c).re, cccc_im = (c).im; \ - CACCUMULATE_SCALAR(a, bbbb_re * cccc_re - bbbb_im * cccc_im, \ - bbbb_re * cccc_im + bbbb_im * cccc_re); \ -} +#define CACCUMULATE_SUM_MULT(a, b, c) \ + { \ + real bbbb_re = (b).re, bbbb_im = (b).im; \ + real cccc_re = (c).re, cccc_im = (c).im; \ + CACCUMULATE_SCALAR(a, bbbb_re *cccc_re - bbbb_im * cccc_im, \ + bbbb_re * cccc_im + bbbb_im * cccc_re); \ + } /* a += conj(b) * c */ -#define CACCUMULATE_SUM_CONJ_MULT(a, b, c) { \ - real bbbb_re = (b).re, bbbb_im = (b).im; \ - real cccc_re = (c).re, cccc_im = (c).im; \ - CACCUMULATE_SCALAR(a, bbbb_re * cccc_re + bbbb_im * cccc_im, \ - bbbb_re * cccc_im - bbbb_im * cccc_re); \ -} +#define CACCUMULATE_SUM_CONJ_MULT(a, b, c) \ + { \ + real bbbb_re = (b).re, bbbb_im = (b).im; \ + real cccc_re = (c).re, cccc_im = (c).im; \ + CACCUMULATE_SCALAR(a, bbbb_re *cccc_re + bbbb_im * cccc_im, \ + bbbb_re * cccc_im - bbbb_im * cccc_re); \ + } /* a += |b|^2 */ -#define CACCUMULATE_SUM_SQ(a, b) { \ - real bbbb_re = (b).re, bbbb_im = (b).im; \ - (a) += bbbb_re * bbbb_re + bbbb_im * bbbb_im; \ -} +#define CACCUMULATE_SUM_SQ(a, b) \ + { \ + real bbbb_re = (b).re, bbbb_im = (b).im; \ + (a) += bbbb_re * bbbb_re + bbbb_im * bbbb_im; \ + } -#define CACCUMULATE_SUM(sum, a) CACCUMULATE_SCALAR(sum,CSCALAR_RE(a),CSCALAR_IM(a)) -#define CACCUMULATE_DIFF(sum, a) CACCUMULATE_DIFF_SCALAR(sum,CSCALAR_RE(a),CSCALAR_IM(a)) +#define CACCUMULATE_SUM(sum, a) CACCUMULATE_SCALAR(sum, CSCALAR_RE(a), CSCALAR_IM(a)) +#define CACCUMULATE_DIFF(sum, a) CACCUMULATE_DIFF_SCALAR(sum, CSCALAR_RE(a), CSCALAR_IM(a)) /************************** scalars are complex **************************/ #ifdef SCALAR_COMPLEX @@ -200,18 +221,22 @@ typedef real scalar; #define ACCUMULATE_SUM_MULT(a, b, c) (a) += (b) * (c); #define ACCUMULATE_SUM_CONJ_MULT(a, b, c) (a) += (b) * (c); -#define ACCUMULATE_SUM_SQ(a, b) { real bbbb = (b); (a) += bbbb * bbbb; } +#define ACCUMULATE_SUM_SQ(a, b) \ + { \ + real bbbb = (b); \ + (a) += bbbb * bbbb; \ + } #endif /* scalars are real */ #define ASSIGN_ZERO(a) ASSIGN_SCALAR(a, 0.0, 0.0); #define ASSIGN_REAL(a, c) ASSIGN_SCALAR(a, c, 0.0) #define ASSIGN_CONJ(a, b) ASSIGN_SCALAR(a, SCALAR_RE(b), -SCALAR_IM(b)) -#define ACCUMULATE_SUM(sum, a) ACCUMULATE_SCALAR(sum,SCALAR_RE(a),SCALAR_IM(a)) -#define ACCUMULATE_DIFF(sum, a) ACCUMULATE_DIFF_SCALAR(sum,SCALAR_RE(a),SCALAR_IM(a)) +#define ACCUMULATE_SUM(sum, a) ACCUMULATE_SCALAR(sum, SCALAR_RE(a), SCALAR_IM(a)) +#define ACCUMULATE_DIFF(sum, a) ACCUMULATE_DIFF_SCALAR(sum, SCALAR_RE(a), SCALAR_IM(a)) #ifdef __cplusplus -} /* extern "C" */ +} /* extern "C" */ #endif /* __cplusplus */ #endif /* SCALAR_H */ diff --git a/src/matrices/sqmatrix.c b/src/matrices/sqmatrix.c index 49b9531c..9cf0ed6a 100644 --- a/src/matrices/sqmatrix.c +++ b/src/matrices/sqmatrix.c @@ -32,36 +32,33 @@ static double max2(double a, double b) { return a > b ? a : b; } #endif -void sqmatrix_assert_hermitian(sqmatrix A) -{ +void sqmatrix_assert_hermitian(sqmatrix A) { #ifdef DEBUG - double err = 0, maxsq = 0; - int i, j, p = A.p; - - for (i = 0; i < p; ++i) - for (j = 0; j < p; ++j) - maxsq = max2(maxsq, SCALAR_NORMSQR(A.data[i*p + j])); - for (i = 0; i < p; ++i) { - err = max2(err, (SCALAR_IM(A.data[i*p + i]) * - SCALAR_IM(A.data[i*p + i])) / maxsq); - for (j = i + 1; j < p; ++j) { - scalar x; - ASSIGN_CONJ(x, A.data[i*p + j]); - ACCUMULATE_DIFF(x, A.data[j*p + i]); - err = max2(err, SCALAR_NORMSQR(x) / maxsq); - } - } - CHECK(err < 1e-10, "sqmatrix_assert_hermitian failed"); + double err = 0, maxsq = 0; + int i, j, p = A.p; + + for (i = 0; i < p; ++i) + for (j = 0; j < p; ++j) + maxsq = max2(maxsq, SCALAR_NORMSQR(A.data[i * p + j])); + for (i = 0; i < p; ++i) { + err = max2(err, (SCALAR_IM(A.data[i * p + i]) * SCALAR_IM(A.data[i * p + i])) / maxsq); + for (j = i + 1; j < p; ++j) { + scalar x; + ASSIGN_CONJ(x, A.data[i * p + j]); + ACCUMULATE_DIFF(x, A.data[j * p + i]); + err = max2(err, SCALAR_NORMSQR(x) / maxsq); + } + } + CHECK(err < 1e-10, "sqmatrix_assert_hermitian failed"); #else - (void) A; /* unused */ + (void)A; /* unused */ #endif } /* A = B */ -void sqmatrix_copy(sqmatrix A, sqmatrix B) -{ - CHECK(A.p == B.p, "arrays not conformant"); - blasglue_copy(A.p * A.p, B.data, 1, A.data, 1); +void sqmatrix_copy(sqmatrix A, sqmatrix B) { + CHECK(A.p == B.p, "arrays not conformant"); + blasglue_copy(A.p * A.p, B.data, 1, A.data, 1); } /* Resize A from its current size to a pxp matrix, assuming that @@ -69,179 +66,160 @@ void sqmatrix_copy(sqmatrix A, sqmatrix B) If preserve_data is nonzero, copies the existing data in A (or a subset of it, if the matrix is shrinking) to the corresponding entries of the resized matrix. */ -void sqmatrix_resize(sqmatrix *A, int p, short preserve_data) -{ - CHECK(p <= A->alloc_p, "tried to resize beyond allocated limit"); - - if (preserve_data) { - int i, j; - - if (p < A->p) { - for (i = 0; i < p; ++i) - for (j = 0; j < p; ++j) - A->data[i*p + j] = A->data[i*A->p + j]; - } - else { - for (i = A->p-1; i >= 0; --i) - for (j = A->p-1; j >= 0; --j) - A->data[i*p + j] = A->data[i*A->p + j]; - } - } - - A->p = p; +void sqmatrix_resize(sqmatrix *A, int p, short preserve_data) { + CHECK(p <= A->alloc_p, "tried to resize beyond allocated limit"); + + if (preserve_data) { + int i, j; + + if (p < A->p) { + for (i = 0; i < p; ++i) + for (j = 0; j < p; ++j) + A->data[i * p + j] = A->data[i * A->p + j]; + } + else { + for (i = A->p - 1; i >= 0; --i) + for (j = A->p - 1; j >= 0; --j) + A->data[i * p + j] = A->data[i * A->p + j]; + } + } + + A->p = p; } -/* U contains the upper triangle of a Hermitian matrix; we copy this +/* U contains the upper triangle of a Hermitian matrix; we copy this to F and also fill in the lower triangle with the adjoint of the upper. */ -void sqmatrix_copy_upper2full(sqmatrix F, sqmatrix U) -{ - int i, j; - - CHECK(F.p == U.p, "arrays not conformant"); - for (i = 0; i < U.p; ++i) { - for (j = 0; j < i; ++j) { - ASSIGN_CONJ(F.data[i*U.p + j], U.data[j*U.p + i]); - } - for (; j < U.p; ++j) - F.data[i*U.p + j] = U.data[i*U.p + j]; - } - sqmatrix_assert_hermitian(F); +void sqmatrix_copy_upper2full(sqmatrix F, sqmatrix U) { + int i, j; + + CHECK(F.p == U.p, "arrays not conformant"); + for (i = 0; i < U.p; ++i) { + for (j = 0; j < i; ++j) { + ASSIGN_CONJ(F.data[i * U.p + j], U.data[j * U.p + i]); + } + for (; j < U.p; ++j) + F.data[i * U.p + j] = U.data[i * U.p + j]; + } + sqmatrix_assert_hermitian(F); } /* Asym = (A + adjoint(A)) / 2. Asym is thus Hermitian. */ -void sqmatrix_symmetrize(sqmatrix Asym, sqmatrix A) -{ - int i, j; - - CHECK(Asym.p == A.p, "arrays not conformant"); - - for (i = 0; i < A.p; ++i) - for (j = 0; j < A.p; ++j) { - int ij = i * A.p + j, ji = j * A.p + i; - ASSIGN_SCALAR(Asym.data[ij], - 0.5 * (SCALAR_RE(A.data[ij]) + - SCALAR_RE(A.data[ji])), - 0.5 * (SCALAR_IM(A.data[ij]) - - SCALAR_IM(A.data[ji]))); - } - sqmatrix_assert_hermitian(Asym); +void sqmatrix_symmetrize(sqmatrix Asym, sqmatrix A) { + int i, j; + + CHECK(Asym.p == A.p, "arrays not conformant"); + + for (i = 0; i < A.p; ++i) + for (j = 0; j < A.p; ++j) { + int ij = i * A.p + j, ji = j * A.p + i; + ASSIGN_SCALAR(Asym.data[ij], 0.5 * (SCALAR_RE(A.data[ij]) + SCALAR_RE(A.data[ji])), + 0.5 * (SCALAR_IM(A.data[ij]) - SCALAR_IM(A.data[ji]))); + } + sqmatrix_assert_hermitian(Asym); } /* trace(U) */ -scalar sqmatrix_trace(sqmatrix U) -{ - int i; - scalar trace = SCALAR_INIT_ZERO; +scalar sqmatrix_trace(sqmatrix U) { + int i; + scalar trace = SCALAR_INIT_ZERO; - for (i = 0; i < U.p; ++i) - ACCUMULATE_SUM(trace, U.data[i*U.p + i]); + for (i = 0; i < U.p; ++i) + ACCUMULATE_SUM(trace, U.data[i * U.p + i]); - return trace; + return trace; } /* compute trace(adjoint(A) * B) */ -scalar sqmatrix_traceAtB(sqmatrix A, sqmatrix B) -{ - scalar trace; +scalar sqmatrix_traceAtB(sqmatrix A, sqmatrix B) { + scalar trace; - CHECK(A.p == B.p, "matrices not conformant"); + CHECK(A.p == B.p, "matrices not conformant"); - trace = blasglue_dotc(A.p * A.p, A.data, 1, B.data, 1); + trace = blasglue_dotc(A.p * A.p, A.data, 1, B.data, 1); - return trace; + return trace; } -/* A = B * C. If bdagger != 0, then adjoint(B) is used; similarly for C. +/* A = B * C. If bdagger != 0, then adjoint(B) is used; similarly for C. A must be distinct from B and C. Note that since the matrices are stored in row-major order, the most efficient operation should be B * adjoint(C), assuming the BLAS is sane. i.e. if C is hermitian, you should use cdagger = 1. Conversely, the worst operation is probably adjoint(B) * C. */ -void sqmatrix_AeBC(sqmatrix A, sqmatrix B, short bdagger, - sqmatrix C, short cdagger) -{ - CHECK(A.p == B.p && A.p == C.p, "matrices not conformant"); +void sqmatrix_AeBC(sqmatrix A, sqmatrix B, short bdagger, sqmatrix C, short cdagger) { + CHECK(A.p == B.p && A.p == C.p, "matrices not conformant"); - blasglue_gemm(bdagger ? 'C' : 'N', cdagger ? 'C' : 'N', A.p, A.p, A.p, - 1.0, B.data, B.p, C.data, C.p, 0.0, A.data, A.p); + blasglue_gemm(bdagger ? 'C' : 'N', cdagger ? 'C' : 'N', A.p, A.p, A.p, 1.0, B.data, B.p, C.data, + C.p, 0.0, A.data, A.p); } /* A += a B * C. bdagger, cdagger are as for sqmatrix_AeBC, above. */ -void sqmatrix_ApaBC(sqmatrix A, real a, sqmatrix B, short bdagger, - sqmatrix C, short cdagger) -{ - CHECK(A.p == B.p && A.p == C.p, "matrices not conformant"); +void sqmatrix_ApaBC(sqmatrix A, real a, sqmatrix B, short bdagger, sqmatrix C, short cdagger) { + CHECK(A.p == B.p && A.p == C.p, "matrices not conformant"); - blasglue_gemm(bdagger ? 'C' : 'N', cdagger ? 'C' : 'N', A.p, A.p, A.p, - a, B.data, B.p, C.data, C.p, 1.0, A.data, A.p); + blasglue_gemm(bdagger ? 'C' : 'N', cdagger ? 'C' : 'N', A.p, A.p, A.p, a, B.data, B.p, C.data, + C.p, 1.0, A.data, A.p); } /* A += a B */ -void sqmatrix_ApaB(sqmatrix A, real a, sqmatrix B) -{ - CHECK(A.p == B.p, "matrices not conformant"); +void sqmatrix_ApaB(sqmatrix A, real a, sqmatrix B) { + CHECK(A.p == B.p, "matrices not conformant"); - blasglue_axpy(A.p * A.p, a, B.data, 1, A.data, 1); + blasglue_axpy(A.p * A.p, a, B.data, 1, A.data, 1); } /* compute A = a*A + b*B; A and B may be equal. */ -void sqmatrix_aApbB(real a, sqmatrix A, real b, sqmatrix B) -{ - CHECK(A.p == B.p, "arrays not conformant"); +void sqmatrix_aApbB(real a, sqmatrix A, real b, sqmatrix B) { + CHECK(A.p == B.p, "arrays not conformant"); - if (a != 1.0) - blasglue_rscal(A.p * A.p, a, A.data, 1); + if (a != 1.0) blasglue_rscal(A.p * A.p, a, A.data, 1); - blasglue_axpy(A.p * A.p, b, B.data, 1, A.data, 1); + blasglue_axpy(A.p * A.p, b, B.data, 1, A.data, 1); } /* U <- 1/U. U must be Hermitian and, if positive_definite != 0, - positive-definite (e.g. U = Yt*Y). Work is a scratch matrix. + positive-definite (e.g. U = Yt*Y). Work is a scratch matrix. Returns 1 on success, 0 if failure (e.g. matrix singular) */ -int sqmatrix_invert(sqmatrix U, short positive_definite, - sqmatrix Work) -{ - int i, j; - - sqmatrix_assert_hermitian(U); - if (positive_definite) { - /* factorize U: */ - if (!lapackglue_potrf('U', U.p, U.data, U.p)) return 0; - - /* QUESTION: would it be more efficient to stop here, - returning the Cholesky factorization of U? This - could then be used to multiply by 1/U without - ever calculating the inverse explicitly. It - would probably be more numerically stable, but - how do the computational costs compare? */ - - /* Compute 1/U (upper half only) */ - if (!lapackglue_potri('U', U.p, U.data, U.p)) return 0; - } - else { - int *ipiv; - CHK_MALLOC(ipiv, int, U.p); - - CHECK(Work.p * Work.p >= U.p, "scratch matrix is too small"); - - /* factorize U: */ - if (!lapackglue_hetrf('U', U.p, U.data, U.p, - ipiv, Work.data, Work.p * Work.p)) return 0; - /* Compute 1/U (upper half only) */ - if (!lapackglue_hetri('U', U.p, U.data, U.p, ipiv, Work.data)) - return 0; - - free(ipiv); - } - - /* Now, copy the conjugate of the upper half - onto the lower half of U */ - for (i = 0; i < U.p; ++i) - for (j = i + 1; j < U.p; ++j) { - ASSIGN_CONJ(U.data[j * U.p + i], U.data[i * U.p + j]); - } - - return 1; +int sqmatrix_invert(sqmatrix U, short positive_definite, sqmatrix Work) { + int i, j; + + sqmatrix_assert_hermitian(U); + if (positive_definite) { + /* factorize U: */ + if (!lapackglue_potrf('U', U.p, U.data, U.p)) return 0; + + /* QUESTION: would it be more efficient to stop here, + returning the Cholesky factorization of U? This + could then be used to multiply by 1/U without + ever calculating the inverse explicitly. It + would probably be more numerically stable, but + how do the computational costs compare? */ + + /* Compute 1/U (upper half only) */ + if (!lapackglue_potri('U', U.p, U.data, U.p)) return 0; + } + else { + int *ipiv; + CHK_MALLOC(ipiv, int, U.p); + + CHECK(Work.p * Work.p >= U.p, "scratch matrix is too small"); + + /* factorize U: */ + if (!lapackglue_hetrf('U', U.p, U.data, U.p, ipiv, Work.data, Work.p * Work.p)) return 0; + /* Compute 1/U (upper half only) */ + if (!lapackglue_hetri('U', U.p, U.data, U.p, ipiv, Work.data)) return 0; + + free(ipiv); + } + + /* Now, copy the conjugate of the upper half + onto the lower half of U */ + for (i = 0; i < U.p; ++i) + for (j = i + 1; j < U.p; ++j) { + ASSIGN_CONJ(U.data[j * U.p + i], U.data[i * U.p + j]); + } + + return 1; } /* U <- eigenvectors of Ux=lambda B x, while B is overwritten (by its @@ -252,95 +230,87 @@ int sqmatrix_invert(sqmatrix U, short positive_definite, adjoint(U') D U', with D = diag(eigenvals). The eigenvalues are returned in ascending order. */ -void sqmatrix_gen_eigensolve(sqmatrix U, sqmatrix B, real *eigenvals, sqmatrix W) -{ - real *work; - scalar *morework; - int nwork; - - sqmatrix_assert_hermitian(U); - CHK_MALLOC(work, real, 3*U.p - 2); - if (W.p * W.p >= 3 * U.p - 1) { - morework = W.data; - nwork = W.p * W.p; - } - else { - CHK_MALLOC(morework, scalar, 3 * U.p - 1); - nwork = 3 * U.p - 1; - } - if (B.data) { - CHECK(B.p == U.p, "mismatched matrix sizes in sqmatrix_eigensolve"); - sqmatrix_assert_hermitian(B); - lapackglue_hegv(1, 'V', 'U', U.p, U.data, B.p, B.data, U.p, eigenvals, - morework, nwork, work); - } - else { - lapackglue_heev('V', 'U', U.p, U.data, U.p, eigenvals, - morework, nwork, work); - } - - if (morework != W.data) free(morework); - free(work); +void sqmatrix_gen_eigensolve(sqmatrix U, sqmatrix B, real *eigenvals, sqmatrix W) { + real *work; + scalar *morework; + int nwork; + + sqmatrix_assert_hermitian(U); + CHK_MALLOC(work, real, 3 * U.p - 2); + if (W.p * W.p >= 3 * U.p - 1) { + morework = W.data; + nwork = W.p * W.p; + } + else { + CHK_MALLOC(morework, scalar, 3 * U.p - 1); + nwork = 3 * U.p - 1; + } + if (B.data) { + CHECK(B.p == U.p, "mismatched matrix sizes in sqmatrix_eigensolve"); + sqmatrix_assert_hermitian(B); + lapackglue_hegv(1, 'V', 'U', U.p, U.data, B.p, B.data, U.p, eigenvals, morework, nwork, work); + } + else { + lapackglue_heev('V', 'U', U.p, U.data, U.p, eigenvals, morework, nwork, work); + } + + if (morework != W.data) free(morework); + free(work); } -void sqmatrix_eigensolve(sqmatrix U, real *eigenvals, sqmatrix W) -{ - sqmatrix B; - B.data = NULL; - sqmatrix_gen_eigensolve(U, B, eigenvals, W); +void sqmatrix_eigensolve(sqmatrix U, real *eigenvals, sqmatrix W) { + sqmatrix B; + B.data = NULL; + sqmatrix_gen_eigensolve(U, B, eigenvals, W); } /* Compute eigenvalues of a general (non-Hermitian) matrix A. Does not compute the eigenvectors. */ -void sqmatrix_eigenvalues(sqmatrix A, scalar_complex *eigvals) -{ - sqmatrix B; /* make a copy of A, since geev overwrites array */ - scalar *work, work1; - real *rwork; - int lwork; - B = create_sqmatrix(A.p); - sqmatrix_copy(B, A); - CHK_MALLOC(rwork, real, 2*A.p); - lapackglue_geev('N','N', A.p, B.data, A.p, eigvals, NULL,1,NULL,1, - &work1, -1, rwork); - lwork = (int) (SCALAR_RE(work1) + 0.5); - CHK_MALLOC(work, scalar, lwork); - lapackglue_geev('N','N', A.p, B.data, A.p, eigvals, NULL,1,NULL,1, - work, lwork, rwork); - free(work); - free(rwork); - destroy_sqmatrix(B); +void sqmatrix_eigenvalues(sqmatrix A, scalar_complex *eigvals) { + sqmatrix B; /* make a copy of A, since geev overwrites array */ + scalar *work, work1; + real *rwork; + int lwork; + B = create_sqmatrix(A.p); + sqmatrix_copy(B, A); + CHK_MALLOC(rwork, real, 2 * A.p); + lapackglue_geev('N', 'N', A.p, B.data, A.p, eigvals, NULL, 1, NULL, 1, &work1, -1, rwork); + lwork = (int)(SCALAR_RE(work1) + 0.5); + CHK_MALLOC(work, scalar, lwork); + lapackglue_geev('N', 'N', A.p, B.data, A.p, eigvals, NULL, 1, NULL, 1, work, lwork, rwork); + free(work); + free(rwork); + destroy_sqmatrix(B); } -/* Compute Usqrt <- sqrt(U), where U must be Hermitian positive-definite. +/* Compute Usqrt <- sqrt(U), where U must be Hermitian positive-definite. W is a work array, and must be the same size as U. Both U and W are overwritten. */ -void sqmatrix_sqrt(sqmatrix Usqrt, sqmatrix U, sqmatrix W) -{ - real *eigenvals; - - sqmatrix_assert_hermitian(U); - CHECK(Usqrt.p == U.p && U.p == W.p, "matrices not conformant"); - - CHK_MALLOC(eigenvals, real, U.p); - - sqmatrix_eigensolve(U, eigenvals, W); - - { - int i; - - /* Compute W = diag(sqrt(eigenvals)) * U; i.e. the rows of W - become the rows of U times sqrt(corresponding eigenvalue) */ - for (i = 0; i < U.p; ++i) { - CHECK(eigenvals[i] > 0, "non-positive eigenvalue"); - - blasglue_copy(U.p, U.data + i*U.p, 1, W.data + i*U.p, 1); - blasglue_rscal(U.p, sqrt(eigenvals[i]), W.data + i*U.p, 1); - } - } - - free(eigenvals); - - /* compute Usqrt = Ut * W */ - sqmatrix_AeBC(Usqrt, U, 1, W, 0); +void sqmatrix_sqrt(sqmatrix Usqrt, sqmatrix U, sqmatrix W) { + real *eigenvals; + + sqmatrix_assert_hermitian(U); + CHECK(Usqrt.p == U.p && U.p == W.p, "matrices not conformant"); + + CHK_MALLOC(eigenvals, real, U.p); + + sqmatrix_eigensolve(U, eigenvals, W); + + { + int i; + + /* Compute W = diag(sqrt(eigenvals)) * U; i.e. the rows of W + become the rows of U times sqrt(corresponding eigenvalue) */ + for (i = 0; i < U.p; ++i) { + CHECK(eigenvals[i] > 0, "non-positive eigenvalue"); + + blasglue_copy(U.p, U.data + i * U.p, 1, W.data + i * U.p, 1); + blasglue_rscal(U.p, sqrt(eigenvals[i]), W.data + i * U.p, 1); + } + } + + free(eigenvals); + + /* compute Usqrt = Ut * W */ + sqmatrix_AeBC(Usqrt, U, 1, W, 0); } diff --git a/src/matrixio/evectmatrixio.c b/src/matrixio/evectmatrixio.c index 2dc435c7..d9223dcc 100644 --- a/src/matrixio/evectmatrixio.c +++ b/src/matrixio/evectmatrixio.c @@ -26,44 +26,42 @@ #include "matrixio.h" -void evectmatrixio_writeall_raw(const char *filename, evectmatrix a) -{ - int dims[4], start[4] = {0, 0, 0, 0}; - const int rank = 4; - matrixio_id file_id, data_id; - - dims[0] = a.N; - dims[1] = a.c; - dims[2] = a.p; - dims[3] = SCALAR_NUMVALS; +void evectmatrixio_writeall_raw(const char *filename, evectmatrix a) { + int dims[4], start[4] = {0, 0, 0, 0}; + const int rank = 4; + matrixio_id file_id, data_id; - start[0] = a.Nstart; + dims[0] = a.N; + dims[1] = a.c; + dims[2] = a.p; + dims[3] = SCALAR_NUMVALS; - file_id = matrixio_create(filename); - data_id = matrixio_create_dataset(file_id, "rawdata", NULL, rank, dims); - - dims[0] = a.localN; - matrixio_write_real_data(data_id, dims, start, 1, (real *) a.data); + start[0] = a.Nstart; - matrixio_close_dataset(data_id); - matrixio_close(file_id); + file_id = matrixio_create(filename); + data_id = matrixio_create_dataset(file_id, "rawdata", NULL, rank, dims); + + dims[0] = a.localN; + matrixio_write_real_data(data_id, dims, start, 1, (real *)a.data); + + matrixio_close_dataset(data_id); + matrixio_close(file_id); } -void evectmatrixio_readall_raw(const char *filename, evectmatrix a) -{ - int rank = 4, dims[4]; - matrixio_id file_id; +void evectmatrixio_readall_raw(const char *filename, evectmatrix a) { + int rank = 4, dims[4]; + matrixio_id file_id; + + dims[0] = a.N; + dims[1] = a.c; + dims[2] = a.p; + dims[3] = SCALAR_NUMVALS; - dims[0] = a.N; - dims[1] = a.c; - dims[2] = a.p; - dims[3] = SCALAR_NUMVALS; + file_id = matrixio_open(filename, 1); - file_id = matrixio_open(filename, 1); - - CHECK(matrixio_read_real_data(file_id, "rawdata", &rank, dims, - a.localN, a.Nstart, 1, (real *) a.data), - "error reading data set in file"); + CHECK(matrixio_read_real_data(file_id, "rawdata", &rank, dims, a.localN, a.Nstart, 1, + (real *)a.data), + "error reading data set in file"); - matrixio_close(file_id); + matrixio_close(file_id); } diff --git a/src/matrixio/fieldio.c b/src/matrixio/fieldio.c index 580ff47d..6d62980f 100644 --- a/src/matrixio/fieldio.c +++ b/src/matrixio/fieldio.c @@ -28,122 +28,104 @@ #include "matrixio.h" #define TWOPI 6.2831853071795864769252867665590057683943388 -#define MAX2(a,b) ((a) > (b) ? (a) : (b)) +#define MAX2(a, b) ((a) > (b) ? (a) : (b)) -/* note that kvector here is given in the reciprocal basis +/* note that kvector here is given in the reciprocal basis ...data_id should be of length at 2*num_components */ -void fieldio_write_complex_field(scalar_complex *field, - int rank, - const int dims[3], - const int local_dims[3], - const int start[3], - int which_component, int num_components, - const real *kvector, - matrixio_id file_id, - int append, - matrixio_id data_id[]) -{ - int i, j, k, component, ri_part; - - rank = dims[2] == 1 ? (dims[1] == 1 ? 1 : 2) : 3; - - if (kvector) { - real s[3]; /* the step size between grid points dotted with k */ - scalar_complex *phasex, *phasey, *phasez; - - for (i = 0; i < 3; ++i) - s[i] = TWOPI * kvector[i] / dims[i]; - - /* cache exp(ikx) along each of the directions, for speed */ - CHK_MALLOC(phasex, scalar_complex, local_dims[0]); - CHK_MALLOC(phasey, scalar_complex, local_dims[1]); - CHK_MALLOC(phasez, scalar_complex, local_dims[2]); - for (i = 0; i < local_dims[0]; ++i) { - real phase = s[0] * (i + start[0]); - phasex[i].re = cos(phase); - phasex[i].im = sin(phase); - } - for (j = 0; j < local_dims[1]; ++j) { - real phase = s[1] * (j + start[1]); - phasey[j].re = cos(phase); - phasey[j].im = sin(phase); - } - for (k = 0; k < local_dims[2]; ++k) { - real phase = s[2] * (k + start[2]); - phasez[k].re = cos(phase); - phasez[k].im = sin(phase); - } - - /* Now, multiply field by exp(i k*r): */ - for (i = 0; i < local_dims[0]; ++i) { - scalar_complex px = phasex[i]; - - for (j = 0; j < local_dims[1]; ++j) { - scalar_complex py; - real re = phasey[j].re, im = phasey[j].im; - py.re = px.re * re - px.im * im; - py.im = px.re * im + px.im * re; - - for (k = 0; k < local_dims[2]; ++k) { - int ijk = ((i*local_dims[1] + j)*local_dims[2] + k)*3; - real p_re, p_im; - real re = phasez[k].re, im = phasez[k].im; - - p_re = py.re * re - py.im * im; - p_im = py.re * im + py.im * re; - - for (component = 0; component < 3; ++component) { - int ijkc = ijk + component; - re = field[ijkc].re; im = field[ijkc].im; - field[ijkc].re = re * p_re - im * p_im; - field[ijkc].im = im * p_re + re * p_im; - } - } - } - } - - free(phasez); - free(phasey); - free(phasex); - } - - /* write hyperslabs for each field component: */ - for (component = 0; component < num_components; ++component) - if (component == which_component || - which_component < 0) - for (ri_part = 0; ri_part < 2; ++ri_part) { - char name[] = "x.i"; - name[0] = (num_components == 1 ? 'c' : 'x') + component; - name[2] = ri_part ? 'i' : 'r'; - - if (!append) - data_id[component*2 + ri_part] = - matrixio_create_dataset(file_id, name, NULL, - rank, dims); - - matrixio_write_real_data( - data_id[component*2 + ri_part], local_dims, start, - 2 * num_components, - ri_part ? &field[component].im - : &field[component].re); - } +void fieldio_write_complex_field(scalar_complex *field, int rank, const int dims[3], + const int local_dims[3], const int start[3], int which_component, + int num_components, const real *kvector, matrixio_id file_id, + int append, matrixio_id data_id[]) { + int i, j, k, component, ri_part; + + rank = dims[2] == 1 ? (dims[1] == 1 ? 1 : 2) : 3; + + if (kvector) { + real s[3]; /* the step size between grid points dotted with k */ + scalar_complex *phasex, *phasey, *phasez; + + for (i = 0; i < 3; ++i) + s[i] = TWOPI * kvector[i] / dims[i]; + + /* cache exp(ikx) along each of the directions, for speed */ + CHK_MALLOC(phasex, scalar_complex, local_dims[0]); + CHK_MALLOC(phasey, scalar_complex, local_dims[1]); + CHK_MALLOC(phasez, scalar_complex, local_dims[2]); + for (i = 0; i < local_dims[0]; ++i) { + real phase = s[0] * (i + start[0]); + phasex[i].re = cos(phase); + phasex[i].im = sin(phase); + } + for (j = 0; j < local_dims[1]; ++j) { + real phase = s[1] * (j + start[1]); + phasey[j].re = cos(phase); + phasey[j].im = sin(phase); + } + for (k = 0; k < local_dims[2]; ++k) { + real phase = s[2] * (k + start[2]); + phasez[k].re = cos(phase); + phasez[k].im = sin(phase); + } + + /* Now, multiply field by exp(i k*r): */ + for (i = 0; i < local_dims[0]; ++i) { + scalar_complex px = phasex[i]; + + for (j = 0; j < local_dims[1]; ++j) { + scalar_complex py; + real re = phasey[j].re, im = phasey[j].im; + py.re = px.re * re - px.im * im; + py.im = px.re * im + px.im * re; + + for (k = 0; k < local_dims[2]; ++k) { + int ijk = ((i * local_dims[1] + j) * local_dims[2] + k) * 3; + real p_re, p_im; + real re = phasez[k].re, im = phasez[k].im; + + p_re = py.re * re - py.im * im; + p_im = py.re * im + py.im * re; + + for (component = 0; component < 3; ++component) { + int ijkc = ijk + component; + re = field[ijkc].re; + im = field[ijkc].im; + field[ijkc].re = re * p_re - im * p_im; + field[ijkc].im = im * p_re + re * p_im; + } + } + } + } + + free(phasez); + free(phasey); + free(phasex); + } + + /* write hyperslabs for each field component: */ + for (component = 0; component < num_components; ++component) + if (component == which_component || which_component < 0) + for (ri_part = 0; ri_part < 2; ++ri_part) { + char name[] = "x.i"; + name[0] = (num_components == 1 ? 'c' : 'x') + component; + name[2] = ri_part ? 'i' : 'r'; + + if (!append) + data_id[component * 2 + ri_part] = + matrixio_create_dataset(file_id, name, NULL, rank, dims); + + matrixio_write_real_data(data_id[component * 2 + ri_part], local_dims, start, + 2 * num_components, + ri_part ? &field[component].im : &field[component].re); + } } -void fieldio_write_real_vals(real *vals, - int rank, - const int dims[3], - const int local_dims[3], - const int start[3], - matrixio_id file_id, - int append, - const char *dataname, - matrixio_id *data_id) -{ - rank = dims[2] == 1 ? (dims[1] == 1 ? 1 : 2) : 3; - - if (!append || data_id->id < 0) - *data_id = matrixio_create_dataset(file_id, dataname, - NULL, rank,dims); - - matrixio_write_real_data(*data_id,local_dims,start,1,vals); +void fieldio_write_real_vals(real *vals, int rank, const int dims[3], const int local_dims[3], + const int start[3], matrixio_id file_id, int append, + const char *dataname, matrixio_id *data_id) { + rank = dims[2] == 1 ? (dims[1] == 1 ? 1 : 2) : 3; + + if (!append || data_id->id < 0) + *data_id = matrixio_create_dataset(file_id, dataname, NULL, rank, dims); + + matrixio_write_real_data(*data_id, local_dims, start, 1, vals); } diff --git a/src/matrixio/matrixio.c b/src/matrixio/matrixio.c index fb3e298e..20b7562d 100644 --- a/src/matrixio/matrixio.c +++ b/src/matrixio/matrixio.c @@ -46,16 +46,16 @@ this is the case. */ #ifdef HAVE_H5PSET_MPI /* old name for this routine */ -# define H5Pset_fapl_mpio H5Pset_mpi -# ifndef HAVE_H5PSET_FAPL_MPIO -# define HAVE_H5PSET_FAPL_MPIO 1 -# endif +#define H5Pset_fapl_mpio H5Pset_mpi +#ifndef HAVE_H5PSET_FAPL_MPIO +#define HAVE_H5PSET_FAPL_MPIO 1 +#endif #endif #ifdef HAVE_H5PSET_FAPL_MPIO -# define IF_EXCLUSIVE(yes,no) no +#define IF_EXCLUSIVE(yes, no) no #else -# define IF_EXCLUSIVE(yes,no) yes +#define IF_EXCLUSIVE(yes, no) yes #endif /*****************************************************************************/ @@ -64,14 +64,15 @@ can't be found, in addition to returning an error code. The following macro can be wrapped around code to temporarily suppress error messages. */ -#define SUPPRESS_HDF5_ERRORS(statements) { \ - H5E_auto_t xxxxx_err_func; \ - void *xxxxx_err_func_data; \ - H5Eget_auto(&xxxxx_err_func, &xxxxx_err_func_data); \ - H5Eset_auto(NULL, NULL); \ - { statements; } \ - H5Eset_auto(xxxxx_err_func, xxxxx_err_func_data); \ -} +#define SUPPRESS_HDF5_ERRORS(statements) \ + { \ + H5E_auto_t xxxxx_err_func; \ + void *xxxxx_err_func_data; \ + H5Eget_auto(&xxxxx_err_func, &xxxxx_err_func_data); \ + H5Eset_auto(NULL, NULL); \ + { statements; } \ + H5Eset_auto(xxxxx_err_func, xxxxx_err_func_data); \ + } /*****************************************************************************/ @@ -79,251 +80,232 @@ can *not* be attached to files, in which case we'll write/read it as an ordinary dataset. Ugh. */ -static void write_attr(matrixio_id id, matrixio_id_ type_id, - matrixio_id_ space_id, - const char *name, const void *val) -{ +static void write_attr(matrixio_id id, matrixio_id_ type_id, matrixio_id_ space_id, + const char *name, const void *val) { #if defined(HAVE_HDF5) - hid_t attr_id; + hid_t attr_id; #ifndef HAVE_H5PSET_FAPL_MPIO - if (!mpi_is_master() && id.parallel) - return; /* only one process should add attributes */ + if (!mpi_is_master() && id.parallel) return; /* only one process should add attributes */ #else - /* otherwise, the operations must be performed collectively */ -#endif - - if (H5I_FILE == H5Iget_type(id.id)) { - attr_id = H5Dcreate(id.id, name, type_id, space_id, H5P_DEFAULT); - CHECK(attr_id >= 0, "error creating HDF attr"); - H5Dwrite(attr_id, type_id, H5S_ALL, H5S_ALL, H5P_DEFAULT, val); - H5Dclose(attr_id); - } - else { - attr_id = H5Acreate(id.id, name, type_id, space_id, H5P_DEFAULT); - CHECK(attr_id >= 0, "error creating HDF attr"); - H5Awrite(attr_id, type_id, val); - H5Aclose(attr_id); - } + /* otherwise, the operations must be performed collectively */ +#endif + + if (H5I_FILE == H5Iget_type(id.id)) { + attr_id = H5Dcreate(id.id, name, type_id, space_id, H5P_DEFAULT); + CHECK(attr_id >= 0, "error creating HDF attr"); + H5Dwrite(attr_id, type_id, H5S_ALL, H5S_ALL, H5P_DEFAULT, val); + H5Dclose(attr_id); + } + else { + attr_id = H5Acreate(id.id, name, type_id, space_id, H5P_DEFAULT); + CHECK(attr_id >= 0, "error creating HDF attr"); + H5Awrite(attr_id, type_id, val); + H5Aclose(attr_id); + } #endif } -static matrixio_id open_attr(matrixio_id id, matrixio_id_ *type_id, - matrixio_id_ *space_id, const char *name) -{ - matrixio_id attr_id; - attr_id.parallel = id.parallel; - attr_id.id = -1; +static matrixio_id open_attr(matrixio_id id, matrixio_id_ *type_id, matrixio_id_ *space_id, + const char *name) { + matrixio_id attr_id; + attr_id.parallel = id.parallel; + attr_id.id = -1; #if defined(HAVE_HDF5) - if (H5I_FILE == H5Iget_type(id.id)) { - SUPPRESS_HDF5_ERRORS(attr_id.id = H5Dopen(id.id, name)); - if (attr_id.id >= 0) { - *type_id = H5Dget_type(attr_id.id); - *space_id = H5Dget_space(attr_id.id); - } - } - else { - SUPPRESS_HDF5_ERRORS(attr_id.id = H5Aopen_name(id.id, name)); - if (attr_id.id >= 0) { - *type_id = H5Aget_type(attr_id.id); - *space_id = H5Aget_space(attr_id.id); - } - } - -#endif - return attr_id; + if (H5I_FILE == H5Iget_type(id.id)) { + SUPPRESS_HDF5_ERRORS(attr_id.id = H5Dopen(id.id, name)); + if (attr_id.id >= 0) { + *type_id = H5Dget_type(attr_id.id); + *space_id = H5Dget_space(attr_id.id); + } + } + else { + SUPPRESS_HDF5_ERRORS(attr_id.id = H5Aopen_name(id.id, name)); + if (attr_id.id >= 0) { + *type_id = H5Aget_type(attr_id.id); + *space_id = H5Aget_space(attr_id.id); + } + } + +#endif + return attr_id; } -static void read_attr(matrixio_id id, matrixio_id attr_id, - matrixio_id_ mem_type_id, void *val) -{ +static void read_attr(matrixio_id id, matrixio_id attr_id, matrixio_id_ mem_type_id, void *val) { #if defined(HAVE_HDF5) - if (H5I_FILE == H5Iget_type(id.id)) { - H5Dread(attr_id.id, mem_type_id, H5S_ALL, H5S_ALL, H5P_DEFAULT, val); - } - else { - H5Aread(attr_id.id, mem_type_id, val); - } + if (H5I_FILE == H5Iget_type(id.id)) { + H5Dread(attr_id.id, mem_type_id, H5S_ALL, H5S_ALL, H5P_DEFAULT, val); + } + else { + H5Aread(attr_id.id, mem_type_id, val); + } #endif } -static void close_attr(matrixio_id id, matrixio_id attr_id) -{ +static void close_attr(matrixio_id id, matrixio_id attr_id) { #if defined(HAVE_HDF5) - if (H5I_FILE == H5Iget_type(id.id)) { - H5Dclose(attr_id.id); - } - else { - H5Aclose(attr_id.id); - } + if (H5I_FILE == H5Iget_type(id.id)) { H5Dclose(attr_id.id); } + else { + H5Aclose(attr_id.id); + } #endif } /*****************************************************************************/ -void matrixio_write_string_attr(matrixio_id id, const char *name, - const char *val) -{ +void matrixio_write_string_attr(matrixio_id id, const char *name, const char *val) { #if defined(HAVE_HDF5) - hid_t type_id; - hid_t space_id; - - if (!val || !name || !name[0] || !val[0]) - return; /* don't try to create empty attributes */ - - type_id = H5Tcopy(H5T_C_S1); - H5Tset_size(type_id, strlen(val) + 1); - space_id = H5Screate(H5S_SCALAR); - write_attr(id, type_id, space_id, name, val); - H5Sclose(space_id); - H5Tclose(type_id); + hid_t type_id; + hid_t space_id; + + if (!val || !name || !name[0] || !val[0]) return; /* don't try to create empty attributes */ + + type_id = H5Tcopy(H5T_C_S1); + H5Tset_size(type_id, strlen(val) + 1); + space_id = H5Screate(H5S_SCALAR); + write_attr(id, type_id, space_id, name, val); + H5Sclose(space_id); + H5Tclose(type_id); #endif } -void matrixio_write_data_attr(matrixio_id id, const char *name, - const real *val, int rank, const int *dims) -{ +void matrixio_write_data_attr(matrixio_id id, const char *name, const real *val, int rank, + const int *dims) { #if defined(HAVE_HDF5) - hid_t type_id; - hid_t space_id; - hsize_t *space_dims; - int i; - - if (!val || !name || !name[0] || rank < 0 || !dims) - return; /* don't try to create empty attributes */ - + hid_t type_id; + hid_t space_id; + hsize_t *space_dims; + int i; + + if (!val || !name || !name[0] || rank < 0 || !dims) + return; /* don't try to create empty attributes */ + #if defined(SCALAR_SINGLE_PREC) - type_id = H5T_NATIVE_FLOAT; + type_id = H5T_NATIVE_FLOAT; #elif defined(SCALAR_LONG_DOUBLE_PREC) - type_id = H5T_NATIVE_LDOUBLE; + type_id = H5T_NATIVE_LDOUBLE; #else - type_id = H5T_NATIVE_DOUBLE; + type_id = H5T_NATIVE_DOUBLE; #endif - if (rank > 0) { - CHK_MALLOC(space_dims, hsize_t, rank); - for (i = 0; i < rank; ++i) - space_dims[i] = dims[i]; - space_id = H5Screate_simple(rank, space_dims, NULL); - free(space_dims); - } - else { - space_id = H5Screate(H5S_SCALAR); - } - - write_attr(id, type_id, space_id, name, val); - H5Sclose(space_id); + if (rank > 0) { + CHK_MALLOC(space_dims, hsize_t, rank); + for (i = 0; i < rank; ++i) + space_dims[i] = dims[i]; + space_id = H5Screate_simple(rank, space_dims, NULL); + free(space_dims); + } + else { + space_id = H5Screate(H5S_SCALAR); + } + + write_attr(id, type_id, space_id, name, val); + H5Sclose(space_id); #endif } -char *matrixio_read_string_attr(matrixio_id id, const char *name) -{ +char *matrixio_read_string_attr(matrixio_id id, const char *name) { #if defined(HAVE_HDF5) - matrixio_id attr_id; - hid_t type_id; - hid_t space_id; - int len; - char *s = NULL; - - if (!name || !name[0]) - return NULL; /* don't try to read empty-named attributes */ - - attr_id = open_attr(id, &type_id, &space_id, name); - if (attr_id.id < 0) - return NULL; - - if (H5Sget_simple_extent_npoints(space_id) == 1) { - len = H5Tget_size(type_id); - H5Tclose(type_id); - - type_id = H5Tcopy(H5T_C_S1); - H5Tset_size(type_id, len); - - CHK_MALLOC(s, char, len); - read_attr(id, attr_id, type_id, s); - } - - H5Tclose(type_id); - H5Sclose(space_id); - close_attr(id, attr_id); - - return s; + matrixio_id attr_id; + hid_t type_id; + hid_t space_id; + int len; + char *s = NULL; + + if (!name || !name[0]) return NULL; /* don't try to read empty-named attributes */ + + attr_id = open_attr(id, &type_id, &space_id, name); + if (attr_id.id < 0) return NULL; + + if (H5Sget_simple_extent_npoints(space_id) == 1) { + len = H5Tget_size(type_id); + H5Tclose(type_id); + + type_id = H5Tcopy(H5T_C_S1); + H5Tset_size(type_id, len); + + CHK_MALLOC(s, char, len); + read_attr(id, attr_id, type_id, s); + } + + H5Tclose(type_id); + H5Sclose(space_id); + close_attr(id, attr_id); + + return s; #else - return NULL; + return NULL; #endif } -real *matrixio_read_data_attr(matrixio_id id, const char *name, - int *rank, int max_rank, int *dims) -{ +real *matrixio_read_data_attr(matrixio_id id, const char *name, int *rank, int max_rank, + int *dims) { #if defined(HAVE_HDF5) - matrixio_id attr_id; - hid_t type_id, mem_type_id, space_id; - real *d = NULL; + matrixio_id attr_id; + hid_t type_id, mem_type_id, space_id; + real *d = NULL; + + if (!name || !name[0] || max_rank < 0 || !dims) + return NULL; /* don't try to create empty attributes */ - if (!name || !name[0] || max_rank < 0 || !dims) - return NULL; /* don't try to create empty attributes */ - #if defined(SCALAR_SINGLE_PREC) - mem_type_id = H5T_NATIVE_FLOAT; + mem_type_id = H5T_NATIVE_FLOAT; #elif defined(SCALAR_LONG_DOUBLE_PREC) - mem_type_id = H5T_NATIVE_LDOUBLE; + mem_type_id = H5T_NATIVE_LDOUBLE; #else - mem_type_id = H5T_NATIVE_DOUBLE; -#endif - - attr_id = open_attr(id, &type_id, &space_id, name); - if (attr_id.id < 0) - return NULL; - - *rank = H5Sget_simple_extent_ndims(space_id); - if (*rank <= max_rank) { - if (*rank > 0) { - int i; - hsize_t *space_dims, *maxdims; - CHK_MALLOC(space_dims, hsize_t, *rank); - CHK_MALLOC(maxdims, hsize_t, *rank); - H5Sget_simple_extent_dims(space_id, space_dims, maxdims); - for (i = 0; i < *rank; ++i) - dims[i] = space_dims[i]; - free(maxdims); - free(space_dims); - } - CHK_MALLOC(d, real, H5Sget_simple_extent_npoints(space_id)); - read_attr(id, attr_id, mem_type_id, d); - } - - H5Tclose(type_id); - H5Sclose(space_id); - close_attr(id, attr_id); - - return d; + mem_type_id = H5T_NATIVE_DOUBLE; +#endif + + attr_id = open_attr(id, &type_id, &space_id, name); + if (attr_id.id < 0) return NULL; + + *rank = H5Sget_simple_extent_ndims(space_id); + if (*rank <= max_rank) { + if (*rank > 0) { + int i; + hsize_t *space_dims, *maxdims; + CHK_MALLOC(space_dims, hsize_t, *rank); + CHK_MALLOC(maxdims, hsize_t, *rank); + H5Sget_simple_extent_dims(space_id, space_dims, maxdims); + for (i = 0; i < *rank; ++i) + dims[i] = space_dims[i]; + free(maxdims); + free(space_dims); + } + CHK_MALLOC(d, real, H5Sget_simple_extent_npoints(space_id)); + read_attr(id, attr_id, mem_type_id, d); + } + + H5Tclose(type_id); + H5Sclose(space_id); + close_attr(id, attr_id); + + return d; #else - return NULL; + return NULL; #endif } /*****************************************************************************/ -#define FNAME_SUFFIX ".h5" /* standard HDF5 filename suffix */ +#define FNAME_SUFFIX ".h5" /* standard HDF5 filename suffix */ -static char *add_fname_suffix(const char *fname) -{ - int oldlen = strlen(fname); - int suflen = strlen(FNAME_SUFFIX); - char *new_fname; +static char *add_fname_suffix(const char *fname) { + int oldlen = strlen(fname); + int suflen = strlen(FNAME_SUFFIX); + char *new_fname; - CHECK(fname, "null filename!"); + CHECK(fname, "null filename!"); - CHK_MALLOC(new_fname, char, oldlen + suflen + 1); + CHK_MALLOC(new_fname, char, oldlen + suflen + 1); - strcpy(new_fname, fname); + strcpy(new_fname, fname); - /* only add suffix if it is not already there: */ - if (strstr(new_fname, FNAME_SUFFIX) != new_fname + oldlen - suflen) - strcat(new_fname, FNAME_SUFFIX); + /* only add suffix if it is not already there: */ + if (strstr(new_fname, FNAME_SUFFIX) != new_fname + oldlen - suflen) + strcat(new_fname, FNAME_SUFFIX); - return new_fname; + return new_fname; } /*****************************************************************************/ @@ -332,431 +314,396 @@ static char *add_fname_suffix(const char *fname) static int matrixio_critical_section_tag = 0; #endif -static matrixio_id matrixio_create_(const char *fname, int parallel) -{ +static matrixio_id matrixio_create_(const char *fname, int parallel) { #if defined(HAVE_HDF5) - char *new_fname; - matrixio_id id; - hid_t access_props; - - access_props = H5Pcreate (H5P_FILE_ACCESS); - -# if defined(HAVE_MPI) && defined(HAVE_H5PSET_FAPL_MPIO) - if (parallel) - CHECK(H5Pset_fapl_mpio(access_props, mpb_comm, MPI_INFO_NULL) - >= 0, "error initializing MPI file access"); -# endif - - new_fname = add_fname_suffix(fname); - -# ifdef HAVE_H5PSET_FAPL_MPIO - id.id = H5Fcreate(new_fname, H5F_ACC_TRUNC, H5P_DEFAULT, access_props); -# else - if (parallel) mpi_begin_critical_section(matrixio_critical_section_tag); - if (mpi_is_master() || !parallel) - id.id = H5Fcreate(new_fname, H5F_ACC_TRUNC,H5P_DEFAULT,access_props); - else - id.id = H5Fopen(new_fname, H5F_ACC_RDWR, access_props); -# endif - id.parallel = parallel; - - CHECK(id.id >= 0, "error creating HDF output file"); - - free(new_fname); - - H5Pclose(access_props); - - return id; + char *new_fname; + matrixio_id id; + hid_t access_props; + + access_props = H5Pcreate(H5P_FILE_ACCESS); + +#if defined(HAVE_MPI) && defined(HAVE_H5PSET_FAPL_MPIO) + if (parallel) + CHECK(H5Pset_fapl_mpio(access_props, mpb_comm, MPI_INFO_NULL) >= 0, + "error initializing MPI file access"); +#endif + + new_fname = add_fname_suffix(fname); + +#ifdef HAVE_H5PSET_FAPL_MPIO + id.id = H5Fcreate(new_fname, H5F_ACC_TRUNC, H5P_DEFAULT, access_props); #else - mpi_one_fprintf(stderr, - "matrixio: cannot output \"%s\" (compiled without HDF)\n", - fname); - { - matrixio_id id = {0,0}; - return id; - } + if (parallel) mpi_begin_critical_section(matrixio_critical_section_tag); + if (mpi_is_master() || !parallel) + id.id = H5Fcreate(new_fname, H5F_ACC_TRUNC, H5P_DEFAULT, access_props); + else + id.id = H5Fopen(new_fname, H5F_ACC_RDWR, access_props); #endif -} + id.parallel = parallel; -matrixio_id matrixio_create(const char *fname) { - return matrixio_create_(fname, 1); -} + CHECK(id.id >= 0, "error creating HDF output file"); -matrixio_id matrixio_create_serial(const char *fname) { - return matrixio_create_(fname, 0); + free(new_fname); + + H5Pclose(access_props); + + return id; +#else + mpi_one_fprintf(stderr, "matrixio: cannot output \"%s\" (compiled without HDF)\n", fname); + { + matrixio_id id = {0, 0}; + return id; + } +#endif } -static matrixio_id matrixio_open_(const char *fname, int read_only, int parallel) -{ +matrixio_id matrixio_create(const char *fname) { return matrixio_create_(fname, 1); } + +matrixio_id matrixio_create_serial(const char *fname) { return matrixio_create_(fname, 0); } + +static matrixio_id matrixio_open_(const char *fname, int read_only, int parallel) { #if defined(HAVE_HDF5) - char *new_fname; - matrixio_id id; - hid_t access_props; + char *new_fname; + matrixio_id id; + hid_t access_props; - access_props = H5Pcreate (H5P_FILE_ACCESS); - -# if defined(HAVE_MPI) && defined(HAVE_H5PSET_FAPL_MPIO) - if (parallel) - H5Pset_fapl_mpio(access_props, mpb_comm, MPI_INFO_NULL); -# endif + access_props = H5Pcreate(H5P_FILE_ACCESS); - new_fname = add_fname_suffix(fname); +#if defined(HAVE_MPI) && defined(HAVE_H5PSET_FAPL_MPIO) + if (parallel) H5Pset_fapl_mpio(access_props, mpb_comm, MPI_INFO_NULL); +#endif - IF_EXCLUSIVE(if (parallel) mpi_begin_critical_section(matrixio_critical_section_tag),0); + new_fname = add_fname_suffix(fname); - if (read_only) - id.id = H5Fopen(new_fname, H5F_ACC_RDONLY, access_props); - else - id.id = H5Fopen(new_fname, H5F_ACC_RDWR, access_props); - id.parallel = parallel; - CHECK(id.id >= 0, "error opening HDF input file"); + IF_EXCLUSIVE(if (parallel) mpi_begin_critical_section(matrixio_critical_section_tag), 0); - free(new_fname); + if (read_only) + id.id = H5Fopen(new_fname, H5F_ACC_RDONLY, access_props); + else + id.id = H5Fopen(new_fname, H5F_ACC_RDWR, access_props); + id.parallel = parallel; + CHECK(id.id >= 0, "error opening HDF input file"); - H5Pclose(access_props); + free(new_fname); - return id; + H5Pclose(access_props); + + return id; #else - CHECK(0, "no matrixio implementation is linked"); - { - matrixio_id id = {0,0}; - return id; - } + CHECK(0, "no matrixio implementation is linked"); + { + matrixio_id id = {0, 0}; + return id; + } #endif } -void matrixio_close(matrixio_id id) -{ +void matrixio_close(matrixio_id id) { #if defined(HAVE_HDF5) - CHECK(H5Fclose(id.id) >= 0, "error closing HDF file"); - IF_EXCLUSIVE(if (id.parallel) mpi_end_critical_section(matrixio_critical_section_tag++),0); + CHECK(H5Fclose(id.id) >= 0, "error closing HDF file"); + IF_EXCLUSIVE(if (id.parallel) mpi_end_critical_section(matrixio_critical_section_tag++), 0); #endif } matrixio_id matrixio_open(const char *fname, int read_only) { - return matrixio_open_(fname, read_only, 1); + return matrixio_open_(fname, read_only, 1); } matrixio_id matrixio_open_serial(const char *fname, int read_only) { - return matrixio_open_(fname, read_only, 0); + return matrixio_open_(fname, read_only, 0); } /*****************************************************************************/ -matrixio_id matrixio_create_sub(matrixio_id id, - const char *name, const char *description) -{ - matrixio_id sub_id; - sub_id.id = 0; - sub_id.parallel = id.parallel; +matrixio_id matrixio_create_sub(matrixio_id id, const char *name, const char *description) { + matrixio_id sub_id; + sub_id.id = 0; + sub_id.parallel = id.parallel; #if defined(HAVE_HDF5) -# ifdef HAVE_H5PSET_FAPL_MPIO /* H5Gcreate is collective */ - sub_id.id = H5Gcreate(id.id, name, 0 /* ==> default size */ ); - matrixio_write_string_attr(sub_id, "description", description); -# else - /* when running a parallel job, only the master process creates the - group. It flushes the group to disk and then the other processes - open the group. Is this the right thing to do, or is the - H5Gcreate function parallel-aware? */ - - if (mpi_is_master() || !id.parallel) { - sub_id.id = H5Gcreate(id.id, name, 0 /* ==> default size */ ); - matrixio_write_string_attr(sub_id, "description", description); - - H5Fflush(sub_id.id, H5F_SCOPE_GLOBAL); - - IF_EXCLUSIVE(0,if (id.parallel) MPI_Barrier(mpb_comm)); - } - else { - IF_EXCLUSIVE(0,if (id.parallel) MPI_Barrier(mpb_comm)); - - sub_id.id = H5Gopen(id.id, name); - } -# endif -#endif - return sub_id; +#ifdef HAVE_H5PSET_FAPL_MPIO /* H5Gcreate is collective */ + sub_id.id = H5Gcreate(id.id, name, 0 /* ==> default size */); + matrixio_write_string_attr(sub_id, "description", description); +#else + /* when running a parallel job, only the master process creates the + group. It flushes the group to disk and then the other processes + open the group. Is this the right thing to do, or is the + H5Gcreate function parallel-aware? */ + + if (mpi_is_master() || !id.parallel) { + sub_id.id = H5Gcreate(id.id, name, 0 /* ==> default size */); + matrixio_write_string_attr(sub_id, "description", description); + + H5Fflush(sub_id.id, H5F_SCOPE_GLOBAL); + + IF_EXCLUSIVE(0, if (id.parallel) MPI_Barrier(mpb_comm)); + } + else { + IF_EXCLUSIVE(0, if (id.parallel) MPI_Barrier(mpb_comm)); + + sub_id.id = H5Gopen(id.id, name); + } +#endif +#endif + return sub_id; } -void matrixio_close_sub(matrixio_id id) -{ +void matrixio_close_sub(matrixio_id id) { #if defined(HAVE_HDF5) - CHECK(H5Gclose(id.id) >= 0, "error closing HDF group"); + CHECK(H5Gclose(id.id) >= 0, "error closing HDF group"); #endif } /*****************************************************************************/ -matrixio_id matrixio_open_dataset(matrixio_id id, - const char *name, - int rank, const int *dims) -{ - matrixio_id data_id; - data_id.id = 0; - data_id.parallel = id.parallel; +matrixio_id matrixio_open_dataset(matrixio_id id, const char *name, int rank, const int *dims) { + matrixio_id data_id; + data_id.id = 0; + data_id.parallel = id.parallel; #if defined(HAVE_HDF5) - { - int i, rank_copy; - hid_t space_id; - hsize_t *dims_copy, *maxdims; - - CHECK((data_id.id = H5Dopen(id.id, name)) >= 0, "error in H5Dopen"); - - CHECK((space_id = H5Dget_space(data_id.id)) >= 0, - "error in H5Dget_space"); - - rank_copy = H5Sget_simple_extent_ndims(space_id); - CHECK(rank == rank_copy, "rank in HDF5 file doesn't match expected rank"); - - CHK_MALLOC(dims_copy, hsize_t, rank); - CHK_MALLOC(maxdims, hsize_t, rank); - H5Sget_simple_extent_dims(space_id, dims_copy, maxdims); - free(maxdims); - for (i = 0; i < rank; ++i) { - CHECK(dims_copy[i] == dims[i], - "array size in HDF5 file doesn't match expected size"); - } - free(dims_copy); - - H5Sclose(space_id); - } -#endif - return data_id; + { + int i, rank_copy; + hid_t space_id; + hsize_t *dims_copy, *maxdims; + + CHECK((data_id.id = H5Dopen(id.id, name)) >= 0, "error in H5Dopen"); + + CHECK((space_id = H5Dget_space(data_id.id)) >= 0, "error in H5Dget_space"); + + rank_copy = H5Sget_simple_extent_ndims(space_id); + CHECK(rank == rank_copy, "rank in HDF5 file doesn't match expected rank"); + + CHK_MALLOC(dims_copy, hsize_t, rank); + CHK_MALLOC(maxdims, hsize_t, rank); + H5Sget_simple_extent_dims(space_id, dims_copy, maxdims); + free(maxdims); + for (i = 0; i < rank; ++i) { + CHECK(dims_copy[i] == dims[i], "array size in HDF5 file doesn't match expected size"); + } + free(dims_copy); + + H5Sclose(space_id); + } +#endif + return data_id; } /*****************************************************************************/ -matrixio_id matrixio_create_dataset(matrixio_id id, - const char *name, const char *description, - int rank, const int *dims) -{ - matrixio_id data_id; - data_id.id = 0; - data_id.parallel = id.parallel; +matrixio_id matrixio_create_dataset(matrixio_id id, const char *name, const char *description, + int rank, const int *dims) { + matrixio_id data_id; + data_id.id = 0; + data_id.parallel = id.parallel; #if defined(HAVE_HDF5) - { - int i; - hid_t space_id, type_id; - hsize_t *dims_copy; - - /* delete pre-existing datasets, or we'll have an error; I think - we can only do this on the master process. (?) */ - if (matrixio_dataset_exists(id, name)) { -# ifdef HAVE_H5PSET_FAPL_MPIO /* H5Gunlink is collective */ - matrixio_dataset_delete(id, name); -# else - if (mpi_is_master() || !id.parallel) { - matrixio_dataset_delete(id, name); - H5Fflush(id.id, H5F_SCOPE_GLOBAL); - } - IF_EXCLUSIVE(0,if (id.parallel) MPI_Barrier(mpb_comm)); -# endif - } - - CHECK(rank > 0, "non-positive rank"); - - CHK_MALLOC(dims_copy, hsize_t, rank); - for (i = 0; i < rank; ++i) - dims_copy[i] = dims[i]; - - space_id = H5Screate_simple(rank, dims_copy, NULL); - - free(dims_copy); + { + int i; + hid_t space_id, type_id; + hsize_t *dims_copy; + + /* delete pre-existing datasets, or we'll have an error; I think + we can only do this on the master process. (?) */ + if (matrixio_dataset_exists(id, name)) { +#ifdef HAVE_H5PSET_FAPL_MPIO /* H5Gunlink is collective */ + matrixio_dataset_delete(id, name); +#else + if (mpi_is_master() || !id.parallel) { + matrixio_dataset_delete(id, name); + H5Fflush(id.id, H5F_SCOPE_GLOBAL); + } + IF_EXCLUSIVE(0, if (id.parallel) MPI_Barrier(mpb_comm)); +#endif + } + + CHECK(rank > 0, "non-positive rank"); + + CHK_MALLOC(dims_copy, hsize_t, rank); + for (i = 0; i < rank; ++i) + dims_copy[i] = dims[i]; + + space_id = H5Screate_simple(rank, dims_copy, NULL); + + free(dims_copy); #if defined(SCALAR_SINGLE_PREC) - type_id = H5T_NATIVE_FLOAT; + type_id = H5T_NATIVE_FLOAT; #elif defined(SCALAR_LONG_DOUBLE_PREC) - type_id = H5T_NATIVE_LDOUBLE; + type_id = H5T_NATIVE_LDOUBLE; #else - type_id = H5T_NATIVE_DOUBLE; -#endif - - /* Create the dataset. Note that, on parallel machines, H5Dcreate - should do the right thing; it is supposedly a collective operation. */ - IF_EXCLUSIVE( - if (mpi_is_master() || !id.parallel) - data_id.id = H5Dcreate(id.id,name,type_id,space_id,H5P_DEFAULT); - else - data_id.id = H5Dopen(id.id, name), - data_id.id = H5Dcreate(id.id, name, type_id, space_id, H5P_DEFAULT)); - - H5Sclose(space_id); /* the dataset should have its own copy now */ - - matrixio_write_string_attr(data_id, "description", description); - } -#endif - return data_id; + type_id = H5T_NATIVE_DOUBLE; +#endif + + /* Create the dataset. Note that, on parallel machines, H5Dcreate + should do the right thing; it is supposedly a collective operation. */ + IF_EXCLUSIVE(if (mpi_is_master() || !id.parallel) data_id.id = + H5Dcreate(id.id, name, type_id, space_id, H5P_DEFAULT); + else data_id.id = H5Dopen(id.id, name), + data_id.id = H5Dcreate(id.id, name, type_id, space_id, H5P_DEFAULT)); + + H5Sclose(space_id); /* the dataset should have its own copy now */ + + matrixio_write_string_attr(data_id, "description", description); + } +#endif + return data_id; } -void matrixio_close_dataset(matrixio_id data_id) -{ +void matrixio_close_dataset(matrixio_id data_id) { #if defined(HAVE_HDF5) - CHECK(H5Dclose(data_id.id) >= 0, "error closing HDF dataset"); + CHECK(H5Dclose(data_id.id) >= 0, "error closing HDF dataset"); #endif } -int matrixio_dataset_exists(matrixio_id id, const char *name) -{ +int matrixio_dataset_exists(matrixio_id id, const char *name) { #if defined(HAVE_HDF5) - hid_t data_id; - SUPPRESS_HDF5_ERRORS(data_id = H5Dopen(id.id, name)); - if (data_id >= 0) - H5Dclose(data_id); - return (data_id >= 0); + hid_t data_id; + SUPPRESS_HDF5_ERRORS(data_id = H5Dopen(id.id, name)); + if (data_id >= 0) H5Dclose(data_id); + return (data_id >= 0); #else - return 0; + return 0; #endif } -void matrixio_dataset_delete(matrixio_id id, const char *name) -{ +void matrixio_dataset_delete(matrixio_id id, const char *name) { #if defined(HAVE_HDF5) - H5Gunlink(id.id, name); + H5Gunlink(id.id, name); #endif } /*****************************************************************************/ -void matrixio_write_real_data(matrixio_id data_id, - const int *local_dims, const int *local_start, - int stride, - real *data) -{ +void matrixio_write_real_data(matrixio_id data_id, const int *local_dims, const int *local_start, + int stride, real *data) { #if defined(HAVE_HDF5) - int rank; - hsize_t *dims, *maxdims; - hid_t space_id, type_id, mem_space_id; - start_t *start; - hsize_t *strides, *count, count_prod; - int i; - real *data_copy; - int data_copy_stride = 1, free_data_copy = 0, do_write = 1; + int rank; + hsize_t *dims, *maxdims; + hid_t space_id, type_id, mem_space_id; + start_t *start; + hsize_t *strides, *count, count_prod; + int i; + real *data_copy; + int data_copy_stride = 1, free_data_copy = 0, do_write = 1; + + /*******************************************************************/ + /* Get dimensions of dataset */ + + space_id = H5Dget_space(data_id.id); - /*******************************************************************/ - /* Get dimensions of dataset */ - - space_id = H5Dget_space(data_id.id); + rank = H5Sget_simple_extent_ndims(space_id); - rank = H5Sget_simple_extent_ndims(space_id); - - CHK_MALLOC(dims, hsize_t, rank); - CHK_MALLOC(maxdims, hsize_t, rank); + CHK_MALLOC(dims, hsize_t, rank); + CHK_MALLOC(maxdims, hsize_t, rank); - H5Sget_simple_extent_dims(space_id, dims, maxdims); + H5Sget_simple_extent_dims(space_id, dims, maxdims); - free(maxdims); + free(maxdims); #if defined(SCALAR_SINGLE_PREC) - type_id = H5T_NATIVE_FLOAT; + type_id = H5T_NATIVE_FLOAT; #elif defined(SCALAR_LONG_DOUBLE_PREC) - type_id = H5T_NATIVE_LDOUBLE; + type_id = H5T_NATIVE_LDOUBLE; #else - type_id = H5T_NATIVE_DOUBLE; -#endif - - /*******************************************************************/ - /* if stride > 1, make a contiguous copy; hdf5 is much faster - in this case. */ - - if (stride > 1) { - int N = 1; - for (i = 0; i < rank; ++i) - N *= local_dims[i]; - CHK_MALLOC(data_copy, real, N); - if (data_copy) { - free_data_copy = 1; - for (i = 0; i < (N & 3); ++i) - data_copy[i] = data[i * stride]; - for (; i < N; i += 4) { - real d0 = data[i * stride]; - real d1 = data[(i + 1) * stride]; - real d2 = data[(i + 2) * stride]; - real d3 = data[(i + 3) * stride]; - data_copy[i] = d0; - data_copy[i+1] = d1; - data_copy[i+2] = d2; - data_copy[i+3] = d3; - } - CHECK(i == N, "bug in matrixio copy routine"); - } - else { - data_copy = data; - data_copy_stride = stride; - } - } - else - data_copy = data; - - /*******************************************************************/ - /* Before we can write the data to the data set, we must define - the dimensions and "selections" of the arrays to be read & written: */ - - CHK_MALLOC(start, start_t, rank); - CHK_MALLOC(strides, hsize_t, rank); - CHK_MALLOC(count, hsize_t, rank); - - count_prod = 1; - for (i = 0; i < rank; ++i) { - start[i] = local_start[i]; - count[i] = local_dims[i]; - strides[i] = 1; - count_prod *= count[i]; - } - - if (count_prod > 0) { - H5Sselect_hyperslab(space_id, H5S_SELECT_SET, - start, NULL, count, NULL); - - for (i = 0; i < rank; ++i) - start[i] = 0; - strides[rank - 1] = data_copy_stride; - count[rank - 1] *= data_copy_stride; - mem_space_id = H5Screate_simple(rank, count, NULL); - count[rank - 1] = local_dims[rank - 1]; - H5Sselect_hyperslab(mem_space_id, H5S_SELECT_SET, - start, data_copy_stride <= 1 ? NULL : strides, - count, NULL); - } - else { /* this can happen on leftover processes in MPI */ - H5Sselect_none(space_id); - mem_space_id = H5Scopy(space_id); /* can't create an empty space */ - H5Sselect_none(mem_space_id); - do_write = 0; /* HDF5 complains about empty dataspaces otherwise */ - } - - /*******************************************************************/ - /* Write the data, then free all the stuff we've allocated. */ - - if (do_write) - H5Dwrite(data_id.id, type_id, mem_space_id, space_id, H5P_DEFAULT, - data_copy); - - if (free_data_copy) - free(data_copy); - H5Sclose(mem_space_id); - free(count); - free(strides); - free(start); - free(dims); - H5Sclose(space_id); + type_id = H5T_NATIVE_DOUBLE; +#endif + + /*******************************************************************/ + /* if stride > 1, make a contiguous copy; hdf5 is much faster + in this case. */ + + if (stride > 1) { + int N = 1; + for (i = 0; i < rank; ++i) + N *= local_dims[i]; + CHK_MALLOC(data_copy, real, N); + if (data_copy) { + free_data_copy = 1; + for (i = 0; i < (N & 3); ++i) + data_copy[i] = data[i * stride]; + for (; i < N; i += 4) { + real d0 = data[i * stride]; + real d1 = data[(i + 1) * stride]; + real d2 = data[(i + 2) * stride]; + real d3 = data[(i + 3) * stride]; + data_copy[i] = d0; + data_copy[i + 1] = d1; + data_copy[i + 2] = d2; + data_copy[i + 3] = d3; + } + CHECK(i == N, "bug in matrixio copy routine"); + } + else { + data_copy = data; + data_copy_stride = stride; + } + } + else + data_copy = data; + + /*******************************************************************/ + /* Before we can write the data to the data set, we must define + the dimensions and "selections" of the arrays to be read & written: */ + + CHK_MALLOC(start, start_t, rank); + CHK_MALLOC(strides, hsize_t, rank); + CHK_MALLOC(count, hsize_t, rank); + + count_prod = 1; + for (i = 0; i < rank; ++i) { + start[i] = local_start[i]; + count[i] = local_dims[i]; + strides[i] = 1; + count_prod *= count[i]; + } + + if (count_prod > 0) { + H5Sselect_hyperslab(space_id, H5S_SELECT_SET, start, NULL, count, NULL); + + for (i = 0; i < rank; ++i) + start[i] = 0; + strides[rank - 1] = data_copy_stride; + count[rank - 1] *= data_copy_stride; + mem_space_id = H5Screate_simple(rank, count, NULL); + count[rank - 1] = local_dims[rank - 1]; + H5Sselect_hyperslab(mem_space_id, H5S_SELECT_SET, start, data_copy_stride <= 1 ? NULL : strides, + count, NULL); + } + else { /* this can happen on leftover processes in MPI */ + H5Sselect_none(space_id); + mem_space_id = H5Scopy(space_id); /* can't create an empty space */ + H5Sselect_none(mem_space_id); + do_write = 0; /* HDF5 complains about empty dataspaces otherwise */ + } + + /*******************************************************************/ + /* Write the data, then free all the stuff we've allocated. */ + + if (do_write) H5Dwrite(data_id.id, type_id, mem_space_id, space_id, H5P_DEFAULT, data_copy); + + if (free_data_copy) free(data_copy); + H5Sclose(mem_space_id); + free(count); + free(strides); + free(start); + free(dims); + H5Sclose(space_id); #endif } #if defined(HAVE_HDF5) /* check if the given name is a dataset in group_id, and if so set d to point to a char** with a copy of name. */ -static herr_t find_dataset(hid_t group_id, const char *name, void *d) -{ - char **dname = (char **) d; - H5G_stat_t info; - - H5Gget_objinfo(group_id, name, 1, &info); - if (info.type == H5G_DATASET) { - CHK_MALLOC(*dname, char, strlen(name) + 1); - strcpy(*dname, name); - return 1; - } - return 0; +static herr_t find_dataset(hid_t group_id, const char *name, void *d) { + char **dname = (char **)d; + H5G_stat_t info; + + H5Gget_objinfo(group_id, name, 1, &info); + if (info.type == H5G_DATASET) { + CHK_MALLOC(*dname, char, strlen(name) + 1); + strcpy(*dname, name); + return 1; + } + return 0; } #endif @@ -778,141 +725,123 @@ static herr_t find_dataset(hid_t group_id, const char *name, void *d) are ignored here. Returns NULL if the dataset could not be found in id. */ -real *matrixio_read_real_data(matrixio_id id, - const char *name, - int *rank, int *dims, - int local_dim0, int local_dim0_start, - int stride, - real *data) -{ +real *matrixio_read_real_data(matrixio_id id, const char *name, int *rank, int *dims, + int local_dim0, int local_dim0_start, int stride, real *data) { #if defined(HAVE_HDF5) - hid_t space_id, type_id, data_id, mem_space_id; - hsize_t *dims_copy, *maxdims; - char *dname; - int i; - - CHECK(*rank > 0, "non-positive rank"); - - /*******************************************************************/ - /* Open the data set and check the dimensions: */ - - if (name) { - CHK_MALLOC(dname, char, strlen(name) + 1); - strcpy(dname, name); - } - else { - if (H5Giterate(id.id, "/", NULL, find_dataset, &dname) < 0) - return NULL; - } - SUPPRESS_HDF5_ERRORS(data_id = H5Dopen(id.id, dname)); - free(dname); - if (data_id < 0) - return NULL; - - CHECK((space_id = H5Dget_space(data_id)) >= 0, - "error in H5Dget_space"); - - { - int filerank = H5Sget_simple_extent_ndims(space_id); - - if (data) { - CHECK(*rank == filerank, - "rank in HDF5 file doesn't match expected rank"); - } - else { - CHECK(*rank >= filerank, - "rank in HDF5 file is too big"); - *rank = filerank; - } - } - - CHK_MALLOC(dims_copy, hsize_t, *rank); - CHK_MALLOC(maxdims, hsize_t, *rank); - - H5Sget_simple_extent_dims(space_id, dims_copy, maxdims); - free(maxdims); - - if (data) - for (i = 0; i < *rank; ++i) { - CHECK(dims_copy[i] == dims[i], - "array size in HDF5 file doesn't match expected size"); - } - else - for (i = 0; i < *rank; ++i) - dims[i] = dims_copy[i]; + hid_t space_id, type_id, data_id, mem_space_id; + hsize_t *dims_copy, *maxdims; + char *dname; + int i; + + CHECK(*rank > 0, "non-positive rank"); + + /*******************************************************************/ + /* Open the data set and check the dimensions: */ + + if (name) { + CHK_MALLOC(dname, char, strlen(name) + 1); + strcpy(dname, name); + } + else { + if (H5Giterate(id.id, "/", NULL, find_dataset, &dname) < 0) return NULL; + } + SUPPRESS_HDF5_ERRORS(data_id = H5Dopen(id.id, dname)); + free(dname); + if (data_id < 0) return NULL; + + CHECK((space_id = H5Dget_space(data_id)) >= 0, "error in H5Dget_space"); + + { + int filerank = H5Sget_simple_extent_ndims(space_id); + + if (data) { CHECK(*rank == filerank, "rank in HDF5 file doesn't match expected rank"); } + else { + CHECK(*rank >= filerank, "rank in HDF5 file is too big"); + *rank = filerank; + } + } + + CHK_MALLOC(dims_copy, hsize_t, *rank); + CHK_MALLOC(maxdims, hsize_t, *rank); + + H5Sget_simple_extent_dims(space_id, dims_copy, maxdims); + free(maxdims); + + if (data) + for (i = 0; i < *rank; ++i) { + CHECK(dims_copy[i] == dims[i], "array size in HDF5 file doesn't match expected size"); + } + else + for (i = 0; i < *rank; ++i) + dims[i] = dims_copy[i]; #if defined(SCALAR_SINGLE_PREC) - type_id = H5T_NATIVE_FLOAT; + type_id = H5T_NATIVE_FLOAT; #elif defined(SCALAR_LONG_DOUBLE_PREC) - type_id = H5T_NATIVE_LDOUBLE; + type_id = H5T_NATIVE_LDOUBLE; #else - type_id = H5T_NATIVE_DOUBLE; -#endif - - /*******************************************************************/ - /* Before we can read the data from the data set, we must define - the dimensions and "selections" of the arrays to be read & written: */ - - if (data) { - start_t *start; - hsize_t *strides, *count; - - CHK_MALLOC(start, start_t, *rank); - CHK_MALLOC(strides, hsize_t, *rank); - CHK_MALLOC(count, hsize_t, *rank); - - for (i = 0; i < *rank; ++i) { - start[i] = 0; - strides[i] = 1; - count[i] = dims[i]; - } - - dims_copy[0] = local_dim0; - dims_copy[*rank - 1] *= stride; - start[0] = 0; - strides[*rank - 1] = stride; - count[0] = local_dim0; - mem_space_id = H5Screate_simple(*rank, dims_copy, NULL); - H5Sselect_hyperslab(mem_space_id, H5S_SELECT_SET, - start, strides, count, NULL); - - start[0] = local_dim0_start; - count[0] = local_dim0; - H5Sselect_hyperslab(space_id, H5S_SELECT_SET, - start, NULL, count, NULL); - - free(count); - free(strides); - free(start); - } - else { - int N = 1; - for (i = 0; i < *rank; ++i) - N *= dims[i]; - CHK_MALLOC(data, real, N); - - mem_space_id = H5S_ALL; - H5Sclose(space_id); - space_id = H5S_ALL; - } - - /*******************************************************************/ - /* Read the data, then free all the H5 identifiers. */ - - CHECK(H5Dread(data_id, type_id, mem_space_id, space_id, H5P_DEFAULT, - data) >= 0, - "error reading HDF5 dataset"); - - if (mem_space_id != H5S_ALL) - H5Sclose(mem_space_id); - free(dims_copy); - if (space_id != H5S_ALL) - H5Sclose(space_id); - H5Dclose(data_id); - - return data; + type_id = H5T_NATIVE_DOUBLE; +#endif + + /*******************************************************************/ + /* Before we can read the data from the data set, we must define + the dimensions and "selections" of the arrays to be read & written: */ + + if (data) { + start_t *start; + hsize_t *strides, *count; + + CHK_MALLOC(start, start_t, *rank); + CHK_MALLOC(strides, hsize_t, *rank); + CHK_MALLOC(count, hsize_t, *rank); + + for (i = 0; i < *rank; ++i) { + start[i] = 0; + strides[i] = 1; + count[i] = dims[i]; + } + + dims_copy[0] = local_dim0; + dims_copy[*rank - 1] *= stride; + start[0] = 0; + strides[*rank - 1] = stride; + count[0] = local_dim0; + mem_space_id = H5Screate_simple(*rank, dims_copy, NULL); + H5Sselect_hyperslab(mem_space_id, H5S_SELECT_SET, start, strides, count, NULL); + + start[0] = local_dim0_start; + count[0] = local_dim0; + H5Sselect_hyperslab(space_id, H5S_SELECT_SET, start, NULL, count, NULL); + + free(count); + free(strides); + free(start); + } + else { + int N = 1; + for (i = 0; i < *rank; ++i) + N *= dims[i]; + CHK_MALLOC(data, real, N); + + mem_space_id = H5S_ALL; + H5Sclose(space_id); + space_id = H5S_ALL; + } + + /*******************************************************************/ + /* Read the data, then free all the H5 identifiers. */ + + CHECK(H5Dread(data_id, type_id, mem_space_id, space_id, H5P_DEFAULT, data) >= 0, + "error reading HDF5 dataset"); + + if (mem_space_id != H5S_ALL) H5Sclose(mem_space_id); + free(dims_copy); + if (space_id != H5S_ALL) H5Sclose(space_id); + H5Dclose(data_id); + + return data; #else - CHECK(0, "no matrixio implementation is linked"); - return NULL; + CHECK(0, "no matrixio implementation is linked"); + return NULL; #endif } diff --git a/src/matrixio/matrixio.h b/src/matrixio/matrixio.h index 737a750d..cff553dc 100644 --- a/src/matrixio/matrixio.h +++ b/src/matrixio/matrixio.h @@ -22,24 +22,23 @@ #if defined(HAVE_HDF5) /* don't use new HDF5 1.8 API (which isn't even fully documented yet, grrr) */ -# define H5_USE_16_API 1 -# include +#define H5_USE_16_API 1 +#include typedef hid_t matrixio_id_; /* HDF5 changed this datatype in their interfaces starting in version 1.6.4 */ -# if H5_VERS_MAJOR > 1 \ - || (H5_VERS_MAJOR == 1 && H5_VERS_MINOR > 6) \ - || (H5_VERS_MAJOR == 1 && H5_VERS_MINOR == 6 && H5_VERS_RELEASE > 3) +#if H5_VERS_MAJOR > 1 || (H5_VERS_MAJOR == 1 && H5_VERS_MINOR > 6) || \ + (H5_VERS_MAJOR == 1 && H5_VERS_MINOR == 6 && H5_VERS_RELEASE > 3) typedef hsize_t start_t; -# else +#else typedef hssize_t start_t; -# endif +#endif #else /* no HDF */ typedef int matrixio_id_; /* dummy */ #endif typedef struct { - matrixio_id_ id; - int parallel; + matrixio_id_ id; + int parallel; } matrixio_id; extern matrixio_id matrixio_create(const char *fname); @@ -48,62 +47,40 @@ extern matrixio_id matrixio_open(const char *fname, int read_only); matrixio_id matrixio_open_serial(const char *fname, int read_only); extern void matrixio_close(matrixio_id id); -extern matrixio_id matrixio_create_sub(matrixio_id id, - const char *name, const char *description); +extern matrixio_id matrixio_create_sub(matrixio_id id, const char *name, const char *description); extern void matrixio_close_sub(matrixio_id id); -extern matrixio_id matrixio_open_dataset(matrixio_id id, - const char *name, - int rank, const int *dims); -extern matrixio_id matrixio_create_dataset(matrixio_id id, - const char *name, const char *description, - int rank, const int *dims); +extern matrixio_id matrixio_open_dataset(matrixio_id id, const char *name, int rank, + const int *dims); +extern matrixio_id matrixio_create_dataset(matrixio_id id, const char *name, + const char *description, int rank, const int *dims); extern void matrixio_close_dataset(matrixio_id data_id); extern int matrixio_dataset_exists(matrixio_id id, const char *name); extern void matrixio_dataset_delete(matrixio_id id, const char *name); -extern void matrixio_write_real_data(matrixio_id data_id, - const int *local_dims, const int *local_start, - int stride, - real *data); -extern real *matrixio_read_real_data(matrixio_id id, - const char *name, - int *rank, int *dims, - int local_dim0, int local_dim0_start, - int stride, - real *data); +extern void matrixio_write_real_data(matrixio_id data_id, const int *local_dims, + const int *local_start, int stride, real *data); +extern real *matrixio_read_real_data(matrixio_id id, const char *name, int *rank, int *dims, + int local_dim0, int local_dim0_start, int stride, real *data); -extern void matrixio_write_string_attr(matrixio_id id, const char *name, - const char *val); -extern void matrixio_write_data_attr(matrixio_id id, const char *name, - const real *val, int rank, - const int *dims); +extern void matrixio_write_string_attr(matrixio_id id, const char *name, const char *val); +extern void matrixio_write_data_attr(matrixio_id id, const char *name, const real *val, int rank, + const int *dims); extern char *matrixio_read_string_attr(matrixio_id id, const char *name); -extern real *matrixio_read_data_attr(matrixio_id id, const char *name, - int *rank, int max_rank, int *dims); +extern real *matrixio_read_data_attr(matrixio_id id, const char *name, int *rank, int max_rank, + int *dims); extern void evectmatrixio_writeall_raw(const char *filename, evectmatrix a); extern void evectmatrixio_readall_raw(const char *filename, evectmatrix a); -extern void fieldio_write_complex_field(scalar_complex *field, - int rank, - const int dims[3], - const int local_dims[3], - const int start[3], - int which_component, - int num_components, - const real kvector[3], - matrixio_id file_id, - int append, - matrixio_id data_id[]); -extern void fieldio_write_real_vals(real *vals, - int rank, - const int dims[3], - const int local_dims[3], - const int start[3], - matrixio_id file_id, - int append, - const char *dataname, - matrixio_id *data_id); +extern void fieldio_write_complex_field(scalar_complex *field, int rank, const int dims[3], + const int local_dims[3], const int start[3], + int which_component, int num_components, + const real kvector[3], matrixio_id file_id, int append, + matrixio_id data_id[]); +extern void fieldio_write_real_vals(real *vals, int rank, const int dims[3], + const int local_dims[3], const int start[3], + matrixio_id file_id, int append, const char *dataname, + matrixio_id *data_id); #endif /* MATRIXIO_H */ diff --git a/src/maxwell/imaxwell.h b/src/maxwell/imaxwell.h index fb1bdf63..f2db8ae2 100644 --- a/src/maxwell/imaxwell.h +++ b/src/maxwell/imaxwell.h @@ -25,70 +25,71 @@ #include "maxwell.h" #if defined(HAVE_LIBFFTW3) || defined(HAVE_LIBFFTW3F) || defined(HAVE_LIBFFTW3L) -# include -# ifdef HAVE_MPI -# include -# endif -# define HAVE_FFTW3 1 +#include +#ifdef HAVE_MPI +#include +#endif +#define HAVE_FFTW3 1 #elif defined(HAVE_LIBFFTW) -# include -# include -# ifdef HAVE_MPI -# include -# include -# endif +#include +#include +#ifdef HAVE_MPI +#include +#include +#endif #elif defined(HAVE_LIBDFFTW) -# include -# include -# ifdef HAVE_MPI -# include -# include -# endif +#include +#include +#ifdef HAVE_MPI +#include +#include +#endif #elif defined(HAVE_LIBSFFTW) -# include -# include -# ifdef HAVE_MPI -# include -# include -# endif +#include +#include +#ifdef HAVE_MPI +#include +#include +#endif #elif defined(HAVE_LIBXFFTW) -# include -# include -# ifdef HAVE_MPI -# include -# include -# endif +#include +#include +#ifdef HAVE_MPI +#include +#include +#endif #endif -#if defined(HAVE_LIBFFTW) || defined(HAVE_LIBDFFTW) || defined(HAVE_LIBSFFTW) || defined(HAVE_LIBXFFTW) -# define HAVE_FFTW 1 +#if defined(HAVE_LIBFFTW) || defined(HAVE_LIBDFFTW) || defined(HAVE_LIBSFFTW) || \ + defined(HAVE_LIBXFFTW) +#define HAVE_FFTW 1 #endif #include "scalar.h" #if defined(HAVE_FFTW3) -# if defined(SCALAR_SINGLE_PREC) -# define FFTW(x) fftwf_ ## x -# elif defined(SCALAR_LONG_DOUBLE_PREC) -# define FFTW(x) fftwl_ ## x -# else -# define FFTW(x) fftw_ ## x -# endif - typedef FFTW(plan) fftplan; +#if defined(SCALAR_SINGLE_PREC) +#define FFTW(x) fftwf_##x +#elif defined(SCALAR_LONG_DOUBLE_PREC) +#define FFTW(x) fftwl_##x +#else +#define FFTW(x) fftw_##x +#endif +typedef FFTW(plan) fftplan; #elif defined(HAVE_FFTW) -# ifdef HAVE_MPI -# ifdef SCALAR_COMPLEX - typedef fftwnd_mpi_plan fftplan; -# else - typedef rfftwnd_mpi_plan fftplan; -# endif -# else -# ifdef SCALAR_COMPLEX - typedef fftwnd_plan fftplan; -# else - typedef rfftwnd_plan fftplan; -# endif -# endif +#ifdef HAVE_MPI +#ifdef SCALAR_COMPLEX +typedef fftwnd_mpi_plan fftplan; +#else +typedef rfftwnd_mpi_plan fftplan; +#endif +#else +#ifdef SCALAR_COMPLEX +typedef fftwnd_plan fftplan; +#else +typedef rfftwnd_plan fftplan; +#endif +#endif #endif #endif /* IMAXWELL_H */ diff --git a/src/maxwell/maxwell.c b/src/maxwell/maxwell.c index 6ef8f057..25bb48e4 100644 --- a/src/maxwell/maxwell.c +++ b/src/maxwell/maxwell.c @@ -24,333 +24,306 @@ /* This file is has too many #ifdef's...blech. */ -#define MIN2(a,b) ((a) < (b) ? (a) : (b)) -#define MAX2(a,b) ((a) > (b) ? (a) : (b)) - -maxwell_data *create_maxwell_data(int nx, int ny, int nz, - int *local_N, int *N_start, int *alloc_N, - int num_bands, - int max_fft_bands) -{ - int n[3], rank = (nz == 1) ? (ny == 1 ? 1 : 2) : 3; - maxwell_data *d = 0; - int fft_data_size; - - n[0] = nx; - n[1] = ny; - n[2] = nz; +#define MIN2(a, b) ((a) < (b) ? (a) : (b)) +#define MAX2(a, b) ((a) > (b) ? (a) : (b)) + +maxwell_data *create_maxwell_data(int nx, int ny, int nz, int *local_N, int *N_start, int *alloc_N, + int num_bands, int max_fft_bands) { + int n[3], rank = (nz == 1) ? (ny == 1 ? 1 : 2) : 3; + maxwell_data *d = 0; + int fft_data_size; + + n[0] = nx; + n[1] = ny; + n[2] = nz; #if !defined(HAVE_FFTW) && !defined(HAVE_FFTW3) -# error Non-FFTW FFTs are not currently supported. +#error Non-FFTW FFTs are not currently supported. #endif - #if defined(HAVE_FFTW) - CHECK(sizeof(fftw_real) == sizeof(real), - "floating-point type is inconsistent with FFTW!"); + CHECK(sizeof(fftw_real) == sizeof(real), "floating-point type is inconsistent with FFTW!"); #endif - CHK_MALLOC(d, maxwell_data, 1); + CHK_MALLOC(d, maxwell_data, 1); - d->nx = nx; - d->ny = ny; - d->nz = nz; + d->nx = nx; + d->ny = ny; + d->nz = nz; - d->max_fft_bands = MIN2(num_bands, max_fft_bands); - maxwell_set_num_bands(d, num_bands); + d->max_fft_bands = MIN2(num_bands, max_fft_bands); + maxwell_set_num_bands(d, num_bands); - d->current_k[0] = d->current_k[1] = d->current_k[2] = 0.0; - d->parity = NO_PARITY; + d->current_k[0] = d->current_k[1] = d->current_k[2] = 0.0; + d->parity = NO_PARITY; - d->last_dim_size = d->last_dim = n[rank - 1]; + d->last_dim_size = d->last_dim = n[rank - 1]; - /* ----------------------------------------------------- */ - d->nplans = 1; + /* ----------------------------------------------------- */ + d->nplans = 1; #ifndef HAVE_MPI - d->local_nx = nx; d->local_ny = ny; - d->local_x_start = d->local_y_start = 0; - *local_N = *alloc_N = nx * ny * nz; - *N_start = 0; - d->other_dims = *local_N / d->last_dim; - - d->fft_data = 0; /* initialize it here for use in specific planner? */ - -# if defined(HAVE_FFTW3) - d->nplans = 0; /* plans will be created as needed */ -# ifdef SCALAR_COMPLEX - d->fft_output_size = fft_data_size = nx * ny * nz; -# else - d->last_dim_size = 2 * (d->last_dim / 2 + 1); - d->fft_output_size = (fft_data_size = d->other_dims * d->last_dim_size)/2; -# endif - -# elif defined(HAVE_FFTW) -# ifdef SCALAR_COMPLEX - d->fft_output_size = fft_data_size = nx * ny * nz; - d->plans[0] = fftwnd_create_plan_specific(rank, n, FFTW_BACKWARD, - FFTW_ESTIMATE | FFTW_IN_PLACE, - (fftw_complex*) d->fft_data, - 3 * d->num_fft_bands, - (fftw_complex*) d->fft_data, - 3 * d->num_fft_bands); - d->iplans[0] = fftwnd_create_plan_specific(rank, n, FFTW_FORWARD, - FFTW_ESTIMATE | FFTW_IN_PLACE, - (fftw_complex*) d->fft_data, - 3 * d->num_fft_bands, - (fftw_complex*) d->fft_data, - 3 * d->num_fft_bands); -# else /* not SCALAR_COMPLEX */ - d->last_dim_size = 2 * (d->last_dim / 2 + 1); - d->fft_output_size = (fft_data_size = d->other_dims * d->last_dim_size)/2; - d->plans[0] = rfftwnd_create_plan_specific(rank, n, FFTW_COMPLEX_TO_REAL, - FFTW_ESTIMATE | FFTW_IN_PLACE, - (fftw_real*) d->fft_data, - 3 * d->num_fft_bands, - (fftw_real*) d->fft_data, - 3 * d->num_fft_bands); - d->iplans[0] = rfftwnd_create_plan_specific(rank, n, FFTW_REAL_TO_COMPLEX, - FFTW_ESTIMATE | FFTW_IN_PLACE, - (fftw_real*) d->fft_data, - 3 * d->num_fft_bands, - (fftw_real*) d->fft_data, - 3 * d->num_fft_bands); -# endif /* not SCALAR_COMPLEX */ -# endif /* HAVE_FFTW */ + d->local_nx = nx; + d->local_ny = ny; + d->local_x_start = d->local_y_start = 0; + *local_N = *alloc_N = nx * ny * nz; + *N_start = 0; + d->other_dims = *local_N / d->last_dim; + + d->fft_data = 0; /* initialize it here for use in specific planner? */ + +#if defined(HAVE_FFTW3) + d->nplans = 0; /* plans will be created as needed */ +#ifdef SCALAR_COMPLEX + d->fft_output_size = fft_data_size = nx * ny * nz; +#else + d->last_dim_size = 2 * (d->last_dim / 2 + 1); + d->fft_output_size = (fft_data_size = d->other_dims * d->last_dim_size) / 2; +#endif + +#elif defined(HAVE_FFTW) +#ifdef SCALAR_COMPLEX + d->fft_output_size = fft_data_size = nx * ny * nz; + d->plans[0] = fftwnd_create_plan_specific(rank, n, FFTW_BACKWARD, FFTW_ESTIMATE | FFTW_IN_PLACE, + (fftw_complex *)d->fft_data, 3 * d->num_fft_bands, + (fftw_complex *)d->fft_data, 3 * d->num_fft_bands); + d->iplans[0] = fftwnd_create_plan_specific(rank, n, FFTW_FORWARD, FFTW_ESTIMATE | FFTW_IN_PLACE, + (fftw_complex *)d->fft_data, 3 * d->num_fft_bands, + (fftw_complex *)d->fft_data, 3 * d->num_fft_bands); +#else /* not SCALAR_COMPLEX */ + d->last_dim_size = 2 * (d->last_dim / 2 + 1); + d->fft_output_size = (fft_data_size = d->other_dims * d->last_dim_size) / 2; + d->plans[0] = rfftwnd_create_plan_specific( + rank, n, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE, (fftw_real *)d->fft_data, + 3 * d->num_fft_bands, (fftw_real *)d->fft_data, 3 * d->num_fft_bands); + d->iplans[0] = rfftwnd_create_plan_specific( + rank, n, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE, (fftw_real *)d->fft_data, + 3 * d->num_fft_bands, (fftw_real *)d->fft_data, 3 * d->num_fft_bands); +#endif /* not SCALAR_COMPLEX */ +#endif /* HAVE_FFTW */ #else /* HAVE_MPI */ - /* ----------------------------------------------------- */ + /* ----------------------------------------------------- */ -# if defined(HAVE_FFTW3) -{ - int i; - ptrdiff_t np[3], local_nx, local_ny, local_x_start, local_y_start; +#if defined(HAVE_FFTW3) + { + int i; + ptrdiff_t np[3], local_nx, local_ny, local_x_start, local_y_start; - CHECK(rank > 1, "rank < 2 MPI computations are not supported"); + CHECK(rank > 1, "rank < 2 MPI computations are not supported"); - d->nplans = 0; /* plans will be created as needed */ + d->nplans = 0; /* plans will be created as needed */ - for (i = 0; i < rank; ++i) np[i] = n[i]; + for (i = 0; i < rank; ++i) + np[i] = n[i]; -# ifndef SCALAR_COMPLEX - d->last_dim_size = 2 * (np[rank-1] = d->last_dim / 2 + 1); -# endif +#ifndef SCALAR_COMPLEX + d->last_dim_size = 2 * (np[rank - 1] = d->last_dim / 2 + 1); +#endif - fft_data_size = *alloc_N - = FFTW(mpi_local_size_transposed)(rank, np, mpb_comm, - &local_nx, &local_x_start, - &local_ny, &local_y_start); -# ifndef SCALAR_COMPLEX - fft_data_size = (*alloc_N *= 2); // convert to # of real scalars -# endif + fft_data_size = *alloc_N = FFTW(mpi_local_size_transposed)( + rank, np, mpb_comm, &local_nx, &local_x_start, &local_ny, &local_y_start); +#ifndef SCALAR_COMPLEX + fft_data_size = (*alloc_N *= 2); // convert to # of real scalars +#endif - d->local_nx = local_nx; - d->local_x_start = local_x_start; - d->local_ny = local_ny; - d->local_y_start = local_y_start; + d->local_nx = local_nx; + d->local_x_start = local_x_start; + d->local_ny = local_ny; + d->local_y_start = local_y_start; - d->fft_output_size = nx * d->local_ny * (rank==3 ? np[2] : nz); - *local_N = d->local_nx * ny * nz; - *N_start = d->local_x_start * ny * nz; - d->other_dims = *local_N / d->last_dim; -} -# elif defined(HAVE_FFTW) + d->fft_output_size = nx * d->local_ny * (rank == 3 ? np[2] : nz); + *local_N = d->local_nx * ny * nz; + *N_start = d->local_x_start * ny * nz; + d->other_dims = *local_N / d->last_dim; + } +#elif defined(HAVE_FFTW) - CHECK(rank > 1, "rank < 2 MPI computations are not supported"); + CHECK(rank > 1, "rank < 2 MPI computations are not supported"); -# ifdef SCALAR_COMPLEX - d->iplans[0] = fftwnd_mpi_create_plan(mpb_comm, rank, n, - FFTW_FORWARD, - FFTW_ESTIMATE | FFTW_IN_PLACE); - { - int nt[3]; /* transposed dimensions for reverse FFT */ - nt[0] = n[1]; nt[1] = n[0]; nt[2] = n[2]; - d->plans[0] = fftwnd_mpi_create_plan(mpb_comm, rank, nt, - FFTW_BACKWARD, - FFTW_ESTIMATE | FFTW_IN_PLACE); - } +#ifdef SCALAR_COMPLEX + d->iplans[0] = + fftwnd_mpi_create_plan(mpb_comm, rank, n, FFTW_FORWARD, FFTW_ESTIMATE | FFTW_IN_PLACE); + { + int nt[3]; /* transposed dimensions for reverse FFT */ + nt[0] = n[1]; + nt[1] = n[0]; + nt[2] = n[2]; + d->plans[0] = + fftwnd_mpi_create_plan(mpb_comm, rank, nt, FFTW_BACKWARD, FFTW_ESTIMATE | FFTW_IN_PLACE); + } - fftwnd_mpi_local_sizes(d->iplans[0], &d->local_nx, &d->local_x_start, - &d->local_ny, &d->local_y_start, - &fft_data_size); + fftwnd_mpi_local_sizes(d->iplans[0], &d->local_nx, &d->local_x_start, &d->local_ny, + &d->local_y_start, &fft_data_size); - d->fft_output_size = nx * d->local_ny * nz; + d->fft_output_size = nx * d->local_ny * nz; -# else /* not SCALAR_COMPLEX */ +#else /* not SCALAR_COMPLEX */ - CHECK(rank > 1, "rank < 2 MPI computations are not supported"); + CHECK(rank > 1, "rank < 2 MPI computations are not supported"); - d->iplans[0] = rfftwnd_mpi_create_plan(mpb_comm, rank, n, - FFTW_REAL_TO_COMPLEX, - FFTW_ESTIMATE | FFTW_IN_PLACE); + d->iplans[0] = rfftwnd_mpi_create_plan(mpb_comm, rank, n, FFTW_REAL_TO_COMPLEX, + FFTW_ESTIMATE | FFTW_IN_PLACE); - /* Unlike fftwnd_mpi, we do *not* pass transposed dimensions for - the reverse transform here--we always pass the dimensions of the - original real array, and rfftwnd_mpi assumes that if one - transform is transposed, then the other is as well. */ - d->plans[0] = rfftwnd_mpi_create_plan(mpb_comm, rank, n, - FFTW_COMPLEX_TO_REAL, - FFTW_ESTIMATE | FFTW_IN_PLACE); + /* Unlike fftwnd_mpi, we do *not* pass transposed dimensions for + the reverse transform here--we always pass the dimensions of the + original real array, and rfftwnd_mpi assumes that if one + transform is transposed, then the other is as well. */ + d->plans[0] = rfftwnd_mpi_create_plan(mpb_comm, rank, n, FFTW_COMPLEX_TO_REAL, + FFTW_ESTIMATE | FFTW_IN_PLACE); - rfftwnd_mpi_local_sizes(d->iplans[0], &d->local_nx, &d->local_x_start, - &d->local_ny, &d->local_y_start, - &fft_data_size); + rfftwnd_mpi_local_sizes(d->iplans[0], &d->local_nx, &d->local_x_start, &d->local_ny, + &d->local_y_start, &fft_data_size); - d->last_dim_size = 2 * (d->last_dim / 2 + 1); - if (rank == 2) - d->fft_output_size = nx * d->local_ny * nz; - else - d->fft_output_size = nx * d->local_ny * (d->last_dim_size / 2); + d->last_dim_size = 2 * (d->last_dim / 2 + 1); + if (rank == 2) + d->fft_output_size = nx * d->local_ny * nz; + else + d->fft_output_size = nx * d->local_ny * (d->last_dim_size / 2); -# endif /* not SCALAR_COMPLEX */ +#endif /* not SCALAR_COMPLEX */ - *local_N = d->local_nx * ny * nz; - *N_start = d->local_x_start * ny * nz; - *alloc_N = *local_N; - d->other_dims = *local_N / d->last_dim; + *local_N = d->local_nx * ny * nz; + *N_start = d->local_x_start * ny * nz; + *alloc_N = *local_N; + d->other_dims = *local_N / d->last_dim; -# endif /* HAVE_FFTW */ +#endif /* HAVE_FFTW */ #endif /* HAVE_MPI */ - /* ----------------------------------------------------- */ + /* ----------------------------------------------------- */ #ifdef HAVE_FFTW - CHECK(d->plans[0] && d->iplans[0], "FFTW plan creation failed"); + CHECK(d->plans[0] && d->iplans[0], "FFTW plan creation failed"); #endif - CHK_MALLOC(d->eps_inv, symmetric_matrix, d->fft_output_size); - d->mu_inv = NULL; + CHK_MALLOC(d->eps_inv, symmetric_matrix, d->fft_output_size); + d->mu_inv = NULL; - /* A scratch output array is required because the "ordinary" arrays - are not in a cartesian basis (or even a constant basis). */ - fft_data_size *= d->max_fft_bands; + /* A scratch output array is required because the "ordinary" arrays + are not in a cartesian basis (or even a constant basis). */ + fft_data_size *= d->max_fft_bands; #if defined(HAVE_FFTW3) - d->fft_data = (scalar *) FFTW(malloc)(sizeof(scalar) * 3 * fft_data_size); - CHECK(d->fft_data, "out of memory!"); - d->fft_data2 = d->fft_data; /* works in-place */ + d->fft_data = (scalar *)FFTW(malloc)(sizeof(scalar) * 3 * fft_data_size); + CHECK(d->fft_data, "out of memory!"); + d->fft_data2 = d->fft_data; /* works in-place */ #else - CHK_MALLOC(d->fft_data, scalar, 3 * fft_data_size); - d->fft_data2 = d->fft_data; /* works in-place */ + CHK_MALLOC(d->fft_data, scalar, 3 * fft_data_size); + d->fft_data2 = d->fft_data; /* works in-place */ #endif - CHK_MALLOC(d->k_plus_G, k_data, *local_N); - CHK_MALLOC(d->k_plus_G_normsqr, real, *local_N); + CHK_MALLOC(d->k_plus_G, k_data, *local_N); + CHK_MALLOC(d->k_plus_G_normsqr, real, *local_N); - d->eps_inv_mean = 1.0; - d->mu_inv_mean = 1.0; + d->eps_inv_mean = 1.0; + d->mu_inv_mean = 1.0; - d->local_N = *local_N; - d->N_start = *N_start; - d->alloc_N = *alloc_N; - d->N = nx * ny * nz; + d->local_N = *local_N; + d->N_start = *N_start; + d->alloc_N = *alloc_N; + d->N = nx * ny * nz; - return d; + return d; } -void destroy_maxwell_data(maxwell_data *d) -{ - if (d) { - int i; +void destroy_maxwell_data(maxwell_data *d) { + if (d) { + int i; - for (i = 0; i < d->nplans; ++i) { + for (i = 0; i < d->nplans; ++i) { #if defined(HAVE_FFTW3) - FFTW(destroy_plan)((fftplan) (d->plans[i])); - FFTW(destroy_plan)((fftplan) (d->iplans[i])); + FFTW(destroy_plan)((fftplan)(d->plans[i])); + FFTW(destroy_plan)((fftplan)(d->iplans[i])); #elif defined(HAVE_FFTW) -# ifdef HAVE_MPI -# ifdef SCALAR_COMPLEX - fftwnd_mpi_destroy_plan((fftplan) (d->plans[i])); - fftwnd_mpi_destroy_plan((fftplan) (d->iplans[i])); -# else /* not SCALAR_COMPLEX */ - rfftwnd_mpi_destroy_plan((fftplan) (d->plans[i])); - rfftwnd_mpi_destroy_plan((fftplan) (d->iplans[i])); -# endif /* not SCALAR_COMPLEX */ -# else /* not HAVE_MPI */ -# ifdef SCALAR_COMPLEX - fftwnd_destroy_plan((fftplan) (d->plans[i])); - fftwnd_destroy_plan((fftplan) (d->iplans[i])); -# else /* not SCALAR_COMPLEX */ - rfftwnd_destroy_plan((fftplan) (d->plans[i])); - rfftwnd_destroy_plan((fftplan) (d->iplans[i])); -# endif /* not SCALAR_COMPLEX */ -# endif /* not HAVE_MPI */ +#ifdef HAVE_MPI +#ifdef SCALAR_COMPLEX + fftwnd_mpi_destroy_plan((fftplan)(d->plans[i])); + fftwnd_mpi_destroy_plan((fftplan)(d->iplans[i])); +#else /* not SCALAR_COMPLEX */ + rfftwnd_mpi_destroy_plan((fftplan)(d->plans[i])); + rfftwnd_mpi_destroy_plan((fftplan)(d->iplans[i])); +#endif /* not SCALAR_COMPLEX */ +#else /* not HAVE_MPI */ +#ifdef SCALAR_COMPLEX + fftwnd_destroy_plan((fftplan)(d->plans[i])); + fftwnd_destroy_plan((fftplan)(d->iplans[i])); +#else /* not SCALAR_COMPLEX */ + rfftwnd_destroy_plan((fftplan)(d->plans[i])); + rfftwnd_destroy_plan((fftplan)(d->iplans[i])); +#endif /* not SCALAR_COMPLEX */ +#endif /* not HAVE_MPI */ #endif /* HAVE FFTW */ - } + } - free(d->eps_inv); - if (d->mu_inv) free(d->mu_inv); + free(d->eps_inv); + if (d->mu_inv) free(d->mu_inv); #if defined(HAVE_FFTW3) - FFTW(free)(d->fft_data); - if (d->fft_data2 != d->fft_data) - FFTW(free)(d->fft_data2); + FFTW(free)(d->fft_data); + if (d->fft_data2 != d->fft_data) FFTW(free)(d->fft_data2); #else - free(d->fft_data); + free(d->fft_data); #endif - free(d->k_plus_G); - free(d->k_plus_G_normsqr); + free(d->k_plus_G); + free(d->k_plus_G_normsqr); - free(d); - } + free(d); + } } -void maxwell_set_num_bands(maxwell_data *d, int num_bands) -{ - d->num_bands = num_bands; - d->num_fft_bands = MIN2(num_bands, d->max_fft_bands); +void maxwell_set_num_bands(maxwell_data *d, int num_bands) { + d->num_bands = num_bands; + d->num_fft_bands = MIN2(num_bands, d->max_fft_bands); } /* compute a = b x c */ -static void compute_cross(real *a0, real *a1, real *a2, - real b0, real b1, real b2, - real c0, real c1, real c2) -{ - *a0 = b1 * c2 - b2 * c1; - *a1 = b2 * c0 - b0 * c2; - *a2 = b0 * c1 - b1 * c0; +static void compute_cross(real *a0, real *a1, real *a2, real b0, real b1, real b2, real c0, real c1, + real c2) { + *a0 = b1 * c2 - b2 * c1; + *a1 = b2 * c0 - b0 * c2; + *a2 = b0 * c1 - b1 * c0; } // kdom is returned in Cartesian coordinates -void maxwell_dominant_planewave(maxwell_data *d, evectmatrix H, int band, double kdom[3]) -{ - double max_amp = 0; - int max_i = 0, i = 0; - - CHECK(d != NULL, "maxwell_data is NULL"); - CHECK(1 <= band && band <= H.p, "band out of range"); - - /* find biggest planewave component in band 'band' of H on current process: */ - for (i = 0; i < H.localN; ++i) { - double amp = SCALAR_NORMSQR(H.data[(i*2+0)*H.p + band-1]) + SCALAR_NORMSQR(H.data[(i*2+1)*H.p + band-1]); - if (amp > max_amp) { - max_amp = amp; - max_i = i; - } +void maxwell_dominant_planewave(maxwell_data *d, evectmatrix H, int band, double kdom[3]) { + double max_amp = 0; + int max_i = 0, i = 0; + + CHECK(d != NULL, "maxwell_data is NULL"); + CHECK(1 <= band && band <= H.p, "band out of range"); + + /* find biggest planewave component in band 'band' of H on current process: */ + for (i = 0; i < H.localN; ++i) { + double amp = SCALAR_NORMSQR(H.data[(i * 2 + 0) * H.p + band - 1]) + + SCALAR_NORMSQR(H.data[(i * 2 + 1) * H.p + band - 1]); + if (amp > max_amp) { + max_amp = amp; + max_i = i; } + } - k_data cur_k = d->k_plus_G[max_i]; + k_data cur_k = d->k_plus_G[max_i]; - /* set kdom to cur_k.kmag * (cross product of cur_k.m and cur_k.n) */ - compute_cross(&kdom[0], &kdom[1], &kdom[2], - cur_k.mx, cur_k.my, cur_k.mz, - cur_k.nx, cur_k.ny, cur_k.nz); - kdom[0] *= cur_k.kmag; - kdom[1] *= cur_k.kmag; - kdom[2] *= cur_k.kmag; + /* set kdom to cur_k.kmag * (cross product of cur_k.m and cur_k.n) */ + compute_cross(&kdom[0], &kdom[1], &kdom[2], cur_k.mx, cur_k.my, cur_k.mz, cur_k.nx, cur_k.ny, + cur_k.nz); + kdom[0] *= cur_k.kmag; + kdom[1] *= cur_k.kmag; + kdom[2] *= cur_k.kmag; #ifdef HAVE_MPI - struct { - double amp; - int rank; - } local_res, global_res; + struct { + double amp; + int rank; + } local_res, global_res; - /* find kdom of biggest max_amp across all processes */ - local_res.amp = max_amp; - local_res.rank = my_global_rank(); + /* find kdom of biggest max_amp across all processes */ + local_res.amp = max_amp; + local_res.rank = my_global_rank(); - global_res.amp = 0; - global_res.rank = 0; + global_res.amp = 0; + global_res.rank = 0; - MPI_Allreduce(&local_res, &global_res, 1, MPI_DOUBLE_INT, MPI_MAXLOC, mpb_comm); - MPI_Bcast(kdom, 3, MPI_DOUBLE, global_res.rank, mpb_comm); + MPI_Allreduce(&local_res, &global_res, 1, MPI_DOUBLE_INT, MPI_MAXLOC, mpb_comm); + MPI_Bcast(kdom, 3, MPI_DOUBLE, global_res.rank, mpb_comm); #endif } @@ -359,197 +332,184 @@ void maxwell_dominant_planewave(maxwell_data *d, evectmatrix H, int band, double and s and p are the amplitudes of the "s and p" polarizations relative to the plane normal to (k+G) x axis. (It is an error if k+G is parallel to the axis.) */ -void maxwell_set_planewave(maxwell_data *d, evectmatrix H, int band, - int g[3], scalar s, scalar p, - real axis[3]) -{ +void maxwell_set_planewave(maxwell_data *d, evectmatrix H, int band, int g[3], scalar s, scalar p, + real axis[3]) { #ifdef SCALAR_COMPLEX - int x, y, z, i; - - /* coordinate of the G vector — minus sign is - due to sign convention in FFTW transform */ - x = g[0] > 0 ? d->nx - g[0] : -g[0]; - y = g[1] > 0 ? d->ny - g[1] : -g[1]; - z = g[2] > 0 ? d->nz - g[2] : -g[2]; - CHECK(x >= 0 && y >= 0 && z >= 0 && - x < d->nx && y < d->ny && z < d->nz, - "invalid planewave order"); - - for (i = 0; i < H.localN; ++i) { - ASSIGN_ZERO(H.data[(i*2+0)*H.p + band-1]); - ASSIGN_ZERO(H.data[(i*2+1)*H.p + band-1]); - } - - if (x >= d->local_x_start && x < d->local_x_start + d->local_nx) { - real sx, sy, sz, px, py, pz, kx, ky, kz, len; - scalar Hx, Hy, Hz; - k_data k; - - i = ((x - d->local_x_start) * d->ny + y) * d->nz + z; - k = d->k_plus_G[i]; - - compute_cross(&kx, &ky, &kz, /* unit vector in direction of k+G */ - k.mx, k.my, k.mz, - k.nx, k.ny, k.nz); - compute_cross(&px, &py, &pz, /* direction of p axis for H field */ - kx, ky, kz, axis[0], axis[1], axis[2]); - len = sqrt(px*px + py*py + pz*pz); - CHECK(len > 0, "invalid planewave axis parallel to k+G"); - px /= len; py /= len; pz /= len; - compute_cross(&sx, &sy, &sz, /* direction of s axis for H field */ - kx, ky, kz, px, py, pz); - - ASSIGN_SCALAR(Hx, s.re*sx+p.re*px, s.im*sx+p.im*px); - ASSIGN_SCALAR(Hy, s.re*sy+p.re*py, s.im*sy+p.im*py); - ASSIGN_SCALAR(Hz, s.re*sz+p.re*pz, s.im*sz+p.im*pz); - ASSIGN_SCALAR(H.data[(i*2+0)*H.p + band-1], - Hx.re*k.mx+Hy.re*k.my+Hz.re*k.mz, - Hx.im*k.mx+Hy.im*k.my+Hz.im*k.mz); - ASSIGN_SCALAR(H.data[(i*2+1)*H.p + band-1], - Hx.re*k.nx+Hy.re*k.ny+Hz.re*k.nz, - Hx.im*k.nx+Hy.im*k.ny+Hz.im*k.nz); - } + int x, y, z, i; + + /* coordinate of the G vector — minus sign is + due to sign convention in FFTW transform */ + x = g[0] > 0 ? d->nx - g[0] : -g[0]; + y = g[1] > 0 ? d->ny - g[1] : -g[1]; + z = g[2] > 0 ? d->nz - g[2] : -g[2]; + CHECK(x >= 0 && y >= 0 && z >= 0 && x < d->nx && y < d->ny && z < d->nz, + "invalid planewave order"); + + for (i = 0; i < H.localN; ++i) { + ASSIGN_ZERO(H.data[(i * 2 + 0) * H.p + band - 1]); + ASSIGN_ZERO(H.data[(i * 2 + 1) * H.p + band - 1]); + } + + if (x >= d->local_x_start && x < d->local_x_start + d->local_nx) { + real sx, sy, sz, px, py, pz, kx, ky, kz, len; + scalar Hx, Hy, Hz; + k_data k; + + i = ((x - d->local_x_start) * d->ny + y) * d->nz + z; + k = d->k_plus_G[i]; + + compute_cross(&kx, &ky, &kz, /* unit vector in direction of k+G */ + k.mx, k.my, k.mz, k.nx, k.ny, k.nz); + compute_cross(&px, &py, &pz, /* direction of p axis for H field */ + kx, ky, kz, axis[0], axis[1], axis[2]); + len = sqrt(px * px + py * py + pz * pz); + CHECK(len > 0, "invalid planewave axis parallel to k+G"); + px /= len; + py /= len; + pz /= len; + compute_cross(&sx, &sy, &sz, /* direction of s axis for H field */ + kx, ky, kz, px, py, pz); + + ASSIGN_SCALAR(Hx, s.re * sx + p.re * px, s.im * sx + p.im * px); + ASSIGN_SCALAR(Hy, s.re * sy + p.re * py, s.im * sy + p.im * py); + ASSIGN_SCALAR(Hz, s.re * sz + p.re * pz, s.im * sz + p.im * pz); + ASSIGN_SCALAR(H.data[(i * 2 + 0) * H.p + band - 1], Hx.re * k.mx + Hy.re * k.my + Hz.re * k.mz, + Hx.im * k.mx + Hy.im * k.my + Hz.im * k.mz); + ASSIGN_SCALAR(H.data[(i * 2 + 1) * H.p + band - 1], Hx.re * k.nx + Hy.re * k.ny + Hz.re * k.nz, + Hx.im * k.nx + Hy.im * k.ny + Hz.im * k.nz); + } #else /* !SCALAR_COMPLEX */ - (void) d; (void) H; (void) band; (void) g; (void) s; (void) p; (void) axis; - CHECK(0, "mpbi does not support maxwell_set_planewave"); + (void)d; + (void)H; + (void)band; + (void)g; + (void)s; + (void)p; + (void)axis; + CHECK(0, "mpbi does not support maxwell_set_planewave"); #endif } /* Set the current k point for the Maxwell solver. k is given in the basis of the reciprocal lattice vectors, G1, G2, and G3. */ -void update_maxwell_data_k(maxwell_data *d, real k[3], - real G1[3], real G2[3], real G3[3]) -{ - int nx = d->nx, ny = d->ny, nz = d->nz; - int cx = MAX2(1,d->nx/2), cy = MAX2(1,d->ny/2), cz = MAX2(1,d->nz/2); - k_data *kpG = d->k_plus_G; - real *kpGn2 = d->k_plus_G_normsqr; - int x, y, z; - real kx, ky, kz; - - kx = G1[0]*k[0] + G2[0]*k[1] + G3[0]*k[2]; - ky = G1[1]*k[0] + G2[1]*k[1] + G3[1]*k[2]; - kz = G1[2]*k[0] + G2[2]*k[1] + G3[2]*k[2]; - - d->zero_k = kx == 0.0 && ky == 0.0 && kz == 0.0; - - d->current_k[0] = kx; - d->current_k[1] = ky; - d->current_k[2] = kz; - - /* make sure current parity is still valid: */ - set_maxwell_data_parity(d, d->parity); - - for (x = d->local_x_start; x < d->local_x_start + d->local_nx; ++x) { - int kxi = (x >= cx) ? (x - nx) : x; - for (y = 0; y < ny; ++y) { - int kyi = (y >= cy) ? (y - ny) : y; - for (z = 0; z < nz; ++z, kpG++, kpGn2++) { - int kzi = (z >= cz) ? (z - nz) : z; - real kpGx, kpGy, kpGz, a, b, c, leninv; - - /* Compute k+G (noting that G is negative because - of the choice of sign in the FFTW Fourier transform): */ - kpGx = kx - (G1[0]*kxi + G2[0]*kyi + G3[0]*kzi); - kpGy = ky - (G1[1]*kxi + G2[1]*kyi + G3[1]*kzi); - kpGz = kz - (G1[2]*kxi + G2[2]*kyi + G3[2]*kzi); - - a = kpGx*kpGx + kpGy*kpGy + kpGz*kpGz; - kpG->kmag = sqrt(a); - *kpGn2 = a; - - /* Now, compute the two normal vectors: */ - /* (Note that we choose them so that m has odd/even - parity in z/y, and n is even/odd in z/y.) */ - - if (a == 0) { - kpG->nx = 0.0; kpG->ny = 1.0; kpG->nz = 0.0; - kpG->mx = 0.0; kpG->my = 0.0; kpG->mz = 1.0; - } - else { - if (kpGx == 0.0 && kpGy == 0.0) { - /* put n in the y direction if k+G is in z: */ - kpG->nx = 0.0; - kpG->ny = 1.0; - kpG->nz = 0.0; - } - else { - /* otherwise, let n = z x (k+G), normalized: */ - compute_cross(&a, &b, &c, - 0.0, 0.0, 1.0, - kpGx, kpGy, kpGz); - leninv = 1.0 / sqrt(a*a + b*b + c*c); - kpG->nx = a * leninv; - kpG->ny = b * leninv; - kpG->nz = c * leninv; - } - - /* m = n x (k+G), normalized */ - compute_cross(&a, &b, &c, - kpG->nx, kpG->ny, kpG->nz, - kpGx, kpGy, kpGz); - leninv = 1.0 / sqrt(a*a + b*b + c*c); - kpG->mx = a * leninv; - kpG->my = b * leninv; - kpG->mz = c * leninv; - } +void update_maxwell_data_k(maxwell_data *d, real k[3], real G1[3], real G2[3], real G3[3]) { + int nx = d->nx, ny = d->ny, nz = d->nz; + int cx = MAX2(1, d->nx / 2), cy = MAX2(1, d->ny / 2), cz = MAX2(1, d->nz / 2); + k_data *kpG = d->k_plus_G; + real *kpGn2 = d->k_plus_G_normsqr; + int x, y, z; + real kx, ky, kz; + + kx = G1[0] * k[0] + G2[0] * k[1] + G3[0] * k[2]; + ky = G1[1] * k[0] + G2[1] * k[1] + G3[1] * k[2]; + kz = G1[2] * k[0] + G2[2] * k[1] + G3[2] * k[2]; + + d->zero_k = kx == 0.0 && ky == 0.0 && kz == 0.0; + + d->current_k[0] = kx; + d->current_k[1] = ky; + d->current_k[2] = kz; + + /* make sure current parity is still valid: */ + set_maxwell_data_parity(d, d->parity); + + for (x = d->local_x_start; x < d->local_x_start + d->local_nx; ++x) { + int kxi = (x >= cx) ? (x - nx) : x; + for (y = 0; y < ny; ++y) { + int kyi = (y >= cy) ? (y - ny) : y; + for (z = 0; z < nz; ++z, kpG++, kpGn2++) { + int kzi = (z >= cz) ? (z - nz) : z; + real kpGx, kpGy, kpGz, a, b, c, leninv; + + /* Compute k+G (noting that G is negative because + of the choice of sign in the FFTW Fourier transform): */ + kpGx = kx - (G1[0] * kxi + G2[0] * kyi + G3[0] * kzi); + kpGy = ky - (G1[1] * kxi + G2[1] * kyi + G3[1] * kzi); + kpGz = kz - (G1[2] * kxi + G2[2] * kyi + G3[2] * kzi); + + a = kpGx * kpGx + kpGy * kpGy + kpGz * kpGz; + kpG->kmag = sqrt(a); + *kpGn2 = a; + + /* Now, compute the two normal vectors: */ + /* (Note that we choose them so that m has odd/even + parity in z/y, and n is even/odd in z/y.) */ + + if (a == 0) { + kpG->nx = 0.0; + kpG->ny = 1.0; + kpG->nz = 0.0; + kpG->mx = 0.0; + kpG->my = 0.0; + kpG->mz = 1.0; + } + else { + if (kpGx == 0.0 && kpGy == 0.0) { + /* put n in the y direction if k+G is in z: */ + kpG->nx = 0.0; + kpG->ny = 1.0; + kpG->nz = 0.0; + } + else { + /* otherwise, let n = z x (k+G), normalized: */ + compute_cross(&a, &b, &c, 0.0, 0.0, 1.0, kpGx, kpGy, kpGz); + leninv = 1.0 / sqrt(a * a + b * b + c * c); + kpG->nx = a * leninv; + kpG->ny = b * leninv; + kpG->nz = c * leninv; + } + + /* m = n x (k+G), normalized */ + compute_cross(&a, &b, &c, kpG->nx, kpG->ny, kpG->nz, kpGx, kpGy, kpGz); + leninv = 1.0 / sqrt(a * a + b * b + c * c); + kpG->mx = a * leninv; + kpG->my = b * leninv; + kpG->mz = c * leninv; + } #ifdef DEBUG -#define DOT(u0,u1,u2,v0,v1,v2) ((u0)*(v0) + (u1)*(v1) + (u2)*(v2)) - - /* check orthogonality */ - CHECK(fabs(DOT(kpGx, kpGy, kpGz, - kpG->nx, kpG->ny, kpG->nz)) < 1e-6, - "vectors not orthogonal!"); - CHECK(fabs(DOT(kpGx, kpGy, kpGz, - kpG->mx, kpG->my, kpG->mz)) < 1e-6, - "vectors not orthogonal!"); - CHECK(fabs(DOT(kpG->mx, kpG->my, kpG->mz, - kpG->nx, kpG->ny, kpG->nz)) < 1e-6, - "vectors not orthogonal!"); - - /* check normalization */ - CHECK(fabs(DOT(kpG->nx, kpG->ny, kpG->nz, - kpG->nx, kpG->ny, kpG->nz) - 1.0) < 1e-6, - "vectors not unit vectors!"); - CHECK(fabs(DOT(kpG->mx, kpG->my, kpG->mz, - kpG->mx, kpG->my, kpG->mz) - 1.0) < 1e-6, - "vectors not unit vectors!"); +#define DOT(u0, u1, u2, v0, v1, v2) ((u0) * (v0) + (u1) * (v1) + (u2) * (v2)) + + /* check orthogonality */ + CHECK(fabs(DOT(kpGx, kpGy, kpGz, kpG->nx, kpG->ny, kpG->nz)) < 1e-6, + "vectors not orthogonal!"); + CHECK(fabs(DOT(kpGx, kpGy, kpGz, kpG->mx, kpG->my, kpG->mz)) < 1e-6, + "vectors not orthogonal!"); + CHECK(fabs(DOT(kpG->mx, kpG->my, kpG->mz, kpG->nx, kpG->ny, kpG->nz)) < 1e-6, + "vectors not orthogonal!"); + + /* check normalization */ + CHECK(fabs(DOT(kpG->nx, kpG->ny, kpG->nz, kpG->nx, kpG->ny, kpG->nz) - 1.0) < 1e-6, + "vectors not unit vectors!"); + CHECK(fabs(DOT(kpG->mx, kpG->my, kpG->mz, kpG->mx, kpG->my, kpG->mz) - 1.0) < 1e-6, + "vectors not unit vectors!"); #endif - } - } - } + } + } + } } -void set_maxwell_data_parity(maxwell_data *d, int parity) -{ - if ((parity & EVEN_Z_PARITY) && (parity & ODD_Z_PARITY)) - parity &= ~(EVEN_Z_PARITY | ODD_Z_PARITY); - if (d->current_k[2] != 0.0) - parity &= ~(EVEN_Z_PARITY | ODD_Z_PARITY); - if ((parity & EVEN_Y_PARITY) && (parity & ODD_Y_PARITY)) - parity &= ~(EVEN_Y_PARITY | ODD_Y_PARITY); - if (d->current_k[1] != 0.0) - parity &= ~(EVEN_Y_PARITY | ODD_Y_PARITY); - d->parity = parity; +void set_maxwell_data_parity(maxwell_data *d, int parity) { + if ((parity & EVEN_Z_PARITY) && (parity & ODD_Z_PARITY)) + parity &= ~(EVEN_Z_PARITY | ODD_Z_PARITY); + if (d->current_k[2] != 0.0) parity &= ~(EVEN_Z_PARITY | ODD_Z_PARITY); + if ((parity & EVEN_Y_PARITY) && (parity & ODD_Y_PARITY)) + parity &= ~(EVEN_Y_PARITY | ODD_Y_PARITY); + if (d->current_k[1] != 0.0) parity &= ~(EVEN_Y_PARITY | ODD_Y_PARITY); + d->parity = parity; } -maxwell_target_data *create_maxwell_target_data(maxwell_data *md, - real target_frequency) -{ - maxwell_target_data *d; +maxwell_target_data *create_maxwell_target_data(maxwell_data *md, real target_frequency) { + maxwell_target_data *d; - CHK_MALLOC(d, maxwell_target_data, 1); + CHK_MALLOC(d, maxwell_target_data, 1); - d->d = md; - d->target_frequency = target_frequency; + d->d = md; + d->target_frequency = target_frequency; - return d; + return d; } -void destroy_maxwell_target_data(maxwell_target_data *d) -{ - if (d) { - free(d); - } +void destroy_maxwell_target_data(maxwell_target_data *d) { + if (d) { free(d); } } diff --git a/src/maxwell/maxwell.h b/src/maxwell/maxwell.h index c3240e5b..b09b51f6 100644 --- a/src/maxwell/maxwell.h +++ b/src/maxwell/maxwell.h @@ -19,7 +19,7 @@ #define MAXWELL_H #ifdef MPB_REAL -# define real mpb_real +#define real mpb_real #endif #include "scalar.h" @@ -34,186 +34,146 @@ extern "C" { orthonormal vectors orthogonal to (kx,ky,kz). These are used as the basis for the H vector (to maintain transversality). */ typedef struct { - real kmag; - real mx, my, mz; - real nx, ny, nz; + real kmag; + real mx, my, mz; + real nx, ny, nz; } k_data; - /* Data structure to hold the upper triangle of a symmetric real matrix or possibly a Hermitian complex matrix (e.g. the dielectric tensor). */ typedef struct { #ifdef WITH_HERMITIAN_EPSILON - real m00, m11, m22; - scalar_complex m01, m02, m12; -# define ESCALAR_RE(z) CSCALAR_RE(z) -# define ESCALAR_IM(z) CSCALAR_IM(z) -# define ESCALAR_NORMSQR(z) CSCALAR_NORMSQR(z) -# define ASSIGN_ESCALAR(z, re, im) CASSIGN_SCALAR(z, re, im) -# define ESCALAR_MULT_CONJ_RE(a, b) CSCALAR_MULT_CONJ_RE(a, b) -# define ESCALAR_MULT_CONJ_IM(a, b) CSCALAR_MULT_CONJ_IM(a, b) + real m00, m11, m22; + scalar_complex m01, m02, m12; +#define ESCALAR_RE(z) CSCALAR_RE(z) +#define ESCALAR_IM(z) CSCALAR_IM(z) +#define ESCALAR_NORMSQR(z) CSCALAR_NORMSQR(z) +#define ASSIGN_ESCALAR(z, re, im) CASSIGN_SCALAR(z, re, im) +#define ESCALAR_MULT_CONJ_RE(a, b) CSCALAR_MULT_CONJ_RE(a, b) +#define ESCALAR_MULT_CONJ_IM(a, b) CSCALAR_MULT_CONJ_IM(a, b) #else - real m00, m01, m02, - m11, m12, - m22; -# define ESCALAR_RE(z) (z) -# define ESCALAR_IM(z) (0.0) -# define ESCALAR_NORMSQR(z) ((z) * (z)) -# define ASSIGN_ESCALAR(z, re, im) (z) = (re); -# define ESCALAR_MULT_CONJ_RE(a, b) ((a) * (b)) -# define ESCALAR_MULT_CONJ_IM(a, b) (0.0) + real m00, m01, m02, m11, m12, m22; +#define ESCALAR_RE(z) (z) +#define ESCALAR_IM(z) (0.0) +#define ESCALAR_NORMSQR(z) ((z) * (z)) +#define ASSIGN_ESCALAR(z, re, im) (z) = (re); +#define ESCALAR_MULT_CONJ_RE(a, b) ((a) * (b)) +#define ESCALAR_MULT_CONJ_IM(a, b) (0.0) #endif } symmetric_matrix; #ifdef WITH_HERMITIAN_EPSILON -# define DIAG_SYMMETRIC_MATRIX(m) ((m).m01.re == 0.0 && (m).m01.im == 0.0 && \ - (m).m02.re == 0.0 && (m).m02.im == 0.0 && \ - (m).m12.re == 0.0 && (m).m12.im == 0.0) +#define DIAG_SYMMETRIC_MATRIX(m) \ + ((m).m01.re == 0.0 && (m).m01.im == 0.0 && (m).m02.re == 0.0 && (m).m02.im == 0.0 && \ + (m).m12.re == 0.0 && (m).m12.im == 0.0) #else -# define DIAG_SYMMETRIC_MATRIX(m) ((m).m01 == 0.0 && \ - (m).m02 == 0.0 && \ - (m).m12 == 0.0) +#define DIAG_SYMMETRIC_MATRIX(m) ((m).m01 == 0.0 && (m).m02 == 0.0 && (m).m12 == 0.0) #endif #define NO_PARITY (0) -#define EVEN_Z_PARITY (1<<0) -#define ODD_Z_PARITY (1<<1) -#define EVEN_Y_PARITY (1<<2) -#define ODD_Y_PARITY (1<<3) +#define EVEN_Z_PARITY (1 << 0) +#define ODD_Z_PARITY (1 << 1) +#define EVEN_Y_PARITY (1 << 2) +#define ODD_Y_PARITY (1 << 3) #define MAX_NPLANS 32 typedef struct { - int nx, ny, nz; - int local_nx, local_ny; - int local_x_start, local_y_start; - int last_dim, last_dim_size, other_dims; + int nx, ny, nz; + int local_nx, local_ny; + int local_x_start, local_y_start; + int last_dim, last_dim_size, other_dims; - int num_bands; - int N, local_N, N_start, alloc_N; + int num_bands; + int N, local_N, N_start, alloc_N; - int fft_output_size; + int fft_output_size; - int max_fft_bands, num_fft_bands; + int max_fft_bands, num_fft_bands; - real current_k[3]; /* (in cartesian basis) */ - int parity; + real current_k[3]; /* (in cartesian basis) */ + int parity; - void *plans[MAX_NPLANS], *iplans[MAX_NPLANS]; - int nplans, plans_howmany[MAX_NPLANS], plans_stride[MAX_NPLANS], plans_dist[MAX_NPLANS]; + void *plans[MAX_NPLANS], *iplans[MAX_NPLANS]; + int nplans, plans_howmany[MAX_NPLANS], plans_stride[MAX_NPLANS], plans_dist[MAX_NPLANS]; - scalar *fft_data, *fft_data2; + scalar *fft_data, *fft_data2; - int zero_k; /* non-zero if k is zero (handled specially) */ - k_data *k_plus_G; - real *k_plus_G_normsqr; + int zero_k; /* non-zero if k is zero (handled specially) */ + k_data *k_plus_G; + real *k_plus_G_normsqr; - symmetric_matrix *eps_inv; - real eps_inv_mean; - symmetric_matrix *mu_inv; - real mu_inv_mean; + symmetric_matrix *eps_inv; + real eps_inv_mean; + symmetric_matrix *mu_inv; + real mu_inv_mean; } maxwell_data; -extern maxwell_data *create_maxwell_data(int nx, int ny, int nz, - int *local_N, int *N_start, - int *alloc_N, - int num_bands, - int num_fft_bands); +extern maxwell_data *create_maxwell_data(int nx, int ny, int nz, int *local_N, int *N_start, + int *alloc_N, int num_bands, int num_fft_bands); extern void destroy_maxwell_data(maxwell_data *d); extern void maxwell_set_num_bands(maxwell_data *d, int num_bands); extern void maxwell_dominant_planewave(maxwell_data *d, evectmatrix H, int band, double kdom[3]); -extern void maxwell_set_planewave(maxwell_data *d, evectmatrix H, int band, - int g[3], scalar s, scalar p, - real axis[3]); +extern void maxwell_set_planewave(maxwell_data *d, evectmatrix H, int band, int g[3], scalar s, + scalar p, real axis[3]); -extern void update_maxwell_data_k(maxwell_data *d, real k[3], - real G1[3], real G2[3], real G3[3]); +extern void update_maxwell_data_k(maxwell_data *d, real k[3], real G1[3], real G2[3], real G3[3]); extern void set_maxwell_data_parity(maxwell_data *d, int parity); -typedef void (*maxwell_dielectric_function) (symmetric_matrix *eps, - symmetric_matrix *eps_inv, - const real r[3], - void *epsilon_data); -typedef int (*maxwell_dielectric_mean_function) (symmetric_matrix *meps, - symmetric_matrix *meps_inv, - real n[3], - real d1, real d2, real d3, - real tol, - const real r[3], - void *epsilon_data); - -extern void set_maxwell_dielectric(maxwell_data *md, - const int mesh_size[3], - real R[3][3], real G[3][3], - maxwell_dielectric_function epsilon, - maxwell_dielectric_mean_function mepsilon, - void *epsilon_data); - -extern void set_maxwell_mu(maxwell_data *md, - const int mesh_size[3], - real R[3][3], real G[3][3], - maxwell_dielectric_function mu, - maxwell_dielectric_mean_function mmu, +typedef void (*maxwell_dielectric_function)(symmetric_matrix *eps, symmetric_matrix *eps_inv, + const real r[3], void *epsilon_data); +typedef int (*maxwell_dielectric_mean_function)(symmetric_matrix *meps, symmetric_matrix *meps_inv, + real n[3], real d1, real d2, real d3, real tol, + const real r[3], void *epsilon_data); + +extern void set_maxwell_dielectric(maxwell_data *md, const int mesh_size[3], real R[3][3], + real G[3][3], maxwell_dielectric_function epsilon, + maxwell_dielectric_mean_function mepsilon, void *epsilon_data); + +extern void set_maxwell_mu(maxwell_data *md, const int mesh_size[3], real R[3][3], real G[3][3], + maxwell_dielectric_function mu, maxwell_dielectric_mean_function mmu, void *mu_data); extern void maxwell_sym_matrix_eigs(real eigs[3], const symmetric_matrix *V); -extern void maxwell_sym_matrix_invert(symmetric_matrix *Vinv, - const symmetric_matrix *V); -extern void maxwell_sym_matrix_rotate(symmetric_matrix *RAR, - const symmetric_matrix *A_, - double R[3][3]); +extern void maxwell_sym_matrix_invert(symmetric_matrix *Vinv, const symmetric_matrix *V); +extern void maxwell_sym_matrix_rotate(symmetric_matrix *RAR, const symmetric_matrix *A_, + double R[3][3]); extern int maxwell_sym_matrix_positive_definite(symmetric_matrix *V); -extern void maxwell_compute_fft(int dir, maxwell_data *d, - scalar *array_in, scalar *array_out, - int howmany, int stride, int dist); -extern void maxwell_compute_d_from_H(maxwell_data *d, evectmatrix Xin, - scalar_complex *dfield, - int cur_band_start, int cur_num_bands); -extern void maxwell_compute_h_from_H(maxwell_data *d, evectmatrix Hin, - scalar_complex *hfield, - int cur_band_start, int cur_num_bands); -extern void maxwell_compute_H_from_B(maxwell_data *d, evectmatrix Bin, - evectmatrix Hout, scalar_complex *hfield, - int Bin_band_start, int Hout_band_start, - int cur_num_bands); -extern void maxwell_compute_e_from_d(maxwell_data *d, - scalar_complex *dfield, - int cur_num_bands); - -extern void maxwell_vectorfield_otherhalf(maxwell_data *d, - scalar_complex *field, - real phasex,real phasey,real phasez); -extern void maxwell_cscalarfield_otherhalf(maxwell_data *d, - scalar_complex *field, - real phasex, real phasey, - real phasez); +extern void maxwell_compute_fft(int dir, maxwell_data *d, scalar *array_in, scalar *array_out, + int howmany, int stride, int dist); +extern void maxwell_compute_d_from_H(maxwell_data *d, evectmatrix Xin, scalar_complex *dfield, + int cur_band_start, int cur_num_bands); +extern void maxwell_compute_h_from_H(maxwell_data *d, evectmatrix Hin, scalar_complex *hfield, + int cur_band_start, int cur_num_bands); +extern void maxwell_compute_H_from_B(maxwell_data *d, evectmatrix Bin, evectmatrix Hout, + scalar_complex *hfield, int Bin_band_start, + int Hout_band_start, int cur_num_bands); +extern void maxwell_compute_e_from_d(maxwell_data *d, scalar_complex *dfield, int cur_num_bands); + +extern void maxwell_vectorfield_otherhalf(maxwell_data *d, scalar_complex *field, real phasex, + real phasey, real phasez); +extern void maxwell_cscalarfield_otherhalf(maxwell_data *d, scalar_complex *field, real phasex, + real phasey, real phasez); extern void maxwell_scalarfield_otherhalf(maxwell_data *d, real *field); -void assign_symmatrix_vector(scalar_complex *newv, - const symmetric_matrix matrix, +void assign_symmatrix_vector(scalar_complex *newv, const symmetric_matrix matrix, const scalar_complex *oldv); extern void maxwell_operator(evectmatrix Xin, evectmatrix Xout, void *data, - int is_current_eigenvector, evectmatrix Work); + int is_current_eigenvector, evectmatrix Work); extern void maxwell_muinv_operator(evectmatrix Xin, evectmatrix Xout, void *data, int is_current_eigenvector, evectmatrix Work); -extern void maxwell_simple_precondition(evectmatrix X, - void *data, real *eigenvals); -extern void maxwell_preconditioner(evectmatrix Xin, evectmatrix Xout, - void *data, - evectmatrix Y, real *eigenvals, - sqmatrix YtY); -extern void maxwell_preconditioner2(evectmatrix Xin, evectmatrix Xout, - void *data, - evectmatrix Y, real *eigenvals, - sqmatrix YtY); - -extern void maxwell_ucross_op(evectmatrix Xin, evectmatrix Xout, - maxwell_data *d, const real u[3]); +extern void maxwell_simple_precondition(evectmatrix X, void *data, real *eigenvals); +extern void maxwell_preconditioner(evectmatrix Xin, evectmatrix Xout, void *data, evectmatrix Y, + real *eigenvals, sqmatrix YtY); +extern void maxwell_preconditioner2(evectmatrix Xin, evectmatrix Xout, void *data, evectmatrix Y, + real *eigenvals, sqmatrix YtY); + +extern void maxwell_ucross_op(evectmatrix Xin, evectmatrix Xout, maxwell_data *d, const real u[3]); extern void maxwell_parity_constraint(evectmatrix X, void *data); extern void maxwell_zparity_constraint(evectmatrix X, void *data); @@ -227,41 +187,31 @@ extern double *maxwell_zparity(evectmatrix X, maxwell_data *d); extern double *maxwell_yparity(evectmatrix X, maxwell_data *d); typedef struct { - maxwell_data *d; - real target_frequency; + maxwell_data *d; + real target_frequency; } maxwell_target_data; -extern maxwell_target_data *create_maxwell_target_data(maxwell_data *d, - real target_frequency); +extern maxwell_target_data *create_maxwell_target_data(maxwell_data *d, real target_frequency); extern void destroy_maxwell_target_data(maxwell_target_data *d); -extern void maxwell_target_operator1(evectmatrix Xin, evectmatrix Xout, - void *data, - int is_current_eigenvector, - evectmatrix Work); -extern void maxwell_target_operator(evectmatrix Xin, evectmatrix Xout, - void *data, int is_current_eigenvector, - evectmatrix Work); -extern void maxwell_target_preconditioner(evectmatrix Xin, evectmatrix Xout, - void *data, - evectmatrix Y, real *eigenvals, - sqmatrix YtY); -extern void maxwell_target_preconditioner2(evectmatrix Xin, evectmatrix Xout, - void *data, - evectmatrix Y, real *eigenvals, - sqmatrix YtY); - -extern void spherical_quadrature_points(real *x, real *y, real *z, - real *weight, int num_sq_pts); - -extern int check_maxwell_dielectric(maxwell_data *d, - int negative_epsilon_okp); +extern void maxwell_target_operator1(evectmatrix Xin, evectmatrix Xout, void *data, + int is_current_eigenvector, evectmatrix Work); +extern void maxwell_target_operator(evectmatrix Xin, evectmatrix Xout, void *data, + int is_current_eigenvector, evectmatrix Work); +extern void maxwell_target_preconditioner(evectmatrix Xin, evectmatrix Xout, void *data, + evectmatrix Y, real *eigenvals, sqmatrix YtY); +extern void maxwell_target_preconditioner2(evectmatrix Xin, evectmatrix Xout, void *data, + evectmatrix Y, real *eigenvals, sqmatrix YtY); + +extern void spherical_quadrature_points(real *x, real *y, real *z, real *weight, int num_sq_pts); + +extern int check_maxwell_dielectric(maxwell_data *d, int negative_epsilon_okp); #ifdef __cplusplus -} /* extern "C" */ +} /* extern "C" */ #endif /* __cplusplus */ #ifdef MPB_REAL -# undef real +#undef real #endif #endif /* MAXWELL_H */ diff --git a/src/maxwell/maxwell_constraints.c b/src/maxwell/maxwell_constraints.c index d31eb0e3..709456dd 100644 --- a/src/maxwell/maxwell_constraints.c +++ b/src/maxwell/maxwell_constraints.c @@ -28,17 +28,14 @@ /**************************************************************************/ /* function to call z and y parity constraints, if necessary */ -void maxwell_parity_constraint(evectmatrix X, void *data) -{ - maxwell_data *d = (maxwell_data *) data; +void maxwell_parity_constraint(evectmatrix X, void *data) { + maxwell_data *d = (maxwell_data *)data; - CHECK(d, "null maxwell data pointer!"); - CHECK(X.c == 2, "fields don't have 2 components!"); + CHECK(d, "null maxwell data pointer!"); + CHECK(X.c == 2, "fields don't have 2 components!"); - if (d->parity & (EVEN_Z_PARITY | ODD_Z_PARITY)) - maxwell_zparity_constraint(X, data); - if (d->parity & (EVEN_Y_PARITY | ODD_Y_PARITY)) - maxwell_yparity_constraint(X, data); + if (d->parity & (EVEN_Z_PARITY | ODD_Z_PARITY)) maxwell_zparity_constraint(X, data); + if (d->parity & (EVEN_Y_PARITY | ODD_Y_PARITY)) maxwell_yparity_constraint(X, data); } /**************************************************************************/ @@ -64,63 +61,58 @@ void maxwell_parity_constraint(evectmatrix X, void *data) for only one parity of states (the projection operator, like the mirror flip operator, commutes with the Maxwell operator, so this projection should not slow convergence). */ -void maxwell_zparity_constraint(evectmatrix X, void *data) -{ - maxwell_data *d = (maxwell_data *) data; - int i, j, b, nxy, nz; - int zparity = ((d->parity & EVEN_Z_PARITY) ? +1 : - ((d->parity & ODD_Z_PARITY) ? -1 : 0)); - - if (zparity == 0) - return; - - CHECK(d, "null maxwell data pointer!"); - CHECK(X.c == 2, "fields don't have 2 components!"); - - if (d->nz > 1) { - nxy = d->other_dims; - nz = d->last_dim; - } - else { /* common case (2d system): even/odd == TE/TM */ - nxy = d->other_dims * d->last_dim; - if (zparity == +1) - for (i = 0; i < nxy; ++i) - for (b = 0; b < X.p; ++b) { - ASSIGN_ZERO(X.data[(i * X.c + 1) * X.p + b]); - } - else if (zparity == -1) - for (i = 0; i < nxy; ++i) - for (b = 0; b < X.p; ++b) { - ASSIGN_ZERO(X.data[(i * X.c) * X.p + b]); - } - return; - } - - for (i = 0; i < nxy; ++i) { - for (j = 0; 2*j <= nz; ++j) { - int ij = i * nz + j; - int ij2 = i * nz + (j > 0 ? nz - j : 0); - for (b = 0; b < X.p; ++b) { - scalar u,v, u2,v2; - u = X.data[(ij * 2) * X.p + b]; - v = X.data[(ij * 2 + 1) * X.p + b]; - u2 = X.data[(ij2 * 2) * X.p + b]; - v2 = X.data[(ij2 * 2 + 1) * X.p + b]; - ASSIGN_SCALAR(X.data[(ij * 2) * X.p + b], - 0.5*(SCALAR_RE(u) + zparity*SCALAR_RE(u2)), - 0.5*(SCALAR_IM(u) + zparity*SCALAR_IM(u2))); - ASSIGN_SCALAR(X.data[(ij * 2 + 1) * X.p + b], - 0.5*(SCALAR_RE(v) - zparity*SCALAR_RE(v2)), - 0.5*(SCALAR_IM(v) - zparity*SCALAR_IM(v2))); - ASSIGN_SCALAR(X.data[(ij2 * 2) * X.p + b], - 0.5*(SCALAR_RE(u2) + zparity*SCALAR_RE(u)), - 0.5*(SCALAR_IM(u2) + zparity*SCALAR_IM(u))); - ASSIGN_SCALAR(X.data[(ij2 * 2 + 1) * X.p + b], - 0.5*(SCALAR_RE(v2) - zparity*SCALAR_RE(v)), - 0.5*(SCALAR_IM(v2) - zparity*SCALAR_IM(v))); - } - } - } +void maxwell_zparity_constraint(evectmatrix X, void *data) { + maxwell_data *d = (maxwell_data *)data; + int i, j, b, nxy, nz; + int zparity = ((d->parity & EVEN_Z_PARITY) ? +1 : ((d->parity & ODD_Z_PARITY) ? -1 : 0)); + + if (zparity == 0) return; + + CHECK(d, "null maxwell data pointer!"); + CHECK(X.c == 2, "fields don't have 2 components!"); + + if (d->nz > 1) { + nxy = d->other_dims; + nz = d->last_dim; + } + else { /* common case (2d system): even/odd == TE/TM */ + nxy = d->other_dims * d->last_dim; + if (zparity == +1) + for (i = 0; i < nxy; ++i) + for (b = 0; b < X.p; ++b) { + ASSIGN_ZERO(X.data[(i * X.c + 1) * X.p + b]); + } + else if (zparity == -1) + for (i = 0; i < nxy; ++i) + for (b = 0; b < X.p; ++b) { + ASSIGN_ZERO(X.data[(i * X.c) * X.p + b]); + } + return; + } + + for (i = 0; i < nxy; ++i) { + for (j = 0; 2 * j <= nz; ++j) { + int ij = i * nz + j; + int ij2 = i * nz + (j > 0 ? nz - j : 0); + for (b = 0; b < X.p; ++b) { + scalar u, v, u2, v2; + u = X.data[(ij * 2) * X.p + b]; + v = X.data[(ij * 2 + 1) * X.p + b]; + u2 = X.data[(ij2 * 2) * X.p + b]; + v2 = X.data[(ij2 * 2 + 1) * X.p + b]; + ASSIGN_SCALAR(X.data[(ij * 2) * X.p + b], 0.5 * (SCALAR_RE(u) + zparity * SCALAR_RE(u2)), + 0.5 * (SCALAR_IM(u) + zparity * SCALAR_IM(u2))); + ASSIGN_SCALAR(X.data[(ij * 2 + 1) * X.p + b], + 0.5 * (SCALAR_RE(v) - zparity * SCALAR_RE(v2)), + 0.5 * (SCALAR_IM(v) - zparity * SCALAR_IM(v2))); + ASSIGN_SCALAR(X.data[(ij2 * 2) * X.p + b], 0.5 * (SCALAR_RE(u2) + zparity * SCALAR_RE(u)), + 0.5 * (SCALAR_IM(u2) + zparity * SCALAR_IM(u))); + ASSIGN_SCALAR(X.data[(ij2 * 2 + 1) * X.p + b], + 0.5 * (SCALAR_RE(v2) - zparity * SCALAR_RE(v)), + 0.5 * (SCALAR_IM(v2) - zparity * SCALAR_IM(v))); + } + } + } } /* Compute the parity of all of the states in X, returning an array @@ -129,64 +121,57 @@ void maxwell_zparity_constraint(evectmatrix X, void *data) of the mirror flip operator, and will be +1/-1 for even/odd eigenstates and something in between for everything else. Assumes that the columns of X are normalized to 1. */ -double *maxwell_zparity(evectmatrix X, maxwell_data *d) -{ - int i, j, b, nxy, nz; - double *zparity, *zp_scratch, *norm_scratch; - - CHECK(d, "null maxwell data pointer!"); - CHECK(X.c == 2, "fields don't have 2 components!"); - - CHK_MALLOC(zparity, double, X.p); - CHK_MALLOC(zp_scratch, double, X.p); - for (b = 0; b < X.p; ++b) - zp_scratch[b] = 0.0; - CHK_MALLOC(norm_scratch, double, X.p); - for (b = 0; b < X.p; ++b) - norm_scratch[b] = 0.0; - - if (d->nz > 1) { - nxy = d->other_dims; - nz = d->last_dim; - } - else { - nxy = d->other_dims * d->last_dim; - nz = 1; - } - - for (i = 0; i < nxy; ++i) - for (j = 0; 2*j <= nz; ++j) { - int ij = i * nz + j; - int ij2 = i * nz + (j > 0 ? nz - j : 0); - for (b = 0; b < X.p; ++b) { - scalar u,v, u2,v2; - u = X.data[(ij * 2) * X.p + b]; - v = X.data[(ij * 2 + 1) * X.p + b]; - u2 = X.data[(ij2 * 2) * X.p + b]; - v2 = X.data[(ij2 * 2 + 1) * X.p + b]; - zp_scratch[b] += (ij == ij2 ? 1.0 : 2.0) * - (SCALAR_RE(u) * SCALAR_RE(u2) + - SCALAR_IM(u) * SCALAR_IM(u2) - - SCALAR_RE(v) * SCALAR_RE(v2) - - SCALAR_IM(v) * SCALAR_IM(v2)); - norm_scratch[b] += (ij == ij2 ? 1.0 : 2.0) * - (SCALAR_RE(u) * SCALAR_RE(u) + - SCALAR_IM(u) * SCALAR_IM(u) + - SCALAR_RE(v) * SCALAR_RE(v) + - SCALAR_IM(v) * SCALAR_IM(v)); - } - } - - mpi_allreduce(zp_scratch, zparity, X.p, - double, MPI_DOUBLE, MPI_SUM, mpb_comm); - mpi_allreduce(norm_scratch, zp_scratch, X.p, - double, MPI_DOUBLE, MPI_SUM, mpb_comm); - for (b = 0; b < X.p; ++b) - zparity[b] /= zp_scratch[b]; - free(zp_scratch); - free(norm_scratch); - - return zparity; +double *maxwell_zparity(evectmatrix X, maxwell_data *d) { + int i, j, b, nxy, nz; + double *zparity, *zp_scratch, *norm_scratch; + + CHECK(d, "null maxwell data pointer!"); + CHECK(X.c == 2, "fields don't have 2 components!"); + + CHK_MALLOC(zparity, double, X.p); + CHK_MALLOC(zp_scratch, double, X.p); + for (b = 0; b < X.p; ++b) + zp_scratch[b] = 0.0; + CHK_MALLOC(norm_scratch, double, X.p); + for (b = 0; b < X.p; ++b) + norm_scratch[b] = 0.0; + + if (d->nz > 1) { + nxy = d->other_dims; + nz = d->last_dim; + } + else { + nxy = d->other_dims * d->last_dim; + nz = 1; + } + + for (i = 0; i < nxy; ++i) + for (j = 0; 2 * j <= nz; ++j) { + int ij = i * nz + j; + int ij2 = i * nz + (j > 0 ? nz - j : 0); + for (b = 0; b < X.p; ++b) { + scalar u, v, u2, v2; + u = X.data[(ij * 2) * X.p + b]; + v = X.data[(ij * 2 + 1) * X.p + b]; + u2 = X.data[(ij2 * 2) * X.p + b]; + v2 = X.data[(ij2 * 2 + 1) * X.p + b]; + zp_scratch[b] += + (ij == ij2 ? 1.0 : 2.0) * (SCALAR_RE(u) * SCALAR_RE(u2) + SCALAR_IM(u) * SCALAR_IM(u2) - + SCALAR_RE(v) * SCALAR_RE(v2) - SCALAR_IM(v) * SCALAR_IM(v2)); + norm_scratch[b] += + (ij == ij2 ? 1.0 : 2.0) * (SCALAR_RE(u) * SCALAR_RE(u) + SCALAR_IM(u) * SCALAR_IM(u) + + SCALAR_RE(v) * SCALAR_RE(v) + SCALAR_IM(v) * SCALAR_IM(v)); + } + } + + mpi_allreduce(zp_scratch, zparity, X.p, double, MPI_DOUBLE, MPI_SUM, mpb_comm); + mpi_allreduce(norm_scratch, zp_scratch, X.p, double, MPI_DOUBLE, MPI_SUM, mpb_comm); + for (b = 0; b < X.p; ++b) + zparity[b] /= zp_scratch[b]; + free(zp_scratch); + free(norm_scratch); + + return zparity; } /**************************************************************************/ @@ -197,52 +182,48 @@ double *maxwell_zparity(evectmatrix X, maxwell_data *d) for only one parity of states (the projection operator, like the mirror flip operator, commutes with the Maxwell operator, so this projection should not slow convergence). */ -void maxwell_yparity_constraint(evectmatrix X, void *data) -{ - maxwell_data *d = (maxwell_data *) data; - int i, j, k, b, nx, ny, nz; - int yparity = ((d->parity & EVEN_Y_PARITY) ? +1 : - ((d->parity & ODD_Y_PARITY) ? -1 : 0)); - - if (yparity == 0) - return; - - CHECK(d, "null maxwell data pointer!"); - CHECK(X.c == 2, "fields don't have 2 components!"); - - nx = d->local_nx; - ny = d->ny; - nz = d->nz; - - for (i = 0; i < nx; ++i) { - for (j = 0; 2*j <= ny; ++j) { - int ij = i * ny + j; - int ij2 = i * ny + (j > 0 ? ny - j : 0); - for (k = 0; k < nz; ++k) { - int ijk = ij * nz + k; - int ijk2 = ij2 * nz + k; - for (b = 0; b < X.p; ++b) { - scalar u,v, u2,v2; - u = X.data[(ijk * 2) * X.p + b]; - v = X.data[(ijk * 2 + 1) * X.p + b]; - u2 = X.data[(ijk2 * 2) * X.p + b]; - v2 = X.data[(ijk2 * 2 + 1) * X.p + b]; - ASSIGN_SCALAR(X.data[(ijk * 2) * X.p + b], - 0.5*(SCALAR_RE(u) - yparity*SCALAR_RE(u2)), - 0.5*(SCALAR_IM(u) - yparity*SCALAR_IM(u2))); - ASSIGN_SCALAR(X.data[(ijk * 2 + 1) * X.p + b], - 0.5*(SCALAR_RE(v) + yparity*SCALAR_RE(v2)), - 0.5*(SCALAR_IM(v) + yparity*SCALAR_IM(v2))); - ASSIGN_SCALAR(X.data[(ijk2 * 2) * X.p + b], - 0.5*(SCALAR_RE(u2) - yparity*SCALAR_RE(u)), - 0.5*(SCALAR_IM(u2) - yparity*SCALAR_IM(u))); - ASSIGN_SCALAR(X.data[(ijk2 * 2 + 1) * X.p + b], - 0.5*(SCALAR_RE(v2) + yparity*SCALAR_RE(v)), - 0.5*(SCALAR_IM(v2) + yparity*SCALAR_IM(v))); - } - } - } - } +void maxwell_yparity_constraint(evectmatrix X, void *data) { + maxwell_data *d = (maxwell_data *)data; + int i, j, k, b, nx, ny, nz; + int yparity = ((d->parity & EVEN_Y_PARITY) ? +1 : ((d->parity & ODD_Y_PARITY) ? -1 : 0)); + + if (yparity == 0) return; + + CHECK(d, "null maxwell data pointer!"); + CHECK(X.c == 2, "fields don't have 2 components!"); + + nx = d->local_nx; + ny = d->ny; + nz = d->nz; + + for (i = 0; i < nx; ++i) { + for (j = 0; 2 * j <= ny; ++j) { + int ij = i * ny + j; + int ij2 = i * ny + (j > 0 ? ny - j : 0); + for (k = 0; k < nz; ++k) { + int ijk = ij * nz + k; + int ijk2 = ij2 * nz + k; + for (b = 0; b < X.p; ++b) { + scalar u, v, u2, v2; + u = X.data[(ijk * 2) * X.p + b]; + v = X.data[(ijk * 2 + 1) * X.p + b]; + u2 = X.data[(ijk2 * 2) * X.p + b]; + v2 = X.data[(ijk2 * 2 + 1) * X.p + b]; + ASSIGN_SCALAR(X.data[(ijk * 2) * X.p + b], 0.5 * (SCALAR_RE(u) - yparity * SCALAR_RE(u2)), + 0.5 * (SCALAR_IM(u) - yparity * SCALAR_IM(u2))); + ASSIGN_SCALAR(X.data[(ijk * 2 + 1) * X.p + b], + 0.5 * (SCALAR_RE(v) + yparity * SCALAR_RE(v2)), + 0.5 * (SCALAR_IM(v) + yparity * SCALAR_IM(v2))); + ASSIGN_SCALAR(X.data[(ijk2 * 2) * X.p + b], + 0.5 * (SCALAR_RE(u2) - yparity * SCALAR_RE(u)), + 0.5 * (SCALAR_IM(u2) - yparity * SCALAR_IM(u))); + ASSIGN_SCALAR(X.data[(ijk2 * 2 + 1) * X.p + b], + 0.5 * (SCALAR_RE(v2) + yparity * SCALAR_RE(v)), + 0.5 * (SCALAR_IM(v2) + yparity * SCALAR_IM(v))); + } + } + } + } } /* Compute the parity of all of the states in X, returning an array @@ -251,148 +232,129 @@ void maxwell_yparity_constraint(evectmatrix X, void *data) of the mirror flip operator, and will be +1/-1 for even/odd eigenstates and something in between for everything else. Assumes that the columns of X are normalized to 1. */ -double *maxwell_yparity(evectmatrix X, maxwell_data *d) -{ - int i, j, k, b, nx, ny, nz; - double *yparity, *yp_scratch, *norm_scratch; - - CHECK(d, "null maxwell data pointer!"); - CHECK(X.c == 2, "fields don't have 2 components!"); - - CHK_MALLOC(yparity, double, X.p); - CHK_MALLOC(yp_scratch, double, X.p); - for (b = 0; b < X.p; ++b) - yp_scratch[b] = 0.0; - CHK_MALLOC(norm_scratch, double, X.p); - for (b = 0; b < X.p; ++b) - norm_scratch[b] = 0.0; - - nx = d->local_nx; - ny = d->ny; - nz = d->nz; - - for (i = 0; i < nx; ++i) { - for (j = 0; 2*j <= ny; ++j) { - int ij = i * ny + j; - int ij2 = i * ny + (j > 0 ? ny - j : 0); - for (k = 0; k < nz; ++k) { - int ijk = ij * nz + k; - int ijk2 = ij2 * nz + k; - for (b = 0; b < X.p; ++b) { - scalar u,v, u2,v2; - u = X.data[(ijk * 2) * X.p + b]; - v = X.data[(ijk * 2 + 1) * X.p + b]; - u2 = X.data[(ijk2 * 2) * X.p + b]; - v2 = X.data[(ijk2 * 2 + 1) * X.p + b]; - yp_scratch[b] += (ijk == ijk2 ? 1.0 : 2.0) * - (SCALAR_RE(v) * SCALAR_RE(v2) + - SCALAR_IM(v) * SCALAR_IM(v2) - - SCALAR_RE(u) * SCALAR_RE(u2) - - SCALAR_IM(u) * SCALAR_IM(u2)); - norm_scratch[b] += (ijk == ijk2 ? 1.0 : 2.0) * - (SCALAR_RE(v) * SCALAR_RE(v) + - SCALAR_IM(v) * SCALAR_IM(v) + - SCALAR_RE(u) * SCALAR_RE(u) + - SCALAR_IM(u) * SCALAR_IM(u)); - } - } - } - } - - mpi_allreduce(yp_scratch, yparity, X.p, - double, MPI_DOUBLE, MPI_SUM, mpb_comm); - mpi_allreduce(norm_scratch, yp_scratch, X.p, - double, MPI_DOUBLE, MPI_SUM, mpb_comm); - for (b = 0; b < X.p; ++b) - yparity[b] /= yp_scratch[b]; - free(yp_scratch); - free(norm_scratch); - - return yparity; +double *maxwell_yparity(evectmatrix X, maxwell_data *d) { + int i, j, k, b, nx, ny, nz; + double *yparity, *yp_scratch, *norm_scratch; + + CHECK(d, "null maxwell data pointer!"); + CHECK(X.c == 2, "fields don't have 2 components!"); + + CHK_MALLOC(yparity, double, X.p); + CHK_MALLOC(yp_scratch, double, X.p); + for (b = 0; b < X.p; ++b) + yp_scratch[b] = 0.0; + CHK_MALLOC(norm_scratch, double, X.p); + for (b = 0; b < X.p; ++b) + norm_scratch[b] = 0.0; + + nx = d->local_nx; + ny = d->ny; + nz = d->nz; + + for (i = 0; i < nx; ++i) { + for (j = 0; 2 * j <= ny; ++j) { + int ij = i * ny + j; + int ij2 = i * ny + (j > 0 ? ny - j : 0); + for (k = 0; k < nz; ++k) { + int ijk = ij * nz + k; + int ijk2 = ij2 * nz + k; + for (b = 0; b < X.p; ++b) { + scalar u, v, u2, v2; + u = X.data[(ijk * 2) * X.p + b]; + v = X.data[(ijk * 2 + 1) * X.p + b]; + u2 = X.data[(ijk2 * 2) * X.p + b]; + v2 = X.data[(ijk2 * 2 + 1) * X.p + b]; + yp_scratch[b] += (ijk == ijk2 ? 1.0 : 2.0) * + (SCALAR_RE(v) * SCALAR_RE(v2) + SCALAR_IM(v) * SCALAR_IM(v2) - + SCALAR_RE(u) * SCALAR_RE(u2) - SCALAR_IM(u) * SCALAR_IM(u2)); + norm_scratch[b] += (ijk == ijk2 ? 1.0 : 2.0) * + (SCALAR_RE(v) * SCALAR_RE(v) + SCALAR_IM(v) * SCALAR_IM(v) + + SCALAR_RE(u) * SCALAR_RE(u) + SCALAR_IM(u) * SCALAR_IM(u)); + } + } + } + } + + mpi_allreduce(yp_scratch, yparity, X.p, double, MPI_DOUBLE, MPI_SUM, mpb_comm); + mpi_allreduce(norm_scratch, yp_scratch, X.p, double, MPI_DOUBLE, MPI_SUM, mpb_comm); + for (b = 0; b < X.p; ++b) + yparity[b] /= yp_scratch[b]; + free(yp_scratch); + free(norm_scratch); + + return yparity; } /**************************************************************************/ /* to fix problems with slow convergence for k ~ 0, manually "put in" the k = 0 solution: first two bands are constant and higher bands are - orthogonal. Note that in the TE/TM case, only one band is constant. + orthogonal. Note that in the TE/TM case, only one band is constant. Also note that, in Fourier space, a constant field corresponds to 1 in the DC component and 0 elsewhere. */ /* return the number of constant (zero-frequency) bands: */ -int maxwell_zero_k_num_const_bands(evectmatrix X, maxwell_data *d) -{ - int num_const_bands, m_band = 1, n_band = 1; - - CHECK(d, "null maxwell data pointer!"); - CHECK(X.c == 2, "fields don't have 2 components!"); - - if (d->parity & (ODD_Z_PARITY | EVEN_Y_PARITY)) - m_band = 0; - if (d->parity & (ODD_Y_PARITY | EVEN_Z_PARITY)) - n_band = 0; - - num_const_bands = m_band + n_band; - - if (num_const_bands > X.p) - num_const_bands = X.p; - - return num_const_bands; +int maxwell_zero_k_num_const_bands(evectmatrix X, maxwell_data *d) { + int num_const_bands, m_band = 1, n_band = 1; + + CHECK(d, "null maxwell data pointer!"); + CHECK(X.c == 2, "fields don't have 2 components!"); + + if (d->parity & (ODD_Z_PARITY | EVEN_Y_PARITY)) m_band = 0; + if (d->parity & (ODD_Y_PARITY | EVEN_Z_PARITY)) n_band = 0; + + num_const_bands = m_band + n_band; + + if (num_const_bands > X.p) num_const_bands = X.p; + + return num_const_bands; } -void maxwell_zero_k_set_const_bands(evectmatrix X, maxwell_data *d) -{ - int i, j, num_const_bands, m_band = 1, n_band = 1; - - CHECK(d, "null maxwell data pointer!"); - CHECK(X.c == 2, "fields don't have 2 components!"); - - if (X.p < 1) - return; - - num_const_bands = maxwell_zero_k_num_const_bands(X, d); - - /* Initialize num_const_bands to zero: */ - for (i = 0; i < X.n; ++i) - for (j = 0; j < num_const_bands; ++j) { - ASSIGN_ZERO(X.data[i * X.p + j]); - } - - if (X.Nstart > 0) - return; /* DC frequency is not on this process */ - - /* Set DC components to 1 (in two parities) for num_const_bands: */ - - if (d->parity & (ODD_Z_PARITY | EVEN_Y_PARITY)) - m_band = 0; - if (d->parity & (ODD_Y_PARITY | EVEN_Z_PARITY)) - n_band = 0; - - if (m_band) { - ASSIGN_SCALAR(X.data[0], 1.0, 0.0); - ASSIGN_SCALAR(X.data[X.p], 0.0, 0.0); - } - if (n_band && (!m_band || X.p >= 2)) { - ASSIGN_SCALAR(X.data[m_band], 0.0, 0.0); - ASSIGN_SCALAR(X.data[X.p + m_band], 1.0, 0.0); - } +void maxwell_zero_k_set_const_bands(evectmatrix X, maxwell_data *d) { + int i, j, num_const_bands, m_band = 1, n_band = 1; + + CHECK(d, "null maxwell data pointer!"); + CHECK(X.c == 2, "fields don't have 2 components!"); + + if (X.p < 1) return; + + num_const_bands = maxwell_zero_k_num_const_bands(X, d); + + /* Initialize num_const_bands to zero: */ + for (i = 0; i < X.n; ++i) + for (j = 0; j < num_const_bands; ++j) { + ASSIGN_ZERO(X.data[i * X.p + j]); + } + + if (X.Nstart > 0) return; /* DC frequency is not on this process */ + + /* Set DC components to 1 (in two parities) for num_const_bands: */ + + if (d->parity & (ODD_Z_PARITY | EVEN_Y_PARITY)) m_band = 0; + if (d->parity & (ODD_Y_PARITY | EVEN_Z_PARITY)) n_band = 0; + + if (m_band) { + ASSIGN_SCALAR(X.data[0], 1.0, 0.0); + ASSIGN_SCALAR(X.data[X.p], 0.0, 0.0); + } + if (n_band && (!m_band || X.p >= 2)) { + ASSIGN_SCALAR(X.data[m_band], 0.0, 0.0); + ASSIGN_SCALAR(X.data[X.p + m_band], 1.0, 0.0); + } } /* during eigensolution (for upper bands), their DC components are constrained to be zero */ -void maxwell_zero_k_constraint(evectmatrix X, void *data) -{ - int j; - - if (X.Nstart > 0) - return; /* DC frequency is not on this process */ - - for (j = 0; j < X.p; ++j) { - ASSIGN_ZERO(X.data[j]); - ASSIGN_ZERO(X.data[X.p + j]); - } - (void)data; /* avoid warning about unused parameter */ +void maxwell_zero_k_constraint(evectmatrix X, void *data) { + int j; + + if (X.Nstart > 0) return; /* DC frequency is not on this process */ + + for (j = 0; j < X.p; ++j) { + ASSIGN_ZERO(X.data[j]); + ASSIGN_ZERO(X.data[X.p + j]); + } + (void)data; /* avoid warning about unused parameter */ } /**************************************************************************/ - diff --git a/src/maxwell/maxwell_eps.c b/src/maxwell/maxwell_eps.c index a165fc90..4a77a06e 100644 --- a/src/maxwell/maxwell_eps.c +++ b/src/maxwell/maxwell_eps.c @@ -32,240 +32,225 @@ /* Lapack eigenvalue functions */ #ifdef SCALAR_SINGLE_PREC -# define HEEV F77_FUNC(cheev,CHEEV) -# define SYEV F77_FUNC(ssyev,SSYEV) +#define HEEV F77_FUNC(cheev, CHEEV) +#define SYEV F77_FUNC(ssyev, SSYEV) #else -# define HEEV F77_FUNC(zheev,ZHEEV) -# define SYEV F77_FUNC(dsyev,DSYEV) +#define HEEV F77_FUNC(zheev, ZHEEV) +#define SYEV F77_FUNC(dsyev, DSYEV) #endif -extern void HEEV(char *, char *, int *, scalar_complex *, int *, real *, - scalar_complex *, int *, real *, int *); -extern void SYEV(char *, char *, int *, real *, int *, real *, - real *, int *, int *); +extern void HEEV(char *, char *, int *, scalar_complex *, int *, real *, scalar_complex *, int *, + real *, int *); +extern void SYEV(char *, char *, int *, real *, int *, real *, real *, int *, int *); /* compute the 3 real eigenvalues of the matrix V */ -void maxwell_sym_matrix_eigs(real eigs[3], const symmetric_matrix *V) -{ - int n = 3, nw = 9, info; +void maxwell_sym_matrix_eigs(real eigs[3], const symmetric_matrix *V) { + int n = 3, nw = 9, info; #if defined(WITH_HERMITIAN_EPSILON) - scalar_complex Vm[3][3], W[9]; - real W2[9]; - CASSIGN_SCALAR(Vm[0][0], V->m00, 0); - CASSIGN_SCALAR(Vm[1][1], V->m11, 0); - CASSIGN_SCALAR(Vm[2][2], V->m22, 0); - Vm[0][1] = V->m01; CASSIGN_CONJ(Vm[1][0], V->m01); - Vm[0][2] = V->m02; CASSIGN_CONJ(Vm[2][0], V->m02); - Vm[1][2] = V->m12; CASSIGN_CONJ(Vm[2][1], V->m12); - HEEV("V", "U", &n, &Vm[0][0], &n, eigs, W, &nw, W2, &info); + scalar_complex Vm[3][3], W[9]; + real W2[9]; + CASSIGN_SCALAR(Vm[0][0], V->m00, 0); + CASSIGN_SCALAR(Vm[1][1], V->m11, 0); + CASSIGN_SCALAR(Vm[2][2], V->m22, 0); + Vm[0][1] = V->m01; + CASSIGN_CONJ(Vm[1][0], V->m01); + Vm[0][2] = V->m02; + CASSIGN_CONJ(Vm[2][0], V->m02); + Vm[1][2] = V->m12; + CASSIGN_CONJ(Vm[2][1], V->m12); + HEEV("V", "U", &n, &Vm[0][0], &n, eigs, W, &nw, W2, &info); #else - real Vm[3][3], W[9]; - Vm[0][0] = V->m00; - Vm[1][1] = V->m11; - Vm[2][2] = V->m22; - Vm[0][1] = Vm[1][0] = V->m01; - Vm[0][2] = Vm[2][0] = V->m02; - Vm[1][2] = Vm[2][1] = V->m12; - SYEV("V", "U", &n, &Vm[0][0], &n, eigs, W, &nw, &info); + real Vm[3][3], W[9]; + Vm[0][0] = V->m00; + Vm[1][1] = V->m11; + Vm[2][2] = V->m22; + Vm[0][1] = Vm[1][0] = V->m01; + Vm[0][2] = Vm[2][0] = V->m02; + Vm[1][2] = Vm[2][1] = V->m12; + SYEV("V", "U", &n, &Vm[0][0], &n, eigs, W, &nw, &info); #endif - CHECK(info >= 0, "invalid argument in heev"); - CHECK(info <= 0, "failure to converge in heev"); + CHECK(info >= 0, "invalid argument in heev"); + CHECK(info <= 0, "failure to converge in heev"); } /* Set Vinv = inverse of V, where both V and Vinv are real-symmetric (or possibly complex-Hermitian) matrices. */ -void maxwell_sym_matrix_invert(symmetric_matrix *Vinv, - const symmetric_matrix *V) -{ - real m00 = V->m00, m11 = V->m11, m22 = V->m22; +void maxwell_sym_matrix_invert(symmetric_matrix *Vinv, const symmetric_matrix *V) { + real m00 = V->m00, m11 = V->m11, m22 = V->m22; #if defined(WITH_HERMITIAN_EPSILON) - scalar_complex m01 = V->m01, m02 = V->m02, m12 = V->m12; - - if (m01.re == 0.0 && m01.im == 0.0 && - m02.re == 0.0 && m02.im == 0.0 && - m12.re == 0.0 && m12.im == 0.0) { - /* optimize common case of a diagonal matrix: */ - Vinv->m00 = 1.0 / m00; - Vinv->m11 = 1.0 / m11; - Vinv->m22 = 1.0 / m22; - CASSIGN_ZERO(Vinv->m01); - CASSIGN_ZERO(Vinv->m02); - CASSIGN_ZERO(Vinv->m12); - } - else { - double detinv; - - /* compute the determinant: */ - detinv = m00*m11*m22 - m11*CSCALAR_NORMSQR(m02) - - CSCALAR_NORMSQR(m01)*m22 - CSCALAR_NORMSQR(m12)*m00 + - 2.0 * ((m01.re * m12.re - m01.im * m12.im) * m02.re + - (m01.re * m12.im + m01.im * m12.re) * m02.im); - - CHECK(detinv != 0.0, "singular 3x3 matrix"); - - detinv = 1.0/detinv; - - Vinv->m00 = detinv * (m11*m22 - CSCALAR_NORMSQR(m12)); - Vinv->m11 = detinv * (m00*m22 - CSCALAR_NORMSQR(m02)); - Vinv->m22 = detinv * (m11*m00 - CSCALAR_NORMSQR(m01)); - - CASSIGN_SCALAR(Vinv->m02, - detinv * (m01.re*m12.re-m01.im*m12.im - m11*m02.re), - -detinv*(-m01.re*m12.im-m01.im*m12.re + m11*m02.im)); - - CASSIGN_SCALAR(Vinv->m01, - detinv * (m12.re*m02.re+m12.im*m02.im - m22*m01.re), - -detinv * (m12.im*m02.re-m12.re*m02.im + m22*m01.im)); - - CASSIGN_SCALAR(Vinv->m12, - detinv * (m01.re*m02.re+m01.im*m02.im - m00*m12.re), - -detinv * (m01.im*m02.re-m01.re*m02.im + m00*m12.im)); - } - -#else /* real matrix */ - real m01 = V->m01, m02 = V->m02, m12 = V->m12; - - if (m01 == 0.0 && m02 == 0.0 && m12 == 0.0) { - /* optimize common case of a diagonal matrix: */ - Vinv->m00 = 1.0 / m00; - Vinv->m11 = 1.0 / m11; - Vinv->m22 = 1.0 / m22; - Vinv->m01 = Vinv->m02 = Vinv->m12 = 0.0; - } - else { - double detinv; - - /* compute the determinant: */ - detinv = m00*m11*m22 - m02*m11*m02 + 2.0 * m01*m12*m02 - - m01*m01*m22 - m12*m12*m00; - - CHECK(detinv != 0.0, "singular 3x3 matrix"); - - detinv = 1.0/detinv; - - Vinv->m00 = detinv * (m11*m22 - m12*m12); - Vinv->m11 = detinv * (m00*m22 - m02*m02); - Vinv->m22 = detinv * (m11*m00 - m01*m01); - - Vinv->m02 = detinv * (m01*m12 - m11*m02); - Vinv->m01 = detinv * (m12*m02 - m01*m22); - Vinv->m12 = detinv * (m01*m02 - m00*m12); - } + scalar_complex m01 = V->m01, m02 = V->m02, m12 = V->m12; + + if (m01.re == 0.0 && m01.im == 0.0 && m02.re == 0.0 && m02.im == 0.0 && m12.re == 0.0 && + m12.im == 0.0) { + /* optimize common case of a diagonal matrix: */ + Vinv->m00 = 1.0 / m00; + Vinv->m11 = 1.0 / m11; + Vinv->m22 = 1.0 / m22; + CASSIGN_ZERO(Vinv->m01); + CASSIGN_ZERO(Vinv->m02); + CASSIGN_ZERO(Vinv->m12); + } + else { + double detinv; + + /* compute the determinant: */ + detinv = m00 * m11 * m22 - m11 * CSCALAR_NORMSQR(m02) - CSCALAR_NORMSQR(m01) * m22 - + CSCALAR_NORMSQR(m12) * m00 + + 2.0 * ((m01.re * m12.re - m01.im * m12.im) * m02.re + + (m01.re * m12.im + m01.im * m12.re) * m02.im); + + CHECK(detinv != 0.0, "singular 3x3 matrix"); + + detinv = 1.0 / detinv; + + Vinv->m00 = detinv * (m11 * m22 - CSCALAR_NORMSQR(m12)); + Vinv->m11 = detinv * (m00 * m22 - CSCALAR_NORMSQR(m02)); + Vinv->m22 = detinv * (m11 * m00 - CSCALAR_NORMSQR(m01)); + + CASSIGN_SCALAR(Vinv->m02, detinv * (m01.re * m12.re - m01.im * m12.im - m11 * m02.re), + -detinv * (-m01.re * m12.im - m01.im * m12.re + m11 * m02.im)); + + CASSIGN_SCALAR(Vinv->m01, detinv * (m12.re * m02.re + m12.im * m02.im - m22 * m01.re), + -detinv * (m12.im * m02.re - m12.re * m02.im + m22 * m01.im)); + + CASSIGN_SCALAR(Vinv->m12, detinv * (m01.re * m02.re + m01.im * m02.im - m00 * m12.re), + -detinv * (m01.im * m02.re - m01.re * m02.im + m00 * m12.im)); + } + +#else /* real matrix */ + real m01 = V->m01, m02 = V->m02, m12 = V->m12; + + if (m01 == 0.0 && m02 == 0.0 && m12 == 0.0) { + /* optimize common case of a diagonal matrix: */ + Vinv->m00 = 1.0 / m00; + Vinv->m11 = 1.0 / m11; + Vinv->m22 = 1.0 / m22; + Vinv->m01 = Vinv->m02 = Vinv->m12 = 0.0; + } + else { + double detinv; + + /* compute the determinant: */ + detinv = m00 * m11 * m22 - m02 * m11 * m02 + 2.0 * m01 * m12 * m02 - m01 * m01 * m22 - + m12 * m12 * m00; + + CHECK(detinv != 0.0, "singular 3x3 matrix"); + + detinv = 1.0 / detinv; + + Vinv->m00 = detinv * (m11 * m22 - m12 * m12); + Vinv->m11 = detinv * (m00 * m22 - m02 * m02); + Vinv->m22 = detinv * (m11 * m00 - m01 * m01); + + Vinv->m02 = detinv * (m01 * m12 - m11 * m02); + Vinv->m01 = detinv * (m12 * m02 - m01 * m22); + Vinv->m12 = detinv * (m01 * m02 - m00 * m12); + } #endif /* real matrix */ } /* Returns whether or not V is positive-definite. */ -int maxwell_sym_matrix_positive_definite(symmetric_matrix *V) -{ - real det2, det3; - real m00 = V->m00, m11 = V->m11, m22 = V->m22; +int maxwell_sym_matrix_positive_definite(symmetric_matrix *V) { + real det2, det3; + real m00 = V->m00, m11 = V->m11, m22 = V->m22; #if defined(WITH_HERMITIAN_EPSILON) - scalar_complex m01 = V->m01, m02 = V->m02, m12 = V->m12; + scalar_complex m01 = V->m01, m02 = V->m02, m12 = V->m12; - det2 = m00*m11 - CSCALAR_NORMSQR(m01); - det3 = det2*m22 - m11*CSCALAR_NORMSQR(m02) - CSCALAR_NORMSQR(m12)*m00 + - 2.0 * ((m01.re * m12.re - m01.im * m12.im) * m02.re + - (m01.re * m12.im + m01.im * m12.re) * m02.im); -#else /* real matrix */ - real m01 = V->m01, m02 = V->m02, m12 = V->m12; + det2 = m00 * m11 - CSCALAR_NORMSQR(m01); + det3 = det2 * m22 - m11 * CSCALAR_NORMSQR(m02) - CSCALAR_NORMSQR(m12) * m00 + + 2.0 * ((m01.re * m12.re - m01.im * m12.im) * m02.re + + (m01.re * m12.im + m01.im * m12.re) * m02.im); +#else /* real matrix */ + real m01 = V->m01, m02 = V->m02, m12 = V->m12; - det2 = m00*m11 - m01*m01; - det3 = det2*m22 - m02*m11*m02 + 2.0 * m01*m12*m02 - m12*m12*m00; + det2 = m00 * m11 - m01 * m01; + det3 = det2 * m22 - m02 * m11 * m02 + 2.0 * m01 * m12 * m02 - m12 * m12 * m00; #endif /* real matrix */ - return (m00 > 0.0 && det2 > 0.0 && det3 > 0.0); + return (m00 > 0.0 && det2 > 0.0 && det3 > 0.0); } -#define EQ(x1,x2) (fabs((x1) - (x2)) < tol) -static int sym_matrix_eq(symmetric_matrix V1, symmetric_matrix V2, double tol) -{ - if (!EQ(V1.m00,V2.m00) || !EQ(V1.m11,V2.m11) || !EQ(V1.m22,V2.m22)) - return 0; +#define EQ(x1, x2) (fabs((x1) - (x2)) < tol) +static int sym_matrix_eq(symmetric_matrix V1, symmetric_matrix V2, double tol) { + if (!EQ(V1.m00, V2.m00) || !EQ(V1.m11, V2.m11) || !EQ(V1.m22, V2.m22)) return 0; #if defined(WITH_HERMITIAN_EPSILON) - return(EQ(V1.m01.re,V2.m01.re) && EQ(V1.m01.im,V2.m01.im) && - EQ(V1.m02.re,V2.m02.re) && EQ(V1.m02.im,V2.m02.im) && - EQ(V1.m12.re,V2.m12.re) && EQ(V1.m12.im,V2.m12.im)); + return (EQ(V1.m01.re, V2.m01.re) && EQ(V1.m01.im, V2.m01.im) && EQ(V1.m02.re, V2.m02.re) && + EQ(V1.m02.im, V2.m02.im) && EQ(V1.m12.re, V2.m12.re) && EQ(V1.m12.im, V2.m12.im)); #else - return(EQ(V1.m01,V2.m01) && EQ(V1.m02,V2.m02) && EQ(V1.m12,V2.m12)); + return (EQ(V1.m01, V2.m01) && EQ(V1.m02, V2.m02) && EQ(V1.m12, V2.m12)); #endif } /* rotate A by a unitary (real) rotation matrix R: RAR = transpose(R) * A * R */ -void maxwell_sym_matrix_rotate(symmetric_matrix *RAR, - const symmetric_matrix *A_, - double R[3][3]) -{ - int i,j; - double A[3][3], AR[3][3]; - A[0][0] = A_->m00; - A[1][1] = A_->m11; - A[2][2] = A_->m22; - A[0][1] = A[1][0] = ESCALAR_RE(A_->m01); - A[0][2] = A[2][0] = ESCALAR_RE(A_->m02); - A[1][2] = A[2][1] = ESCALAR_RE(A_->m12); - for (i = 0; i < 3; ++i) for (j = 0; j < 3; ++j) - AR[i][j] = A[i][0]*R[0][j] + A[i][1]*R[1][j] + A[i][2]*R[2][j]; - for (i = 0; i < 3; ++i) for (j = i; j < 3; ++j) - A[i][j] = R[0][i]*AR[0][j] + R[1][i]*AR[1][j] + R[2][i]*AR[2][j]; - RAR->m00 = A[0][0]; - RAR->m11 = A[1][1]; - RAR->m22 = A[2][2]; - ESCALAR_RE(RAR->m01) = A[0][1]; - ESCALAR_RE(RAR->m02) = A[0][2]; - ESCALAR_RE(RAR->m12) = A[1][2]; +void maxwell_sym_matrix_rotate(symmetric_matrix *RAR, const symmetric_matrix *A_, double R[3][3]) { + int i, j; + double A[3][3], AR[3][3]; + A[0][0] = A_->m00; + A[1][1] = A_->m11; + A[2][2] = A_->m22; + A[0][1] = A[1][0] = ESCALAR_RE(A_->m01); + A[0][2] = A[2][0] = ESCALAR_RE(A_->m02); + A[1][2] = A[2][1] = ESCALAR_RE(A_->m12); + for (i = 0; i < 3; ++i) + for (j = 0; j < 3; ++j) + AR[i][j] = A[i][0] * R[0][j] + A[i][1] * R[1][j] + A[i][2] * R[2][j]; + for (i = 0; i < 3; ++i) + for (j = i; j < 3; ++j) + A[i][j] = R[0][i] * AR[0][j] + R[1][i] * AR[1][j] + R[2][i] * AR[2][j]; + RAR->m00 = A[0][0]; + RAR->m11 = A[1][1]; + RAR->m22 = A[2][2]; + ESCALAR_RE(RAR->m01) = A[0][1]; + ESCALAR_RE(RAR->m02) = A[0][2]; + ESCALAR_RE(RAR->m12) = A[1][2]; #if defined(WITH_HERMITIAN_EPSILON) - A[0][0] = A[1][1] = A[2][2] = 0; - A[1][0] = -(A[0][1] = ESCALAR_IM(A_->m01)); - A[2][0] = -(A[0][2] = ESCALAR_IM(A_->m02)); - A[2][1] = -(A[1][2] = ESCALAR_IM(A_->m12)); - for (i = 0; i < 3; ++i) for (j = 0; j < 3; ++j) - AR[i][j] = A[i][0]*R[0][j] + A[i][1]*R[1][j] + A[i][2]*R[2][j]; - for (i = 0; i < 3; ++i) for (j = i; j < 3; ++j) - A[i][j] = R[0][i]*AR[0][j] + R[1][i]*AR[1][j] + R[2][i]*AR[2][j]; - ESCALAR_IM(RAR->m01) = A[0][1]; - ESCALAR_IM(RAR->m02) = A[0][2]; - ESCALAR_IM(RAR->m12) = A[1][2]; + A[0][0] = A[1][1] = A[2][2] = 0; + A[1][0] = -(A[0][1] = ESCALAR_IM(A_->m01)); + A[2][0] = -(A[0][2] = ESCALAR_IM(A_->m02)); + A[2][1] = -(A[1][2] = ESCALAR_IM(A_->m12)); + for (i = 0; i < 3; ++i) + for (j = 0; j < 3; ++j) + AR[i][j] = A[i][0] * R[0][j] + A[i][1] * R[1][j] + A[i][2] * R[2][j]; + for (i = 0; i < 3; ++i) + for (j = i; j < 3; ++j) + A[i][j] = R[0][i] * AR[0][j] + R[1][i] * AR[1][j] + R[2][i] * AR[2][j]; + ESCALAR_IM(RAR->m01) = A[0][1]; + ESCALAR_IM(RAR->m02) = A[0][2]; + ESCALAR_IM(RAR->m12) = A[1][2]; #endif } /**************************************************************************/ -int check_maxwell_dielectric(maxwell_data *d, - int negative_epsilon_okp) -{ - int i, require_2d; +int check_maxwell_dielectric(maxwell_data *d, int negative_epsilon_okp) { + int i, require_2d; - require_2d = d->nz == 1 && (d->parity & (EVEN_Z_PARITY | ODD_Z_PARITY)); + require_2d = d->nz == 1 && (d->parity & (EVEN_Z_PARITY | ODD_Z_PARITY)); - for (i = 0; i < d->fft_output_size; ++i) { - if (!negative_epsilon_okp && - !maxwell_sym_matrix_positive_definite(d->eps_inv + i)) - return 1; - if (require_2d) { + for (i = 0; i < d->fft_output_size; ++i) { + if (!negative_epsilon_okp && !maxwell_sym_matrix_positive_definite(d->eps_inv + i)) return 1; + if (require_2d) { #if defined(WITH_HERMITIAN_EPSILON) - if (d->eps_inv[i].m02.re != 0.0 || - d->eps_inv[i].m02.im != 0.0 || - d->eps_inv[i].m12.re != 0.0 || - d->eps_inv[i].m12.im != 0.0) - return 2; -#else /* real matrix */ - if (d->eps_inv[i].m02 != 0.0 || d->eps_inv[i].m12 != 0.0) - return 2; + if (d->eps_inv[i].m02.re != 0.0 || d->eps_inv[i].m02.im != 0.0 || + d->eps_inv[i].m12.re != 0.0 || d->eps_inv[i].m12.im != 0.0) + return 2; +#else /* real matrix */ + if (d->eps_inv[i].m02 != 0.0 || d->eps_inv[i].m12 != 0.0) return 2; #endif /* real matrix */ - } - } - return 0; + } + } + return 0; } /**************************************************************************/ #define K_PI 3.141592653589793238462643383279502884197 #define SMALL 1.0e-6 -#define MAX2(a,b) ((a) > (b) ? (a) : (b)) -#define MIN2(a,b) ((a) < (b) ? (a) : (b)) +#define MAX2(a, b) ((a) > (b) ? (a) : (b)) +#define MIN2(a, b) ((a) < (b) ? (a) : (b)) #define MAX_MOMENT_MESH NQUAD /* max # of moment-mesh vectors */ #define MOMENT_MESH_R 0.5 @@ -281,79 +266,74 @@ int check_maxwell_dielectric(maxwell_data *d, moment_mesh: an array of size_moment_mesh vectors, in lattice coordinates, of a "spherically-symmetric" mesh of points centered on the origin, designed to be - used for averaging the first moment of epsilon at - a grid point (for finding the local surface normal). + used for averaging the first moment of epsilon at + a grid point (for finding the local surface normal). moment_mesh_weights: an array of size_moment_mesh weights to multiply the integrand values by. */ -static void get_mesh(int nx, int ny, int nz, const int mesh_size[3], - real R[3][3], real G[3][3], - real mesh_center[3], int *mesh_prod, - real moment_mesh[MAX_MOMENT_MESH][3], - real moment_mesh_weights[MAX_MOMENT_MESH], - int *size_moment_mesh) -{ - int i,j; - real min_diam = 1e20; - real mesh_total[3] = { 0, 0, 0 }; - int rank = nz > 1 ? 3 : (ny > 1 ? 2 : 1); - real weight_sum = 0.0; - - *mesh_prod = 1; - for (i = 0; i < 3; ++i) { - int ms = MAX2(mesh_size[i], 1); - mesh_center[i] = (ms - 1) * 0.5; - *mesh_prod *= ms; - } - - *size_moment_mesh = num_sphere_quad[rank-1]; - for (i = 0; i < num_sphere_quad[rank-1]; ++i) { - for (j = 0; j < 3; ++j) - moment_mesh[i][j] = sphere_quad[rank-1][i][j] * MOMENT_MESH_R; - moment_mesh_weights[i] = sphere_quad[rank-1][i][3]; - } - - CHECK(*size_moment_mesh <= MAX_MOMENT_MESH, "yikes, moment mesh too big"); - - for (i = 0; i < *size_moment_mesh; ++i) - weight_sum += moment_mesh_weights[i]; - CHECK(fabs(weight_sum - 1.0) < SMALL, "bug, incorrect moment weights"); - - /* scale the moment-mesh vectors so that the sphere has a - diameter of 2*MOMENT_MESH_R times the diameter of the - smallest grid direction: */ - - /* first, find the minimum distance between grid points along the - lattice directions (should we use the maximum instead?): */ - for (i = 0; i < rank; ++i) { - real ri = R[i][0] * R[i][0] + R[i][1] * R[i][1] + R[i][2] * R[i][2]; - ri = sqrt(ri) / (i == 0 ? nx : (i == 1 ? ny : nz)); - min_diam = MIN2(min_diam, ri); - } - - /* scale moment_mesh by this diameter: */ - for (i = 0; i < *size_moment_mesh; ++i) { - real len = 0; - for (j = 0; j < 3; ++j) { - moment_mesh[i][j] *= min_diam; - len += moment_mesh[i][j] * moment_mesh[i][j]; - mesh_total[j] += moment_mesh[i][j]; - } - CHECK(fabs(len - min_diam*min_diam*(MOMENT_MESH_R*MOMENT_MESH_R)) - < SMALL, - "bug in get_mesh: moment_mesh vector is wrong length"); - } - CHECK(fabs(mesh_total[0]) + fabs(mesh_total[1]) + fabs(mesh_total[2]) - < SMALL, "bug in get_mesh: moment_mesh does not average to zero"); - - /* Now, convert the moment_mesh vectors to lattice/grid coordinates; - to do this, we multiply by the G matrix (inverse of R transposed) */ - for (i = 0; i < *size_moment_mesh; ++i) { - real v[3]; - for (j = 0; j < 3; ++j) /* make a copy of moment_mesh[i] */ - v[j] = moment_mesh[i][j]; - for (j = 0; j < 3; ++j) - moment_mesh[i][j] = G[j][0]*v[0] + G[j][1]*v[1] + G[j][2]*v[2]; - } +static void get_mesh(int nx, int ny, int nz, const int mesh_size[3], real R[3][3], real G[3][3], + real mesh_center[3], int *mesh_prod, real moment_mesh[MAX_MOMENT_MESH][3], + real moment_mesh_weights[MAX_MOMENT_MESH], int *size_moment_mesh) { + int i, j; + real min_diam = 1e20; + real mesh_total[3] = {0, 0, 0}; + int rank = nz > 1 ? 3 : (ny > 1 ? 2 : 1); + real weight_sum = 0.0; + + *mesh_prod = 1; + for (i = 0; i < 3; ++i) { + int ms = MAX2(mesh_size[i], 1); + mesh_center[i] = (ms - 1) * 0.5; + *mesh_prod *= ms; + } + + *size_moment_mesh = num_sphere_quad[rank - 1]; + for (i = 0; i < num_sphere_quad[rank - 1]; ++i) { + for (j = 0; j < 3; ++j) + moment_mesh[i][j] = sphere_quad[rank - 1][i][j] * MOMENT_MESH_R; + moment_mesh_weights[i] = sphere_quad[rank - 1][i][3]; + } + + CHECK(*size_moment_mesh <= MAX_MOMENT_MESH, "yikes, moment mesh too big"); + + for (i = 0; i < *size_moment_mesh; ++i) + weight_sum += moment_mesh_weights[i]; + CHECK(fabs(weight_sum - 1.0) < SMALL, "bug, incorrect moment weights"); + + /* scale the moment-mesh vectors so that the sphere has a + diameter of 2*MOMENT_MESH_R times the diameter of the + smallest grid direction: */ + + /* first, find the minimum distance between grid points along the + lattice directions (should we use the maximum instead?): */ + for (i = 0; i < rank; ++i) { + real ri = R[i][0] * R[i][0] + R[i][1] * R[i][1] + R[i][2] * R[i][2]; + ri = sqrt(ri) / (i == 0 ? nx : (i == 1 ? ny : nz)); + min_diam = MIN2(min_diam, ri); + } + + /* scale moment_mesh by this diameter: */ + for (i = 0; i < *size_moment_mesh; ++i) { + real len = 0; + for (j = 0; j < 3; ++j) { + moment_mesh[i][j] *= min_diam; + len += moment_mesh[i][j] * moment_mesh[i][j]; + mesh_total[j] += moment_mesh[i][j]; + } + CHECK(fabs(len - min_diam * min_diam * (MOMENT_MESH_R * MOMENT_MESH_R)) < SMALL, + "bug in get_mesh: moment_mesh vector is wrong length"); + } + CHECK(fabs(mesh_total[0]) + fabs(mesh_total[1]) + fabs(mesh_total[2]) < SMALL, + "bug in get_mesh: moment_mesh does not average to zero"); + + /* Now, convert the moment_mesh vectors to lattice/grid coordinates; + to do this, we multiply by the G matrix (inverse of R transposed) */ + for (i = 0; i < *size_moment_mesh; ++i) { + real v[3]; + for (j = 0; j < 3; ++j) /* make a copy of moment_mesh[i] */ + v[j] = moment_mesh[i][j]; + for (j = 0; j < 3; ++j) + moment_mesh[i][j] = G[j][0] * v[0] + G[j][1] * v[1] + G[j][2] * v[2]; + } } /**************************************************************************/ @@ -380,370 +360,344 @@ static void get_mesh(int nx, int ny, int nz, const int mesh_size[3], the output of the FFT. Thus, its dimensions depend upon whether we are doing a real or complex and serial or parallel FFT. */ -void set_maxwell_dielectric(maxwell_data *md, - const int mesh_size[3], - real R[3][3], real G[3][3], - maxwell_dielectric_function epsilon, - maxwell_dielectric_mean_function mepsilon, - void *epsilon_data) -{ - real s1, s2, s3, m1, m2, m3; /* grid/mesh steps */ - real mesh_center[3]; - real moment_mesh[MAX_MOMENT_MESH][3]; - real moment_mesh_weights[MAX_MOMENT_MESH]; - real eps_inv_total = 0.0; - int i, j, k; - int mesh_prod; - real mesh_prod_inv; - int size_moment_mesh = 0; - int n1, n2, n3; +void set_maxwell_dielectric(maxwell_data *md, const int mesh_size[3], real R[3][3], real G[3][3], + maxwell_dielectric_function epsilon, + maxwell_dielectric_mean_function mepsilon, void *epsilon_data) { + real s1, s2, s3, m1, m2, m3; /* grid/mesh steps */ + real mesh_center[3]; + real moment_mesh[MAX_MOMENT_MESH][3]; + real moment_mesh_weights[MAX_MOMENT_MESH]; + real eps_inv_total = 0.0; + int i, j, k; + int mesh_prod; + real mesh_prod_inv; + int size_moment_mesh = 0; + int n1, n2, n3; #ifdef HAVE_MPI - int local_n2, local_y_start, local_n3; + int local_n2, local_y_start, local_n3; #endif #ifndef SCALAR_COMPLEX - int n_other, n_last, rank; + int n_other, n_last, rank; #endif - n1 = md->nx; n2 = md->ny; n3 = md->nz; + n1 = md->nx; + n2 = md->ny; + n3 = md->nz; - get_mesh(n1, n2, n3, mesh_size, R, G, - mesh_center, &mesh_prod, moment_mesh, moment_mesh_weights, - &size_moment_mesh); - mesh_prod_inv = 1.0 / mesh_prod; + get_mesh(n1, n2, n3, mesh_size, R, G, mesh_center, &mesh_prod, moment_mesh, moment_mesh_weights, + &size_moment_mesh); + mesh_prod_inv = 1.0 / mesh_prod; - s1 = 1.0 / n1; - s2 = 1.0 / n2; - s3 = 1.0 / n3; - m1 = s1 / MAX2(1, mesh_size[0]); - m2 = s2 / MAX2(1, mesh_size[1]); - m3 = s3 / MAX2(1, mesh_size[2]); + s1 = 1.0 / n1; + s2 = 1.0 / n2; + s3 = 1.0 / n3; + m1 = s1 / MAX2(1, mesh_size[0]); + m2 = s2 / MAX2(1, mesh_size[1]); + m3 = s3 / MAX2(1, mesh_size[2]); - LOOP_XYZ(md) { - int mi, mj, mk; + LOOP_XYZ(md) { + int mi, mj, mk; #ifdef WITH_HERMITIAN_EPSILON - symmetric_matrix eps_mean, eps_inv_mean, eps_mean_inv; + symmetric_matrix eps_mean, eps_inv_mean, eps_mean_inv; #else - symmetric_matrix eps_mean, eps_inv_mean, eps_mean_inv; + symmetric_matrix eps_mean, eps_inv_mean, eps_mean_inv; #endif - real norm_len; - real norm0, norm1, norm2; - short means_different_p, diag_eps_p; - - { - real r[3], normal[3]; - r[0] = i1 * s1; - r[1] = i2 * s2; - r[2] = i3 * s3; - if (mepsilon && mepsilon(&eps_mean, &eps_inv_mean, normal, - s1, s2, s3, mesh_prod_inv, - r, epsilon_data)) { - - maxwell_sym_matrix_invert(md->eps_inv + xyz_index, - &eps_mean); - goto got_eps_inv; - - norm0 = R[0][0] * normal[0] + R[1][0] * normal[1] - + R[2][0] * normal[2]; - norm1 = R[0][1] * normal[0] + R[1][1] * normal[1] - + R[2][1] * normal[2]; - norm2 = R[0][2] * normal[0] + R[1][2] * normal[1] - + R[2][2] * normal[2]; - means_different_p = 1; - diag_eps_p = DIAG_SYMMETRIC_MATRIX(eps_mean); - maxwell_sym_matrix_invert(&eps_mean_inv, &eps_mean); + real norm_len; + real norm0, norm1, norm2; + short means_different_p, diag_eps_p; + + { + real r[3], normal[3]; + r[0] = i1 * s1; + r[1] = i2 * s2; + r[2] = i3 * s3; + if (mepsilon && + mepsilon(&eps_mean, &eps_inv_mean, normal, s1, s2, s3, mesh_prod_inv, r, epsilon_data)) { + + maxwell_sym_matrix_invert(md->eps_inv + xyz_index, &eps_mean); + goto got_eps_inv; + + norm0 = R[0][0] * normal[0] + R[1][0] * normal[1] + R[2][0] * normal[2]; + norm1 = R[0][1] * normal[0] + R[1][1] * normal[1] + R[2][1] * normal[2]; + norm2 = R[0][2] * normal[0] + R[1][2] * normal[1] + R[2][2] * normal[2]; + means_different_p = 1; + diag_eps_p = DIAG_SYMMETRIC_MATRIX(eps_mean); + maxwell_sym_matrix_invert(&eps_mean_inv, &eps_mean); #if !defined(SCALAR_COMPLEX) && 0 /* check inversion symmetry */ - { - symmetric_matrix eps_mean2, eps_inv_mean2; - real normal2[3], r2[3], nc[3]; - r2[0] = n1 == 0 ? r[0] : 1.0 - r[0]; - r2[1] = n2 == 0 ? r[1] : 1.0 - r[1]; - r2[2] = n3 == 0 ? r[2] : 1.0 - r[2]; - CHECK(mepsilon(&eps_mean2, &eps_inv_mean2, normal2, - s1, s2, s3, mesh_prod_inv, - r2, epsilon_data), - "mepsilon symmetry is broken"); - CHECK(sym_matrix_eq(eps_mean,eps_mean2,1e-10) && - sym_matrix_eq(eps_inv_mean,eps_inv_mean2,1e-10), - "inversion symmetry is broken"); - nc[0] = normal[1]*normal2[2] - normal[2]*normal2[1]; - nc[1] = normal[2]*normal2[0] - normal[0]*normal2[2]; - nc[2] = normal[0]*normal2[1] - normal[1]*normal2[0]; - CHECK(sqrt(nc[0]*nc[0]+nc[1]*nc[1]+nc[2]*nc[2])<1e-6, - "normal-vector symmetry is broken"); - } + { + symmetric_matrix eps_mean2, eps_inv_mean2; + real normal2[3], r2[3], nc[3]; + r2[0] = n1 == 0 ? r[0] : 1.0 - r[0]; + r2[1] = n2 == 0 ? r[1] : 1.0 - r[1]; + r2[2] = n3 == 0 ? r[2] : 1.0 - r[2]; + CHECK(mepsilon(&eps_mean2, &eps_inv_mean2, normal2, s1, s2, s3, mesh_prod_inv, r2, + epsilon_data), + "mepsilon symmetry is broken"); + CHECK(sym_matrix_eq(eps_mean, eps_mean2, 1e-10) && + sym_matrix_eq(eps_inv_mean, eps_inv_mean2, 1e-10), + "inversion symmetry is broken"); + nc[0] = normal[1] * normal2[2] - normal[2] * normal2[1]; + nc[1] = normal[2] * normal2[0] - normal[0] * normal2[2]; + nc[2] = normal[0] * normal2[1] - normal[1] * normal2[0]; + CHECK(sqrt(nc[0] * nc[0] + nc[1] * nc[1] + nc[2] * nc[2]) < 1e-6, + "normal-vector symmetry is broken"); + } #endif - goto got_mean; - } - } - - eps_mean.m00 = eps_mean.m11 = eps_mean.m22 = - eps_inv_mean.m00 = eps_inv_mean.m11 = eps_inv_mean.m22 = 0.0; - ASSIGN_ESCALAR(eps_mean.m01, 0,0); - ASSIGN_ESCALAR(eps_mean.m02, 0,0); - ASSIGN_ESCALAR(eps_mean.m12, 0,0); - ASSIGN_ESCALAR(eps_inv_mean.m01, 0,0); - ASSIGN_ESCALAR(eps_inv_mean.m02, 0,0); - ASSIGN_ESCALAR(eps_inv_mean.m12, 0,0); - - for (mi = 0; mi < mesh_size[0]; ++mi) - for (mj = 0; mj < mesh_size[1]; ++mj) - for (mk = 0; mk < mesh_size[2]; ++mk) { - real r[3]; - symmetric_matrix eps, eps_inv; - r[0] = i1 * s1 + (mi - mesh_center[0]) * m1; - r[1] = i2 * s2 + (mj - mesh_center[1]) * m2; - r[2] = i3 * s3 + (mk - mesh_center[2]) * m3; - epsilon(&eps, &eps_inv, r, epsilon_data); - eps_mean.m00 += eps.m00; - eps_mean.m11 += eps.m11; - eps_mean.m22 += eps.m22; - eps_inv_mean.m00 += eps_inv.m00; - eps_inv_mean.m11 += eps_inv.m11; - eps_inv_mean.m22 += eps_inv.m22; + goto got_mean; + } + } + + eps_mean.m00 = eps_mean.m11 = eps_mean.m22 = eps_inv_mean.m00 = eps_inv_mean.m11 = + eps_inv_mean.m22 = 0.0; + ASSIGN_ESCALAR(eps_mean.m01, 0, 0); + ASSIGN_ESCALAR(eps_mean.m02, 0, 0); + ASSIGN_ESCALAR(eps_mean.m12, 0, 0); + ASSIGN_ESCALAR(eps_inv_mean.m01, 0, 0); + ASSIGN_ESCALAR(eps_inv_mean.m02, 0, 0); + ASSIGN_ESCALAR(eps_inv_mean.m12, 0, 0); + + for (mi = 0; mi < mesh_size[0]; ++mi) + for (mj = 0; mj < mesh_size[1]; ++mj) + for (mk = 0; mk < mesh_size[2]; ++mk) { + real r[3]; + symmetric_matrix eps, eps_inv; + r[0] = i1 * s1 + (mi - mesh_center[0]) * m1; + r[1] = i2 * s2 + (mj - mesh_center[1]) * m2; + r[2] = i3 * s3 + (mk - mesh_center[2]) * m3; + epsilon(&eps, &eps_inv, r, epsilon_data); + eps_mean.m00 += eps.m00; + eps_mean.m11 += eps.m11; + eps_mean.m22 += eps.m22; + eps_inv_mean.m00 += eps_inv.m00; + eps_inv_mean.m11 += eps_inv.m11; + eps_inv_mean.m22 += eps_inv.m22; #ifdef WITH_HERMITIAN_EPSILON - CACCUMULATE_SUM(eps_mean.m01, eps.m01); - CACCUMULATE_SUM(eps_mean.m02, eps.m02); - CACCUMULATE_SUM(eps_mean.m12, eps.m12); - CACCUMULATE_SUM(eps_inv_mean.m01, eps_inv.m01); - CACCUMULATE_SUM(eps_inv_mean.m02, eps_inv.m02); - CACCUMULATE_SUM(eps_inv_mean.m12, eps_inv.m12); + CACCUMULATE_SUM(eps_mean.m01, eps.m01); + CACCUMULATE_SUM(eps_mean.m02, eps.m02); + CACCUMULATE_SUM(eps_mean.m12, eps.m12); + CACCUMULATE_SUM(eps_inv_mean.m01, eps_inv.m01); + CACCUMULATE_SUM(eps_inv_mean.m02, eps_inv.m02); + CACCUMULATE_SUM(eps_inv_mean.m12, eps_inv.m12); #else - eps_mean.m01 += eps.m01; - eps_mean.m02 += eps.m02; - eps_mean.m12 += eps.m12; - eps_inv_mean.m01 += eps_inv.m01; - eps_inv_mean.m02 += eps_inv.m02; - eps_inv_mean.m12 += eps_inv.m12; + eps_mean.m01 += eps.m01; + eps_mean.m02 += eps.m02; + eps_mean.m12 += eps.m12; + eps_inv_mean.m01 += eps_inv.m01; + eps_inv_mean.m02 += eps_inv.m02; + eps_inv_mean.m12 += eps_inv.m12; #endif - } + } - diag_eps_p = DIAG_SYMMETRIC_MATRIX(eps_mean); - if (diag_eps_p) { /* handle the common case of diagonal matrices: */ - eps_mean_inv.m00 = mesh_prod / eps_mean.m00; - eps_mean_inv.m11 = mesh_prod / eps_mean.m11; - eps_mean_inv.m22 = mesh_prod / eps_mean.m22; + diag_eps_p = DIAG_SYMMETRIC_MATRIX(eps_mean); + if (diag_eps_p) { /* handle the common case of diagonal matrices: */ + eps_mean_inv.m00 = mesh_prod / eps_mean.m00; + eps_mean_inv.m11 = mesh_prod / eps_mean.m11; + eps_mean_inv.m22 = mesh_prod / eps_mean.m22; #ifdef WITH_HERMITIAN_EPSILON - CASSIGN_ZERO(eps_mean_inv.m01); - CASSIGN_ZERO(eps_mean_inv.m02); - CASSIGN_ZERO(eps_mean_inv.m12); + CASSIGN_ZERO(eps_mean_inv.m01); + CASSIGN_ZERO(eps_mean_inv.m02); + CASSIGN_ZERO(eps_mean_inv.m12); #else - eps_mean_inv.m01 = eps_mean_inv.m02 = eps_mean_inv.m12 = 0.0; + eps_mean_inv.m01 = eps_mean_inv.m02 = eps_mean_inv.m12 = 0.0; #endif - eps_inv_mean.m00 *= mesh_prod_inv; - eps_inv_mean.m11 *= mesh_prod_inv; - eps_inv_mean.m22 *= mesh_prod_inv; - - means_different_p = - fabs(eps_mean_inv.m00 - eps_inv_mean.m00) > SMALL || - fabs(eps_mean_inv.m11 - eps_inv_mean.m11) > SMALL || - fabs(eps_mean_inv.m22 - eps_inv_mean.m22) > SMALL; - } - else { - eps_inv_mean.m00 *= mesh_prod_inv; - eps_inv_mean.m11 *= mesh_prod_inv; - eps_inv_mean.m22 *= mesh_prod_inv; - eps_mean.m00 *= mesh_prod_inv; - eps_mean.m11 *= mesh_prod_inv; - eps_mean.m22 *= mesh_prod_inv; + eps_inv_mean.m00 *= mesh_prod_inv; + eps_inv_mean.m11 *= mesh_prod_inv; + eps_inv_mean.m22 *= mesh_prod_inv; + + means_different_p = fabs(eps_mean_inv.m00 - eps_inv_mean.m00) > SMALL || + fabs(eps_mean_inv.m11 - eps_inv_mean.m11) > SMALL || + fabs(eps_mean_inv.m22 - eps_inv_mean.m22) > SMALL; + } + else { + eps_inv_mean.m00 *= mesh_prod_inv; + eps_inv_mean.m11 *= mesh_prod_inv; + eps_inv_mean.m22 *= mesh_prod_inv; + eps_mean.m00 *= mesh_prod_inv; + eps_mean.m11 *= mesh_prod_inv; + eps_mean.m22 *= mesh_prod_inv; #ifdef WITH_HERMITIAN_EPSILON - eps_mean.m01.re *= mesh_prod_inv; - eps_mean.m01.im *= mesh_prod_inv; - eps_mean.m02.re *= mesh_prod_inv; - eps_mean.m02.im *= mesh_prod_inv; - eps_mean.m12.re *= mesh_prod_inv; - eps_mean.m12.im *= mesh_prod_inv; - eps_inv_mean.m01.re *= mesh_prod_inv; - eps_inv_mean.m01.im *= mesh_prod_inv; - eps_inv_mean.m02.re *= mesh_prod_inv; - eps_inv_mean.m02.im *= mesh_prod_inv; - eps_inv_mean.m12.re *= mesh_prod_inv; - eps_inv_mean.m12.im *= mesh_prod_inv; + eps_mean.m01.re *= mesh_prod_inv; + eps_mean.m01.im *= mesh_prod_inv; + eps_mean.m02.re *= mesh_prod_inv; + eps_mean.m02.im *= mesh_prod_inv; + eps_mean.m12.re *= mesh_prod_inv; + eps_mean.m12.im *= mesh_prod_inv; + eps_inv_mean.m01.re *= mesh_prod_inv; + eps_inv_mean.m01.im *= mesh_prod_inv; + eps_inv_mean.m02.re *= mesh_prod_inv; + eps_inv_mean.m02.im *= mesh_prod_inv; + eps_inv_mean.m12.re *= mesh_prod_inv; + eps_inv_mean.m12.im *= mesh_prod_inv; #else - eps_mean.m01 *= mesh_prod_inv; - eps_mean.m02 *= mesh_prod_inv; - eps_mean.m12 *= mesh_prod_inv; - eps_inv_mean.m01 *= mesh_prod_inv; - eps_inv_mean.m02 *= mesh_prod_inv; - eps_inv_mean.m12 *= mesh_prod_inv; + eps_mean.m01 *= mesh_prod_inv; + eps_mean.m02 *= mesh_prod_inv; + eps_mean.m12 *= mesh_prod_inv; + eps_inv_mean.m01 *= mesh_prod_inv; + eps_inv_mean.m02 *= mesh_prod_inv; + eps_inv_mean.m12 *= mesh_prod_inv; #endif - maxwell_sym_matrix_invert(&eps_mean_inv, &eps_mean); + maxwell_sym_matrix_invert(&eps_mean_inv, &eps_mean); - means_different_p = - fabs(eps_mean_inv.m00 - eps_inv_mean.m00) > SMALL || - fabs(eps_mean_inv.m11 - eps_inv_mean.m11) > SMALL || - fabs(eps_mean_inv.m22 - eps_inv_mean.m22) > SMALL; + means_different_p = fabs(eps_mean_inv.m00 - eps_inv_mean.m00) > SMALL || + fabs(eps_mean_inv.m11 - eps_inv_mean.m11) > SMALL || + fabs(eps_mean_inv.m22 - eps_inv_mean.m22) > SMALL; #ifdef WITH_HERMITIAN_EPSILON - means_different_p = means_different_p || - fabs(eps_mean_inv.m01.re - eps_inv_mean.m01.re) > SMALL || - fabs(eps_mean_inv.m02.re - eps_inv_mean.m02.re) > SMALL || - fabs(eps_mean_inv.m12.re - eps_inv_mean.m12.re) > SMALL || - fabs(eps_mean_inv.m01.im - eps_inv_mean.m01.im) > SMALL || - fabs(eps_mean_inv.m02.im - eps_inv_mean.m02.im) > SMALL || - fabs(eps_mean_inv.m12.im - eps_inv_mean.m12.im) > SMALL; + means_different_p = means_different_p || + fabs(eps_mean_inv.m01.re - eps_inv_mean.m01.re) > SMALL || + fabs(eps_mean_inv.m02.re - eps_inv_mean.m02.re) > SMALL || + fabs(eps_mean_inv.m12.re - eps_inv_mean.m12.re) > SMALL || + fabs(eps_mean_inv.m01.im - eps_inv_mean.m01.im) > SMALL || + fabs(eps_mean_inv.m02.im - eps_inv_mean.m02.im) > SMALL || + fabs(eps_mean_inv.m12.im - eps_inv_mean.m12.im) > SMALL; #else - means_different_p = means_different_p || - fabs(eps_mean_inv.m01 - eps_inv_mean.m01) > SMALL || - fabs(eps_mean_inv.m02 - eps_inv_mean.m02) > SMALL || - fabs(eps_mean_inv.m12 - eps_inv_mean.m12) > SMALL; + means_different_p = means_different_p || fabs(eps_mean_inv.m01 - eps_inv_mean.m01) > SMALL || + fabs(eps_mean_inv.m02 - eps_inv_mean.m02) > SMALL || + fabs(eps_mean_inv.m12 - eps_inv_mean.m12) > SMALL; #endif - } - - /* if the two averaging methods yielded different results, - which usually happens if epsilon is not constant, then - we need to find the normal vector to the dielectric interface: */ - if (means_different_p) { - real moment0 = 0, moment1 = 0, moment2 = 0; - - for (mi = 0; mi < size_moment_mesh; ++mi) { - real r[3], eps_trace; - symmetric_matrix eps, eps_inv; - r[0] = i1 * s1 + moment_mesh[mi][0]; - r[1] = i2 * s2 + moment_mesh[mi][1]; - r[2] = i3 * s3 + moment_mesh[mi][2]; - epsilon(&eps, &eps_inv, r, epsilon_data); - eps_trace = eps.m00 + eps.m11 + eps.m22; - eps_trace *= moment_mesh_weights[mi]; - moment0 += eps_trace * moment_mesh[mi][0]; - moment1 += eps_trace * moment_mesh[mi][1]; - moment2 += eps_trace * moment_mesh[mi][2]; - } - - /* need to convert moment from lattice to cartesian coords: */ - norm0 = R[0][0]*moment0 + R[1][0]*moment1 + R[2][0]*moment2; - norm1 = R[0][1]*moment0 + R[1][1]*moment1 + R[2][1]*moment2; - norm2 = R[0][2]*moment0 + R[1][2]*moment1 + R[2][2]*moment2; - - got_mean: - - norm_len = sqrt(norm0*norm0 + norm1*norm1 + norm2*norm2); - } - - if (means_different_p && norm_len > SMALL) { - real x0, x1, x2; - - norm_len = 1.0/norm_len; - norm0 *= norm_len; - norm1 *= norm_len; - norm2 *= norm_len; - - /* Compute the effective inverse dielectric tensor. - We define this as: - 1/2 ( {eps_inv_mean, P} + {eps_mean_inv, 1-P} ) - where P is the projection matrix onto the normal direction - (P = norm ^ norm), and {a,b} is the anti-commutator ab+ba. - = 1/2 {eps_inv_mean - eps_mean_inv, P} + eps_mean_inv - = 1/2 (n_i conj(x_j) + x_i n_j) + (eps_mean_inv)_ij - where n_k is the kth component of the normal vector and - x_i = (eps_inv_mean - eps_mean_inv)_ik n_k - Note the implied summations (Einstein notation). - - Note that the resulting matrix is symmetric, and we get just - eps_inv_mean if eps_inv_mean == eps_mean_inv, as desired. - - Note that P is idempotent, so for scalar epsilon this - is just eps_inv_mean * P + eps_mean_inv * (1-P) - = (1/eps_inv_mean * P + eps_mean * (1-P)) ^ (-1), - which corresponds to the expression in the Meade paper. */ - - x0 = (eps_inv_mean.m00 - eps_mean_inv.m00) * norm0; - x1 = (eps_inv_mean.m11 - eps_mean_inv.m11) * norm1; - x2 = (eps_inv_mean.m22 - eps_mean_inv.m22) * norm2; - if (diag_eps_p) { + } + + /* if the two averaging methods yielded different results, + which usually happens if epsilon is not constant, then + we need to find the normal vector to the dielectric interface: */ + if (means_different_p) { + real moment0 = 0, moment1 = 0, moment2 = 0; + + for (mi = 0; mi < size_moment_mesh; ++mi) { + real r[3], eps_trace; + symmetric_matrix eps, eps_inv; + r[0] = i1 * s1 + moment_mesh[mi][0]; + r[1] = i2 * s2 + moment_mesh[mi][1]; + r[2] = i3 * s3 + moment_mesh[mi][2]; + epsilon(&eps, &eps_inv, r, epsilon_data); + eps_trace = eps.m00 + eps.m11 + eps.m22; + eps_trace *= moment_mesh_weights[mi]; + moment0 += eps_trace * moment_mesh[mi][0]; + moment1 += eps_trace * moment_mesh[mi][1]; + moment2 += eps_trace * moment_mesh[mi][2]; + } + + /* need to convert moment from lattice to cartesian coords: */ + norm0 = R[0][0] * moment0 + R[1][0] * moment1 + R[2][0] * moment2; + norm1 = R[0][1] * moment0 + R[1][1] * moment1 + R[2][1] * moment2; + norm2 = R[0][2] * moment0 + R[1][2] * moment1 + R[2][2] * moment2; + + got_mean: + + norm_len = sqrt(norm0 * norm0 + norm1 * norm1 + norm2 * norm2); + } + + if (means_different_p && norm_len > SMALL) { + real x0, x1, x2; + + norm_len = 1.0 / norm_len; + norm0 *= norm_len; + norm1 *= norm_len; + norm2 *= norm_len; + + /* Compute the effective inverse dielectric tensor. + We define this as: + 1/2 ( {eps_inv_mean, P} + {eps_mean_inv, 1-P} ) + where P is the projection matrix onto the normal direction + (P = norm ^ norm), and {a,b} is the anti-commutator ab+ba. + = 1/2 {eps_inv_mean - eps_mean_inv, P} + eps_mean_inv + = 1/2 (n_i conj(x_j) + x_i n_j) + (eps_mean_inv)_ij + where n_k is the kth component of the normal vector and + x_i = (eps_inv_mean - eps_mean_inv)_ik n_k + Note the implied summations (Einstein notation). + + Note that the resulting matrix is symmetric, and we get just + eps_inv_mean if eps_inv_mean == eps_mean_inv, as desired. + + Note that P is idempotent, so for scalar epsilon this + is just eps_inv_mean * P + eps_mean_inv * (1-P) + = (1/eps_inv_mean * P + eps_mean * (1-P)) ^ (-1), + which corresponds to the expression in the Meade paper. */ + + x0 = (eps_inv_mean.m00 - eps_mean_inv.m00) * norm0; + x1 = (eps_inv_mean.m11 - eps_mean_inv.m11) * norm1; + x2 = (eps_inv_mean.m22 - eps_mean_inv.m22) * norm2; + if (diag_eps_p) { #ifdef WITH_HERMITIAN_EPSILON - md->eps_inv[xyz_index].m01.re = 0.5*(x0*norm1 + x1*norm0); - md->eps_inv[xyz_index].m01.im = 0.0; - md->eps_inv[xyz_index].m02.re = 0.5*(x0*norm2 + x2*norm0); - md->eps_inv[xyz_index].m02.im = 0.0; - md->eps_inv[xyz_index].m12.re = 0.5*(x1*norm2 + x2*norm1); - md->eps_inv[xyz_index].m12.im = 0.0; + md->eps_inv[xyz_index].m01.re = 0.5 * (x0 * norm1 + x1 * norm0); + md->eps_inv[xyz_index].m01.im = 0.0; + md->eps_inv[xyz_index].m02.re = 0.5 * (x0 * norm2 + x2 * norm0); + md->eps_inv[xyz_index].m02.im = 0.0; + md->eps_inv[xyz_index].m12.re = 0.5 * (x1 * norm2 + x2 * norm1); + md->eps_inv[xyz_index].m12.im = 0.0; #else - md->eps_inv[xyz_index].m01 = 0.5*(x0*norm1 + x1*norm0); - md->eps_inv[xyz_index].m02 = 0.5*(x0*norm2 + x2*norm0); - md->eps_inv[xyz_index].m12 = 0.5*(x1*norm2 + x2*norm1); + md->eps_inv[xyz_index].m01 = 0.5 * (x0 * norm1 + x1 * norm0); + md->eps_inv[xyz_index].m02 = 0.5 * (x0 * norm2 + x2 * norm0); + md->eps_inv[xyz_index].m12 = 0.5 * (x1 * norm2 + x2 * norm1); #endif - } - else { + } + else { #ifdef WITH_HERMITIAN_EPSILON - real x0i, x1i, x2i; - x0 += ((eps_inv_mean.m01.re - eps_mean_inv.m01.re)*norm1 + - (eps_inv_mean.m02.re - eps_mean_inv.m02.re)*norm2); - x1 += ((eps_inv_mean.m01.re - eps_mean_inv.m01.re)*norm0 + - (eps_inv_mean.m12.re - eps_mean_inv.m12.re)*norm2); - x2 += ((eps_inv_mean.m02.re - eps_mean_inv.m02.re)*norm0 + - (eps_inv_mean.m12.re - eps_mean_inv.m12.re)*norm1); - x0i = ((eps_inv_mean.m01.im - eps_mean_inv.m01.im)*norm1 + - (eps_inv_mean.m02.im - eps_mean_inv.m02.im)*norm2); - x1i = (-(eps_inv_mean.m01.im - eps_mean_inv.m01.im)*norm0+ - (eps_inv_mean.m12.im - eps_mean_inv.m12.im)*norm2); - x2i = -((eps_inv_mean.m02.im - eps_mean_inv.m02.im)*norm0 + - (eps_inv_mean.m12.im - eps_mean_inv.m12.im)*norm1); - - md->eps_inv[xyz_index].m01.re = (0.5*(x0*norm1 + x1*norm0) - + eps_mean_inv.m01.re); - md->eps_inv[xyz_index].m02.re = (0.5*(x0*norm2 + x2*norm0) - + eps_mean_inv.m02.re); - md->eps_inv[xyz_index].m12.re = (0.5*(x1*norm2 + x2*norm1) - + eps_mean_inv.m12.re); - md->eps_inv[xyz_index].m01.im = (0.5*(x0i*norm1-x1i*norm0) - + eps_mean_inv.m01.im); - md->eps_inv[xyz_index].m02.im = (0.5*(x0i*norm2-x2i*norm0) - + eps_mean_inv.m02.im); - md->eps_inv[xyz_index].m12.im = (0.5*(x1i*norm2-x2i*norm1) - + eps_mean_inv.m12.im); + real x0i, x1i, x2i; + x0 += ((eps_inv_mean.m01.re - eps_mean_inv.m01.re) * norm1 + + (eps_inv_mean.m02.re - eps_mean_inv.m02.re) * norm2); + x1 += ((eps_inv_mean.m01.re - eps_mean_inv.m01.re) * norm0 + + (eps_inv_mean.m12.re - eps_mean_inv.m12.re) * norm2); + x2 += ((eps_inv_mean.m02.re - eps_mean_inv.m02.re) * norm0 + + (eps_inv_mean.m12.re - eps_mean_inv.m12.re) * norm1); + x0i = ((eps_inv_mean.m01.im - eps_mean_inv.m01.im) * norm1 + + (eps_inv_mean.m02.im - eps_mean_inv.m02.im) * norm2); + x1i = (-(eps_inv_mean.m01.im - eps_mean_inv.m01.im) * norm0 + + (eps_inv_mean.m12.im - eps_mean_inv.m12.im) * norm2); + x2i = -((eps_inv_mean.m02.im - eps_mean_inv.m02.im) * norm0 + + (eps_inv_mean.m12.im - eps_mean_inv.m12.im) * norm1); + + md->eps_inv[xyz_index].m01.re = (0.5 * (x0 * norm1 + x1 * norm0) + eps_mean_inv.m01.re); + md->eps_inv[xyz_index].m02.re = (0.5 * (x0 * norm2 + x2 * norm0) + eps_mean_inv.m02.re); + md->eps_inv[xyz_index].m12.re = (0.5 * (x1 * norm2 + x2 * norm1) + eps_mean_inv.m12.re); + md->eps_inv[xyz_index].m01.im = (0.5 * (x0i * norm1 - x1i * norm0) + eps_mean_inv.m01.im); + md->eps_inv[xyz_index].m02.im = (0.5 * (x0i * norm2 - x2i * norm0) + eps_mean_inv.m02.im); + md->eps_inv[xyz_index].m12.im = (0.5 * (x1i * norm2 - x2i * norm1) + eps_mean_inv.m12.im); #else - x0 += ((eps_inv_mean.m01 - eps_mean_inv.m01) * norm1 + - (eps_inv_mean.m02 - eps_mean_inv.m02) * norm2); - x1 += ((eps_inv_mean.m01 - eps_mean_inv.m01) * norm0 + - (eps_inv_mean.m12 - eps_mean_inv.m12) * norm2); - x2 += ((eps_inv_mean.m02 - eps_mean_inv.m02) * norm0 + - (eps_inv_mean.m12 - eps_mean_inv.m12) * norm1); - - md->eps_inv[xyz_index].m01 = (0.5*(x0*norm1 + x1*norm0) - + eps_mean_inv.m01); - md->eps_inv[xyz_index].m02 = (0.5*(x0*norm2 + x2*norm0) - + eps_mean_inv.m02); - md->eps_inv[xyz_index].m12 = (0.5*(x1*norm2 + x2*norm1) - + eps_mean_inv.m12); + x0 += ((eps_inv_mean.m01 - eps_mean_inv.m01) * norm1 + + (eps_inv_mean.m02 - eps_mean_inv.m02) * norm2); + x1 += ((eps_inv_mean.m01 - eps_mean_inv.m01) * norm0 + + (eps_inv_mean.m12 - eps_mean_inv.m12) * norm2); + x2 += ((eps_inv_mean.m02 - eps_mean_inv.m02) * norm0 + + (eps_inv_mean.m12 - eps_mean_inv.m12) * norm1); + + md->eps_inv[xyz_index].m01 = (0.5 * (x0 * norm1 + x1 * norm0) + eps_mean_inv.m01); + md->eps_inv[xyz_index].m02 = (0.5 * (x0 * norm2 + x2 * norm0) + eps_mean_inv.m02); + md->eps_inv[xyz_index].m12 = (0.5 * (x1 * norm2 + x2 * norm1) + eps_mean_inv.m12); #endif - } - md->eps_inv[xyz_index].m00 = x0*norm0 + eps_mean_inv.m00; - md->eps_inv[xyz_index].m11 = x1*norm1 + eps_mean_inv.m11; - md->eps_inv[xyz_index].m22 = x2*norm2 + eps_mean_inv.m22; - } - else { /* undetermined normal vector and/or constant eps */ - md->eps_inv[xyz_index] = eps_mean_inv; - } - got_eps_inv: - - eps_inv_total += (md->eps_inv[xyz_index].m00 + - md->eps_inv[xyz_index].m11 + - md->eps_inv[xyz_index].m22); - }}} /* end of loop body */ - - mpi_allreduce_1(&eps_inv_total, real, SCALAR_MPI_TYPE, - MPI_SUM, mpb_comm); - n1 = md->fft_output_size; - mpi_allreduce_1(&n1, int, MPI_INT, MPI_SUM, mpb_comm); - md->eps_inv_mean = eps_inv_total / (3 * n1); + } + md->eps_inv[xyz_index].m00 = x0 * norm0 + eps_mean_inv.m00; + md->eps_inv[xyz_index].m11 = x1 * norm1 + eps_mean_inv.m11; + md->eps_inv[xyz_index].m22 = x2 * norm2 + eps_mean_inv.m22; + } + else { /* undetermined normal vector and/or constant eps */ + md->eps_inv[xyz_index] = eps_mean_inv; + } + got_eps_inv: + + eps_inv_total += + (md->eps_inv[xyz_index].m00 + md->eps_inv[xyz_index].m11 + md->eps_inv[xyz_index].m22); + } +} +} /* end of loop body */ + +mpi_allreduce_1(&eps_inv_total, real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm); +n1 = md->fft_output_size; +mpi_allreduce_1(&n1, int, MPI_INT, MPI_SUM, mpb_comm); +md->eps_inv_mean = eps_inv_total / (3 * n1); } -void set_maxwell_mu(maxwell_data *md, - const int mesh_size[3], - real R[3][3], real G[3][3], - maxwell_dielectric_function mu, - maxwell_dielectric_mean_function mmu, +void set_maxwell_mu(maxwell_data *md, const int mesh_size[3], real R[3][3], real G[3][3], + maxwell_dielectric_function mu, maxwell_dielectric_mean_function mmu, void *mu_data) { - symmetric_matrix *eps_inv = md->eps_inv; - real eps_inv_mean = md->eps_inv_mean; - if (md->mu_inv == NULL) { - CHK_MALLOC(md->mu_inv, symmetric_matrix, md->fft_output_size); - } - /* just re-use code to set epsilon, but initialize mu_inv instead */ - md->eps_inv = md->mu_inv; - set_maxwell_dielectric(md, mesh_size, R, G, mu, mmu, mu_data); - md->eps_inv = eps_inv; - md->mu_inv_mean = md->eps_inv_mean; - md->eps_inv_mean = eps_inv_mean; + symmetric_matrix *eps_inv = md->eps_inv; + real eps_inv_mean = md->eps_inv_mean; + if (md->mu_inv == NULL) { CHK_MALLOC(md->mu_inv, symmetric_matrix, md->fft_output_size); } + /* just re-use code to set epsilon, but initialize mu_inv instead */ + md->eps_inv = md->mu_inv; + set_maxwell_dielectric(md, mesh_size, R, G, mu, mmu, mu_data); + md->eps_inv = eps_inv; + md->mu_inv_mean = md->eps_inv_mean; + md->eps_inv_mean = eps_inv_mean; } diff --git a/src/maxwell/maxwell_op.c b/src/maxwell/maxwell_op.c index bd8bb762..a7fe861d 100644 --- a/src/maxwell/maxwell_op.c +++ b/src/maxwell/maxwell_op.c @@ -24,318 +24,264 @@ /**************************************************************************/ -/* assign a = v going from transverse to cartesian coordinates. +/* assign a = v going from transverse to cartesian coordinates. Here, a = (a[0],a[1],a[2]) is in cartesian coordinates. (v[0],v[vstride]) is in the transverse basis of k.m and k.n. */ -static void assign_t2c(scalar *a, const k_data k, - const scalar *v, int vstride) -{ - scalar v0 = v[0], v1 = v[vstride]; - - ASSIGN_SCALAR(a[0], - SCALAR_RE(v0)*k.mx + SCALAR_RE(v1)*k.nx, - SCALAR_IM(v0)*k.mx + SCALAR_IM(v1)*k.nx); - ASSIGN_SCALAR(a[1], - SCALAR_RE(v0)*k.my + SCALAR_RE(v1)*k.ny, - SCALAR_IM(v0)*k.my + SCALAR_IM(v1)*k.ny); - ASSIGN_SCALAR(a[2], - SCALAR_RE(v0)*k.mz + SCALAR_RE(v1)*k.nz, - SCALAR_IM(v0)*k.mz + SCALAR_IM(v1)*k.nz); +static void assign_t2c(scalar *a, const k_data k, const scalar *v, int vstride) { + scalar v0 = v[0], v1 = v[vstride]; + + ASSIGN_SCALAR(a[0], SCALAR_RE(v0) * k.mx + SCALAR_RE(v1) * k.nx, + SCALAR_IM(v0) * k.mx + SCALAR_IM(v1) * k.nx); + ASSIGN_SCALAR(a[1], SCALAR_RE(v0) * k.my + SCALAR_RE(v1) * k.ny, + SCALAR_IM(v0) * k.my + SCALAR_IM(v1) * k.ny); + ASSIGN_SCALAR(a[2], SCALAR_RE(v0) * k.mz + SCALAR_RE(v1) * k.nz, + SCALAR_IM(v0) * k.mz + SCALAR_IM(v1) * k.nz); } /* project from cartesian to transverse coordinates (inverse of assign_t2c) */ -static void project_c2t(scalar *v, int vstride, const k_data k, - const scalar *a, real scale) -{ - real ax_r=SCALAR_RE(a[0]), ay_r=SCALAR_RE(a[1]), az_r=SCALAR_RE(a[2]); - real ax_i=SCALAR_IM(a[0]), ay_i=SCALAR_IM(a[1]), az_i=SCALAR_IM(a[2]); - ASSIGN_SCALAR(v[0], (ax_r*k.mx + ay_r*k.my + az_r*k.mz) * scale, - (ax_i*k.mx + ay_i*k.my +az_i*k.mz) * scale); - ASSIGN_SCALAR(v[vstride], (ax_r*k.nx + ay_r*k.ny + az_r*k.nz) * scale, - (ax_i*k.nx + ay_i*k.ny +az_i*k.nz) * scale); +static void project_c2t(scalar *v, int vstride, const k_data k, const scalar *a, real scale) { + real ax_r = SCALAR_RE(a[0]), ay_r = SCALAR_RE(a[1]), az_r = SCALAR_RE(a[2]); + real ax_i = SCALAR_IM(a[0]), ay_i = SCALAR_IM(a[1]), az_i = SCALAR_IM(a[2]); + ASSIGN_SCALAR(v[0], (ax_r * k.mx + ay_r * k.my + az_r * k.mz) * scale, + (ax_i * k.mx + ay_i * k.my + az_i * k.mz) * scale); + ASSIGN_SCALAR(v[vstride], (ax_r * k.nx + ay_r * k.ny + az_r * k.nz) * scale, + (ax_i * k.nx + ay_i * k.ny + az_i * k.nz) * scale); } /* assign a = k x v (cross product), going from transverse to cartesian coordinates. - + Here, a = (a[0],a[1],a[2]) and k = (k.kx,k.ky,k.kz) are in cartesian coordinates. (v[0],v[vstride]) is in the transverse basis of k.m and k.n. */ -static void assign_cross_t2c(scalar *a, const k_data k, - const scalar *v, int vstride) -{ - scalar v0 = v[0], v1 = v[vstride]; - - /* Note that k x m = |k| n, k x n = - |k| m. Therefore, - k x v = k x (v0 m + v1 n) = (v0 n - v1 m) * |k|. */ - - ASSIGN_SCALAR(a[0], - (SCALAR_RE(v0)*k.nx - SCALAR_RE(v1)*k.mx) * k.kmag, - (SCALAR_IM(v0)*k.nx - SCALAR_IM(v1)*k.mx) * k.kmag); - ASSIGN_SCALAR(a[1], - (SCALAR_RE(v0)*k.ny - SCALAR_RE(v1)*k.my) * k.kmag, - (SCALAR_IM(v0)*k.ny - SCALAR_IM(v1)*k.my) * k.kmag); - ASSIGN_SCALAR(a[2], - (SCALAR_RE(v0)*k.nz - SCALAR_RE(v1)*k.mz) * k.kmag, - (SCALAR_IM(v0)*k.nz - SCALAR_IM(v1)*k.mz) * k.kmag); +static void assign_cross_t2c(scalar *a, const k_data k, const scalar *v, int vstride) { + scalar v0 = v[0], v1 = v[vstride]; + + /* Note that k x m = |k| n, k x n = - |k| m. Therefore, + k x v = k x (v0 m + v1 n) = (v0 n - v1 m) * |k|. */ + + ASSIGN_SCALAR(a[0], (SCALAR_RE(v0) * k.nx - SCALAR_RE(v1) * k.mx) * k.kmag, + (SCALAR_IM(v0) * k.nx - SCALAR_IM(v1) * k.mx) * k.kmag); + ASSIGN_SCALAR(a[1], (SCALAR_RE(v0) * k.ny - SCALAR_RE(v1) * k.my) * k.kmag, + (SCALAR_IM(v0) * k.ny - SCALAR_IM(v1) * k.my) * k.kmag); + ASSIGN_SCALAR(a[2], (SCALAR_RE(v0) * k.nz - SCALAR_RE(v1) * k.mz) * k.kmag, + (SCALAR_IM(v0) * k.nz - SCALAR_IM(v1) * k.mz) * k.kmag); #ifdef DEBUG - { - real num; - num = SCALAR_NORMSQR(a[0])+SCALAR_NORMSQR(a[1])+SCALAR_NORMSQR(a[2]); - CHECK(!BADNUM(num), "yikes, crazy number!"); - } + { + real num; + num = SCALAR_NORMSQR(a[0]) + SCALAR_NORMSQR(a[1]) + SCALAR_NORMSQR(a[2]); + CHECK(!BADNUM(num), "yikes, crazy number!"); + } #endif } /* assign v = scale * k x a (cross product), going from cartesian to transverse coordinates. - + Here, a = (a[0],a[1],a[2]) and k = (k.kx,k.ky,k.kz) are in cartesian coordinates. (v[0],v[vstride]) is in the transverse basis of k.m and k.n. */ -static void assign_cross_c2t(scalar *v, int vstride, - const k_data k, const scalar *a, - real scale) -{ - scalar a0 = a[0], a1 = a[1], a2 = a[2]; - scalar at0, at1; - - /* First, compute at0 = a*m and at1 = a*n. (Components of a that - are parallel to k are killed anyway by the cross product.) */ - - ASSIGN_SCALAR(at0, - SCALAR_RE(a0)*k.mx + SCALAR_RE(a1)*k.my + SCALAR_RE(a2)*k.mz, - SCALAR_IM(a0)*k.mx + SCALAR_IM(a1)*k.my + SCALAR_IM(a2)*k.mz); - ASSIGN_SCALAR(at1, - SCALAR_RE(a0)*k.nx + SCALAR_RE(a1)*k.ny + SCALAR_RE(a2)*k.nz, - SCALAR_IM(a0)*k.nx + SCALAR_IM(a1)*k.ny + SCALAR_IM(a2)*k.nz); - - /* Now, k x a = k x (at0*m + at1*n) = (at0*n - at1*m) * |k|. */ - - scale *= k.kmag; /* combine scale factor and |k|*/ - ASSIGN_SCALAR(v[0], - - scale * SCALAR_RE(at1), - - scale * SCALAR_IM(at1)); - ASSIGN_SCALAR(v[vstride], - scale * SCALAR_RE(at0), - scale * SCALAR_IM(at0)); +static void assign_cross_c2t(scalar *v, int vstride, const k_data k, const scalar *a, real scale) { + scalar a0 = a[0], a1 = a[1], a2 = a[2]; + scalar at0, at1; + + /* First, compute at0 = a*m and at1 = a*n. (Components of a that + are parallel to k are killed anyway by the cross product.) */ + + ASSIGN_SCALAR(at0, SCALAR_RE(a0) * k.mx + SCALAR_RE(a1) * k.my + SCALAR_RE(a2) * k.mz, + SCALAR_IM(a0) * k.mx + SCALAR_IM(a1) * k.my + SCALAR_IM(a2) * k.mz); + ASSIGN_SCALAR(at1, SCALAR_RE(a0) * k.nx + SCALAR_RE(a1) * k.ny + SCALAR_RE(a2) * k.nz, + SCALAR_IM(a0) * k.nx + SCALAR_IM(a1) * k.ny + SCALAR_IM(a2) * k.nz); + + /* Now, k x a = k x (at0*m + at1*n) = (at0*n - at1*m) * |k|. */ + + scale *= k.kmag; /* combine scale factor and |k|*/ + ASSIGN_SCALAR(v[0], -scale * SCALAR_RE(at1), -scale * SCALAR_IM(at1)); + ASSIGN_SCALAR(v[vstride], scale * SCALAR_RE(at0), scale * SCALAR_IM(at0)); #ifdef DEBUG - { - real dummy = SCALAR_NORMSQR(v[0]) + SCALAR_NORMSQR(v[vstride]); - CHECK(!BADNUM(dummy), "yikes, crazy number!"); - } + { + real dummy = SCALAR_NORMSQR(v[0]) + SCALAR_NORMSQR(v[vstride]); + CHECK(!BADNUM(dummy), "yikes, crazy number!"); + } #endif } /* compute a = u x v, where a and u are in cartesian coordinates and v is in transverse coordinates. */ -static void assign_ucross_t2c(scalar *a, const real u[3], const k_data k, - const scalar *v, int vstride) -{ - scalar v0 = v[0], v1 = v[vstride]; - real vx_r, vy_r, vz_r; +static void assign_ucross_t2c(scalar *a, const real u[3], const k_data k, const scalar *v, + int vstride) { + scalar v0 = v[0], v1 = v[vstride]; + real vx_r, vy_r, vz_r; #ifdef SCALAR_COMPLEX - real vx_i, vy_i, vz_i; + real vx_i, vy_i, vz_i; #endif - /* Note that v = (vx,vy,vz) = (v0 m + v1 n). */ + /* Note that v = (vx,vy,vz) = (v0 m + v1 n). */ - vx_r = SCALAR_RE(v0)*k.mx + SCALAR_RE(v1)*k.nx; - vy_r = SCALAR_RE(v0)*k.my + SCALAR_RE(v1)*k.ny; - vz_r = SCALAR_RE(v0)*k.mz + SCALAR_RE(v1)*k.nz; + vx_r = SCALAR_RE(v0) * k.mx + SCALAR_RE(v1) * k.nx; + vy_r = SCALAR_RE(v0) * k.my + SCALAR_RE(v1) * k.ny; + vz_r = SCALAR_RE(v0) * k.mz + SCALAR_RE(v1) * k.nz; #ifdef SCALAR_COMPLEX - vx_i = SCALAR_IM(v0)*k.mx + SCALAR_IM(v1)*k.nx; - vy_i = SCALAR_IM(v0)*k.my + SCALAR_IM(v1)*k.ny; - vz_i = SCALAR_IM(v0)*k.mz + SCALAR_IM(v1)*k.nz; + vx_i = SCALAR_IM(v0) * k.mx + SCALAR_IM(v1) * k.nx; + vy_i = SCALAR_IM(v0) * k.my + SCALAR_IM(v1) * k.ny; + vz_i = SCALAR_IM(v0) * k.mz + SCALAR_IM(v1) * k.nz; #endif - ASSIGN_SCALAR(a[0], - u[1] * vz_r - u[2] * vy_r, - u[1] * vz_i - u[2] * vy_i); - ASSIGN_SCALAR(a[1], - u[2] * vx_r - u[0] * vz_r, - u[2] * vx_i - u[0] * vz_i); - ASSIGN_SCALAR(a[2], - u[0] * vy_r - u[1] * vx_r, - u[0] * vy_i - u[1] * vx_i); + ASSIGN_SCALAR(a[0], u[1] * vz_r - u[2] * vy_r, u[1] * vz_i - u[2] * vy_i); + ASSIGN_SCALAR(a[1], u[2] * vx_r - u[0] * vz_r, u[2] * vx_i - u[0] * vz_i); + ASSIGN_SCALAR(a[2], u[0] * vy_r - u[1] * vx_r, u[0] * vy_i - u[1] * vx_i); } /**************************************************************************/ -void maxwell_compute_fft(int dir, maxwell_data *d, - scalar *array_in, scalar *array_out, - int howmany, int stride, int dist) -{ +void maxwell_compute_fft(int dir, maxwell_data *d, scalar *array_in, scalar *array_out, int howmany, + int stride, int dist) { #if defined(HAVE_FFTW3) - FFTW(plan) plan, iplan; - FFTW(complex) *carray_in = (FFTW(complex) *) array_in; - real *rarray_in = (real *) array_in; - FFTW(complex) *carray_out = (FFTW(complex) *) array_out; - real *rarray_out = (real *) array_out; - int ip; - for (ip = 0; ip < d->nplans && (howmany != d->plans_howmany[ip] || - stride != d->plans_stride[ip] || - dist != d->plans_dist[ip]); ++ip); - if (ip < d->nplans) { - plan = (FFTW(plan)) d->plans[ip]; - iplan = (FFTW(plan)) d->iplans[ip]; - } - else { /* create new plans */ - ptrdiff_t np[3]; - int n[3]; np[0]=n[0]=d->nx; np[1]=n[1]=d->ny; np[2]=n[2]=d->nz; -# ifdef SCALAR_COMPLEX -# ifdef HAVE_MPI - CHECK(stride==howmany && dist==1, "bug: unsupported stride/dist"); - plan = FFTW(mpi_plan_many_dft)(3, np, howmany, - FFTW_MPI_DEFAULT_BLOCK, - FFTW_MPI_DEFAULT_BLOCK, - carray_in, carray_out, - mpb_comm, FFTW_BACKWARD, - FFTW_ESTIMATE - | FFTW_MPI_TRANSPOSED_IN); - iplan = FFTW(mpi_plan_many_dft)(3, np, howmany, - FFTW_MPI_DEFAULT_BLOCK, - FFTW_MPI_DEFAULT_BLOCK, - carray_in, carray_out, - mpb_comm, FFTW_FORWARD, - FFTW_ESTIMATE - | FFTW_MPI_TRANSPOSED_OUT); -# else /* !HAVE_MPI */ - plan = FFTW(plan_many_dft)(3, n, howmany, carray_in, 0, stride, dist, - carray_out, 0, stride, dist, - FFTW_BACKWARD, FFTW_ESTIMATE); - iplan = FFTW(plan_many_dft)(3, n, howmany, carray_in,0,stride, dist, - carray_out, 0, stride, dist, - FFTW_FORWARD, FFTW_ESTIMATE); -# endif /* !HAVE_MPI */ -# else /* !SCALAR_COMPLEX */ - { - int rnk = n[2] != 1 ? 3 : (n[1] != 1 ? 2 : 1); - int nr[3]; nr[0] = n[0]; nr[1] = n[1]; nr[2] = n[2]; - nr[rnk-1] = 2*(nr[rnk-1]/2 + 1); -# ifdef HAVE_MPI - CHECK(stride==howmany && dist==1, "bug: unsupported stride/dist"); - plan = FFTW(mpi_plan_many_dft_c2r)(rnk, np, howmany, - FFTW_MPI_DEFAULT_BLOCK, - FFTW_MPI_DEFAULT_BLOCK, - carray_in, rarray_out, - mpb_comm, FFTW_ESTIMATE - | FFTW_MPI_TRANSPOSED_IN); - iplan = FFTW(mpi_plan_many_dft_r2c)(rnk, np, howmany, - FFTW_MPI_DEFAULT_BLOCK, - FFTW_MPI_DEFAULT_BLOCK, - rarray_in, carray_out, - mpb_comm, FFTW_ESTIMATE - | FFTW_MPI_TRANSPOSED_OUT); -# else /* !HAVE_MPI */ - plan = FFTW(plan_many_dft_c2r)(rnk, n, howmany, - carray_in, 0, stride, dist, - rarray_out, nr, stride, dist, - FFTW_ESTIMATE); - iplan = FFTW(plan_many_dft_r2c)(rnk, n, howmany, - rarray_in, nr, stride, dist, - carray_out, 0, stride, dist, - FFTW_ESTIMATE); -# endif /* !HAVE_MPI */ - } -# endif /* !SCALAR_COMPLEX */ - CHECK(plan && iplan, "Failure creating FFTW3 plans"); - } - - /* note that the new-array execute functions should be safe - since we only apply maxwell_compute_fft to fftw_malloc'ed data - (so we don't ever have misaligned arrays), and we check above - that the strides etc. match */ -# ifdef SCALAR_COMPLEX -# ifdef HAVE_MPI - FFTW(mpi_execute_dft)(dir < 0 ? plan : iplan, carray_in, carray_out); -# else /* !HAVE_MPI */ - FFTW(execute_dft)(dir < 0 ? plan : iplan, carray_in, carray_out); -# endif /* !HAVE_MPI */ -# else -# ifdef HAVE_MPI - if (dir > 0) - FFTW(mpi_execute_dft_r2c)(iplan, rarray_in, carray_out); - else - FFTW(mpi_execute_dft_c2r)(plan, carray_in, rarray_out); -# else /* !HAVE_MPI */ - if (dir > 0) - FFTW(execute_dft_r2c)(iplan, rarray_in, carray_out); - else - FFTW(execute_dft_c2r)(plan, carray_in, rarray_out); -# endif /* !HAVE_MPI */ -# endif - - if (ip == MAX_NPLANS) { /* don't store too many plans */ - FFTW(destroy_plan)(plan); - FFTW(destroy_plan)(iplan); - } - else if (ip == d->nplans) { /* save for later re-use */ - d->plans[ip] = plan; - d->iplans[ip] = iplan; - d->plans_howmany[ip] = howmany; - d->plans_stride[ip] = stride; - d->plans_dist[ip] = dist; - d->nplans++; - } + FFTW(plan) plan, iplan; + FFTW(complex) *carray_in = (FFTW(complex) *)array_in; + real *rarray_in = (real *)array_in; + FFTW(complex) *carray_out = (FFTW(complex) *)array_out; + real *rarray_out = (real *)array_out; + int ip; + for (ip = 0; ip < d->nplans && (howmany != d->plans_howmany[ip] || + stride != d->plans_stride[ip] || dist != d->plans_dist[ip]); + ++ip) + ; + if (ip < d->nplans) { + plan = (FFTW(plan))d->plans[ip]; + iplan = (FFTW(plan))d->iplans[ip]; + } + else { /* create new plans */ + ptrdiff_t np[3]; + int n[3]; + np[0] = n[0] = d->nx; + np[1] = n[1] = d->ny; + np[2] = n[2] = d->nz; +#ifdef SCALAR_COMPLEX +#ifdef HAVE_MPI + CHECK(stride == howmany && dist == 1, "bug: unsupported stride/dist"); + plan = FFTW(mpi_plan_many_dft)(3, np, howmany, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, + carray_in, carray_out, mpb_comm, FFTW_BACKWARD, + FFTW_ESTIMATE | FFTW_MPI_TRANSPOSED_IN); + iplan = FFTW(mpi_plan_many_dft)(3, np, howmany, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, + carray_in, carray_out, mpb_comm, FFTW_FORWARD, + FFTW_ESTIMATE | FFTW_MPI_TRANSPOSED_OUT); +#else /* !HAVE_MPI */ + plan = FFTW(plan_many_dft)(3, n, howmany, carray_in, 0, stride, dist, carray_out, 0, stride, + dist, FFTW_BACKWARD, FFTW_ESTIMATE); + iplan = FFTW(plan_many_dft)(3, n, howmany, carray_in, 0, stride, dist, carray_out, 0, stride, + dist, FFTW_FORWARD, FFTW_ESTIMATE); +#endif /* !HAVE_MPI */ +#else /* !SCALAR_COMPLEX */ + { + int rnk = n[2] != 1 ? 3 : (n[1] != 1 ? 2 : 1); + int nr[3]; + nr[0] = n[0]; + nr[1] = n[1]; + nr[2] = n[2]; + nr[rnk - 1] = 2 * (nr[rnk - 1] / 2 + 1); +#ifdef HAVE_MPI + CHECK(stride == howmany && dist == 1, "bug: unsupported stride/dist"); + plan = FFTW(mpi_plan_many_dft_c2r)(rnk, np, howmany, FFTW_MPI_DEFAULT_BLOCK, + FFTW_MPI_DEFAULT_BLOCK, carray_in, rarray_out, mpb_comm, + FFTW_ESTIMATE | FFTW_MPI_TRANSPOSED_IN); + iplan = FFTW(mpi_plan_many_dft_r2c)(rnk, np, howmany, FFTW_MPI_DEFAULT_BLOCK, + FFTW_MPI_DEFAULT_BLOCK, rarray_in, carray_out, mpb_comm, + FFTW_ESTIMATE | FFTW_MPI_TRANSPOSED_OUT); +#else /* !HAVE_MPI */ + plan = FFTW(plan_many_dft_c2r)(rnk, n, howmany, carray_in, 0, stride, dist, rarray_out, nr, + stride, dist, FFTW_ESTIMATE); + iplan = FFTW(plan_many_dft_r2c)(rnk, n, howmany, rarray_in, nr, stride, dist, carray_out, 0, + stride, dist, FFTW_ESTIMATE); +#endif /* !HAVE_MPI */ + } +#endif /* !SCALAR_COMPLEX */ + CHECK(plan && iplan, "Failure creating FFTW3 plans"); + } + + /* note that the new-array execute functions should be safe + since we only apply maxwell_compute_fft to fftw_malloc'ed data + (so we don't ever have misaligned arrays), and we check above + that the strides etc. match */ +#ifdef SCALAR_COMPLEX +#ifdef HAVE_MPI + FFTW(mpi_execute_dft)(dir < 0 ? plan : iplan, carray_in, carray_out); +#else /* !HAVE_MPI */ + FFTW(execute_dft)(dir < 0 ? plan : iplan, carray_in, carray_out); +#endif /* !HAVE_MPI */ +#else +#ifdef HAVE_MPI + if (dir > 0) + FFTW(mpi_execute_dft_r2c)(iplan, rarray_in, carray_out); + else + FFTW(mpi_execute_dft_c2r)(plan, carray_in, rarray_out); +#else /* !HAVE_MPI */ + if (dir > 0) + FFTW(execute_dft_r2c)(iplan, rarray_in, carray_out); + else + FFTW(execute_dft_c2r)(plan, carray_in, rarray_out); +#endif /* !HAVE_MPI */ +#endif + + if (ip == MAX_NPLANS) { /* don't store too many plans */ + FFTW(destroy_plan)(plan); + FFTW(destroy_plan)(iplan); + } + else if (ip == d->nplans) { /* save for later re-use */ + d->plans[ip] = plan; + d->iplans[ip] = iplan; + d->plans_howmany[ip] = howmany; + d->plans_stride[ip] = stride; + d->plans_dist[ip] = dist; + d->nplans++; + } #elif defined(HAVE_FFTW) - CHECK(array_in == array_out, "only in-place supported with FFTW2"); + CHECK(array_in == array_out, "only in-place supported with FFTW2"); + +#ifdef SCALAR_COMPLEX -# ifdef SCALAR_COMPLEX +#ifndef HAVE_MPI -# ifndef HAVE_MPI + fftwnd((fftplan)(dir < 0 ? d->plans[0] : d->iplans[0]), howmany, (fftw_complex *)array_in, stride, + dist, 0, 0, 0); - fftwnd((fftplan) (dir < 0 ? d->plans[0] : d->iplans[0]), - howmany, - (fftw_complex *) array_in, stride, dist, - 0, 0, 0); +#else /* HAVE_MPI */ -# else /* HAVE_MPI */ + CHECK(stride == howmany && dist == 1, "weird strides and dists don't work with fftwnd_mpi"); - CHECK(stride == howmany && dist == 1, - "weird strides and dists don't work with fftwnd_mpi"); - - fftwnd_mpi((fftplan) (dir < 0 ? d->plans[0] : d->iplans[0]), - howmany, - (fftw_complex *) array_in, (fftw_complex *) NULL, - FFTW_TRANSPOSED_ORDER); + fftwnd_mpi((fftplan)(dir < 0 ? d->plans[0] : d->iplans[0]), howmany, (fftw_complex *)array_in, + (fftw_complex *)NULL, FFTW_TRANSPOSED_ORDER); -# endif /* HAVE_MPI */ +#endif /* HAVE_MPI */ -# else /* not SCALAR_COMPLEX */ +#else /* not SCALAR_COMPLEX */ -# ifndef HAVE_MPI +#ifndef HAVE_MPI - if (dir > 0) - rfftwnd_real_to_complex((fftplan) (d->iplans[0]), - howmany, - (fftw_real *) array_in, stride, dist, - 0, 0, 0); - else - rfftwnd_complex_to_real((fftplan) (d->plans[0]), - howmany, - (fftw_complex *) array_in, stride, dist, - 0, 0, 0); + if (dir > 0) + rfftwnd_real_to_complex((fftplan)(d->iplans[0]), howmany, (fftw_real *)array_in, stride, dist, + 0, 0, 0); + else + rfftwnd_complex_to_real((fftplan)(d->plans[0]), howmany, (fftw_complex *)array_in, stride, dist, + 0, 0, 0); -# else /* HAVE_MPI */ +#else /* HAVE_MPI */ - CHECK(stride == howmany && dist == 1, - "weird strides and dists don't work with rfftwnd_mpi"); - - rfftwnd_mpi((fftplan) (dir < 0 ? d->plans[0] : d->iplans[0]), - howmany, array_in, (scalar *) NULL, - FFTW_TRANSPOSED_ORDER); + CHECK(stride == howmany && dist == 1, "weird strides and dists don't work with rfftwnd_mpi"); -# endif /* HAVE_MPI */ + rfftwnd_mpi((fftplan)(dir < 0 ? d->plans[0] : d->iplans[0]), howmany, array_in, (scalar *)NULL, + FFTW_TRANSPOSED_ORDER); -# endif /* not SCALAR_COMPLEX */ +#endif /* HAVE_MPI */ + +#endif /* not SCALAR_COMPLEX */ #else /* not HAVE_FFTW */ -# error only FFTW ffts are supported +#error only FFTW ffts are supported #endif /* not HAVE_FFTW */ } @@ -343,45 +289,42 @@ void maxwell_compute_fft(int dir, maxwell_data *d, /* assigns newv = matrix * oldv. matrix is symmetric and so is stored in "packed" format. */ -void assign_symmatrix_vector(scalar_complex *newv, - const symmetric_matrix matrix, - const scalar_complex *oldv) -{ - scalar_complex v0 = oldv[0], v1 = oldv[1], v2 = oldv[2]; +void assign_symmatrix_vector(scalar_complex *newv, const symmetric_matrix matrix, + const scalar_complex *oldv) { + scalar_complex v0 = oldv[0], v1 = oldv[1], v2 = oldv[2]; #if defined(WITH_HERMITIAN_EPSILON) - newv[0].re = matrix.m00 * v0.re; - newv[0].im = matrix.m00 * v0.im; - CACCUMULATE_SUM_MULT(newv[0], matrix.m01, v1); - CACCUMULATE_SUM_MULT(newv[0], matrix.m02, v2); - - newv[1].re = matrix.m11 * v1.re; - newv[1].im = matrix.m11 * v1.im; - CACCUMULATE_SUM_CONJ_MULT(newv[1], matrix.m01, v0); - CACCUMULATE_SUM_MULT(newv[1], matrix.m12, v2); - - newv[2].re = matrix.m22 * v2.re; - newv[2].im = matrix.m22 * v2.im; - CACCUMULATE_SUM_CONJ_MULT(newv[2], matrix.m02, v0); - CACCUMULATE_SUM_CONJ_MULT(newv[2], matrix.m12, v1); + newv[0].re = matrix.m00 * v0.re; + newv[0].im = matrix.m00 * v0.im; + CACCUMULATE_SUM_MULT(newv[0], matrix.m01, v1); + CACCUMULATE_SUM_MULT(newv[0], matrix.m02, v2); + + newv[1].re = matrix.m11 * v1.re; + newv[1].im = matrix.m11 * v1.im; + CACCUMULATE_SUM_CONJ_MULT(newv[1], matrix.m01, v0); + CACCUMULATE_SUM_MULT(newv[1], matrix.m12, v2); + + newv[2].re = matrix.m22 * v2.re; + newv[2].im = matrix.m22 * v2.im; + CACCUMULATE_SUM_CONJ_MULT(newv[2], matrix.m02, v0); + CACCUMULATE_SUM_CONJ_MULT(newv[2], matrix.m12, v1); #else - newv[0].re = matrix.m00 * v0.re + matrix.m01 * v1.re + matrix.m02 * v2.re; - newv[0].im = matrix.m00 * v0.im + matrix.m01 * v1.im + matrix.m02 * v2.im; + newv[0].re = matrix.m00 * v0.re + matrix.m01 * v1.re + matrix.m02 * v2.re; + newv[0].im = matrix.m00 * v0.im + matrix.m01 * v1.im + matrix.m02 * v2.im; - newv[1].re = matrix.m01 * v0.re + matrix.m11 * v1.re + matrix.m12 * v2.re; - newv[1].im = matrix.m01 * v0.im + matrix.m11 * v1.im + matrix.m12 * v2.im; + newv[1].re = matrix.m01 * v0.re + matrix.m11 * v1.re + matrix.m12 * v2.re; + newv[1].im = matrix.m01 * v0.im + matrix.m11 * v1.im + matrix.m12 * v2.im; - newv[2].re = matrix.m02 * v0.re + matrix.m12 * v1.re + matrix.m22 * v2.re; - newv[2].im = matrix.m02 * v0.im + matrix.m12 * v1.im + matrix.m22 * v2.im; + newv[2].re = matrix.m02 * v0.re + matrix.m12 * v1.re + matrix.m22 * v2.re; + newv[2].im = matrix.m02 * v0.im + matrix.m12 * v1.im + matrix.m22 * v2.im; #endif #ifdef DEBUG - { - real dummy; - dummy = CSCALAR_NORMSQR(newv[0]) + CSCALAR_NORMSQR(newv[1]) - + CSCALAR_NORMSQR(newv[2]); - CHECK(!BADNUM(dummy), "yikes, crazy number!"); - } + { + real dummy; + dummy = CSCALAR_NORMSQR(newv[0]) + CSCALAR_NORMSQR(newv[1]) + CSCALAR_NORMSQR(newv[2]); + CHECK(!BADNUM(dummy), "yikes, crazy number!"); + } #endif } @@ -390,198 +333,169 @@ void assign_symmatrix_vector(scalar_complex *newv, taking the curl and then Fourier transforming. The output array, dfield, is fft_output_size x cur_num_bands x 3, where fft_output_size is the local spatial indices and 3 is the field - components. + components. Note: actually, this computes just (k+G) x H, whereas the actual D field is i/omega i(k+G) x H...so, we are really computing -omega*D, here. */ -void maxwell_compute_d_from_H(maxwell_data *d, evectmatrix Hin, - scalar_complex *dfield, - int cur_band_start, int cur_num_bands) -{ - scalar *fft_data = (scalar *) dfield; - scalar *fft_data_in = d->fft_data2 == d->fft_data ? fft_data : (fft_data == d->fft_data ? d->fft_data2 : d->fft_data); - int i, j, b; - - CHECK(Hin.c == 2, "fields don't have 2 components!"); - CHECK(d, "null maxwell data pointer!"); - CHECK(dfield, "null field output data!"); - CHECK(cur_band_start >= 0 && cur_band_start + cur_num_bands <= Hin.p, - "invalid range of bands for computing fields"); - - /* first, compute fft_data = curl(Hin) (really (k+G) x H) : */ - for (i = 0; i < d->other_dims; ++i) - for (j = 0; j < d->last_dim; ++j) { - int ij = i * d->last_dim + j; - int ij2 = i * d->last_dim_size + j; - k_data cur_k = d->k_plus_G[ij]; - - for (b = 0; b < cur_num_bands; ++b) - assign_cross_t2c(&fft_data_in[3 * (ij2*cur_num_bands - + b)], - cur_k, - &Hin.data[ij * 2 * Hin.p + - b + cur_band_start], - Hin.p); - } - - /* now, convert to position space via FFT: */ - maxwell_compute_fft(+1, d, fft_data_in, fft_data, - cur_num_bands*3, cur_num_bands*3, 1); +void maxwell_compute_d_from_H(maxwell_data *d, evectmatrix Hin, scalar_complex *dfield, + int cur_band_start, int cur_num_bands) { + scalar *fft_data = (scalar *)dfield; + scalar *fft_data_in = d->fft_data2 == d->fft_data + ? fft_data + : (fft_data == d->fft_data ? d->fft_data2 : d->fft_data); + int i, j, b; + + CHECK(Hin.c == 2, "fields don't have 2 components!"); + CHECK(d, "null maxwell data pointer!"); + CHECK(dfield, "null field output data!"); + CHECK(cur_band_start >= 0 && cur_band_start + cur_num_bands <= Hin.p, + "invalid range of bands for computing fields"); + + /* first, compute fft_data = curl(Hin) (really (k+G) x H) : */ + for (i = 0; i < d->other_dims; ++i) + for (j = 0; j < d->last_dim; ++j) { + int ij = i * d->last_dim + j; + int ij2 = i * d->last_dim_size + j; + k_data cur_k = d->k_plus_G[ij]; + + for (b = 0; b < cur_num_bands; ++b) + assign_cross_t2c(&fft_data_in[3 * (ij2 * cur_num_bands + b)], cur_k, + &Hin.data[ij * 2 * Hin.p + b + cur_band_start], Hin.p); + } + + /* now, convert to position space via FFT: */ + maxwell_compute_fft(+1, d, fft_data_in, fft_data, cur_num_bands * 3, cur_num_bands * 3, 1); } /* Compute E (output in dfield) from D (input in dfield); this amounts to just dividing by the dielectric tensor. dfield is in position space and corresponds to the output from maxwell_compute_d_from_H, above. */ -void maxwell_compute_e_from_d_(maxwell_data *d, - scalar_complex *dfield, - int cur_num_bands, - symmetric_matrix *eps_inv_) -{ - int i, b; - - CHECK(d, "null maxwell data pointer!"); - CHECK(dfield, "null field input/output data!"); - - for (i = 0; i < d->fft_output_size; ++i) { - symmetric_matrix eps_inv = eps_inv_[i]; - for (b = 0; b < cur_num_bands; ++b) { - int ib = 3 * (i * cur_num_bands + b); - assign_symmatrix_vector(&dfield[ib], eps_inv, &dfield[ib]); - } - } +void maxwell_compute_e_from_d_(maxwell_data *d, scalar_complex *dfield, int cur_num_bands, + symmetric_matrix *eps_inv_) { + int i, b; + + CHECK(d, "null maxwell data pointer!"); + CHECK(dfield, "null field input/output data!"); + + for (i = 0; i < d->fft_output_size; ++i) { + symmetric_matrix eps_inv = eps_inv_[i]; + for (b = 0; b < cur_num_bands; ++b) { + int ib = 3 * (i * cur_num_bands + b); + assign_symmatrix_vector(&dfield[ib], eps_inv, &dfield[ib]); + } + } } -void maxwell_compute_e_from_d(maxwell_data *d, - scalar_complex *dfield, - int cur_num_bands) -{ - maxwell_compute_e_from_d_(d, dfield, cur_num_bands, d->eps_inv); +void maxwell_compute_e_from_d(maxwell_data *d, scalar_complex *dfield, int cur_num_bands) { + maxwell_compute_e_from_d_(d, dfield, cur_num_bands, d->eps_inv); } /* Compute the magnetic (H) field in Fourier space from the electric field (e) in position space; this amounts to Fourier transforming and then taking the curl. Also, multiply by scale. Other - parameters are as in compute_d_from_H. + parameters are as in compute_d_from_H. Note: we actually compute (k+G) x E, whereas the actual H field is -i/omega i(k+G) x E...so, we are actually computing omega*H, here. */ -void maxwell_compute_H_from_e(maxwell_data *d, evectmatrix Hout, - scalar_complex *efield, - int cur_band_start, int cur_num_bands, - real scale) -{ - scalar *fft_data = (scalar *) efield; - scalar *fft_data_out = d->fft_data2 == d->fft_data ? fft_data : (fft_data == d->fft_data ? d->fft_data2 : d->fft_data); - int i, j, b; - - CHECK(Hout.c == 2, "fields don't have 2 components!"); - CHECK(d, "null maxwell data pointer!"); - CHECK(efield, "null field output data!"); - CHECK(cur_band_start >= 0 && cur_band_start + cur_num_bands <= Hout.p, - "invalid range of bands for computing fields"); - - /* convert back to Fourier space */ - maxwell_compute_fft(-1, d, fft_data, fft_data_out, - cur_num_bands*3, cur_num_bands*3, 1); - - /* then, compute Hout = curl(fft_data) (* scale factor): */ - - for (i = 0; i < d->other_dims; ++i) - for (j = 0; j < d->last_dim; ++j) { - int ij = i * d->last_dim + j; - int ij2 = i * d->last_dim_size + j; - k_data cur_k = d->k_plus_G[ij]; - - for (b = 0; b < cur_num_bands; ++b) - assign_cross_c2t(&Hout.data[ij * 2 * Hout.p + - b + cur_band_start], - Hout.p, cur_k, - &fft_data_out[3 * (ij2*cur_num_bands+b)], - scale); - } +void maxwell_compute_H_from_e(maxwell_data *d, evectmatrix Hout, scalar_complex *efield, + int cur_band_start, int cur_num_bands, real scale) { + scalar *fft_data = (scalar *)efield; + scalar *fft_data_out = d->fft_data2 == d->fft_data + ? fft_data + : (fft_data == d->fft_data ? d->fft_data2 : d->fft_data); + int i, j, b; + + CHECK(Hout.c == 2, "fields don't have 2 components!"); + CHECK(d, "null maxwell data pointer!"); + CHECK(efield, "null field output data!"); + CHECK(cur_band_start >= 0 && cur_band_start + cur_num_bands <= Hout.p, + "invalid range of bands for computing fields"); + + /* convert back to Fourier space */ + maxwell_compute_fft(-1, d, fft_data, fft_data_out, cur_num_bands * 3, cur_num_bands * 3, 1); + + /* then, compute Hout = curl(fft_data) (* scale factor): */ + + for (i = 0; i < d->other_dims; ++i) + for (j = 0; j < d->last_dim; ++j) { + int ij = i * d->last_dim + j; + int ij2 = i * d->last_dim_size + j; + k_data cur_k = d->k_plus_G[ij]; + + for (b = 0; b < cur_num_bands; ++b) + assign_cross_c2t(&Hout.data[ij * 2 * Hout.p + b + cur_band_start], Hout.p, cur_k, + &fft_data_out[3 * (ij2 * cur_num_bands + b)], scale); + } } - /* Compute H field in position space from Hin. Parameters and output formats are the same as for compute_d_from_H, above. */ -void maxwell_compute_h_from_H(maxwell_data *d, evectmatrix Hin, - scalar_complex *hfield, - int cur_band_start, int cur_num_bands) -{ - scalar *fft_data = (scalar *) hfield; - scalar *fft_data_in = d->fft_data2 == d->fft_data ? fft_data : (fft_data == d->fft_data ? d->fft_data2 : d->fft_data); - int i, j, b; - - CHECK(Hin.c == 2, "fields don't have 2 components!"); - CHECK(d, "null maxwell data pointer!"); - CHECK(hfield, "null field output data!"); - CHECK(cur_band_start >= 0 && cur_band_start + cur_num_bands <= Hin.p, - "invalid range of bands for computing fields"); - - /* first, compute fft_data = Hin, with the vector field converted - from transverse to cartesian basis: */ - for (i = 0; i < d->other_dims; ++i) - for (j = 0; j < d->last_dim; ++j) { - int ij = i * d->last_dim + j; - int ij2 = i * d->last_dim_size + j; - k_data cur_k = d->k_plus_G[ij]; - - for (b = 0; b < cur_num_bands; ++b) - assign_t2c(&fft_data_in[3 * (ij2*cur_num_bands - + b)], - cur_k, - &Hin.data[ij * 2 * Hin.p + - b + cur_band_start], - Hin.p); - } - - /* now, convert to position space via FFT: */ - maxwell_compute_fft(+1, d, fft_data_in, fft_data, - cur_num_bands*3, cur_num_bands*3, 1); +void maxwell_compute_h_from_H(maxwell_data *d, evectmatrix Hin, scalar_complex *hfield, + int cur_band_start, int cur_num_bands) { + scalar *fft_data = (scalar *)hfield; + scalar *fft_data_in = d->fft_data2 == d->fft_data + ? fft_data + : (fft_data == d->fft_data ? d->fft_data2 : d->fft_data); + int i, j, b; + + CHECK(Hin.c == 2, "fields don't have 2 components!"); + CHECK(d, "null maxwell data pointer!"); + CHECK(hfield, "null field output data!"); + CHECK(cur_band_start >= 0 && cur_band_start + cur_num_bands <= Hin.p, + "invalid range of bands for computing fields"); + + /* first, compute fft_data = Hin, with the vector field converted + from transverse to cartesian basis: */ + for (i = 0; i < d->other_dims; ++i) + for (j = 0; j < d->last_dim; ++j) { + int ij = i * d->last_dim + j; + int ij2 = i * d->last_dim_size + j; + k_data cur_k = d->k_plus_G[ij]; + + for (b = 0; b < cur_num_bands; ++b) + assign_t2c(&fft_data_in[3 * (ij2 * cur_num_bands + b)], cur_k, + &Hin.data[ij * 2 * Hin.p + b + cur_band_start], Hin.p); + } + + /* now, convert to position space via FFT: */ + maxwell_compute_fft(+1, d, fft_data_in, fft_data, cur_num_bands * 3, cur_num_bands * 3, 1); } -void maxwell_compute_H_from_B(maxwell_data *d, evectmatrix Bin, - evectmatrix Hout, scalar_complex *hfield, - int Bin_band_start, int Hout_band_start, - int cur_num_bands) -{ - scalar *fft_data = (scalar *) hfield; - scalar *fft_data_out = d->fft_data2 == d->fft_data ? fft_data : (fft_data == d->fft_data ? d->fft_data2 : d->fft_data); - int i, j, b; - real scale = 1.0 / Hout.N; /* scale factor to normalize FFTs */ - - if (d->mu_inv == NULL) { - if (Bin.data != Hout.data) - evectmatrix_copy_slice(Hout, Bin, - Hout_band_start, Bin_band_start, - cur_num_bands); - return; - } - - maxwell_compute_h_from_H(d, Bin, hfield, Bin_band_start, cur_num_bands); - maxwell_compute_e_from_d_(d, hfield, cur_num_bands, d->mu_inv); - - /* convert back to Fourier space */ - maxwell_compute_fft(-1, d, fft_data, fft_data_out, - cur_num_bands*3, cur_num_bands*3, 1); - - /* then, compute Hout = (transverse component)(fft_data) * scale factor */ - for (i = 0; i < d->other_dims; ++i) - for (j = 0; j < d->last_dim; ++j) { - int ij = i * d->last_dim + j; - int ij2 = i * d->last_dim_size + j; - k_data cur_k = d->k_plus_G[ij]; - for (b = 0; b < cur_num_bands; ++b) - project_c2t(&Hout.data[ij * 2 * Hout.p + - b + Hout_band_start], - Hout.p, cur_k, - &fft_data_out[3 * (ij2*cur_num_bands+b)], - scale); - } +void maxwell_compute_H_from_B(maxwell_data *d, evectmatrix Bin, evectmatrix Hout, + scalar_complex *hfield, int Bin_band_start, int Hout_band_start, + int cur_num_bands) { + scalar *fft_data = (scalar *)hfield; + scalar *fft_data_out = d->fft_data2 == d->fft_data + ? fft_data + : (fft_data == d->fft_data ? d->fft_data2 : d->fft_data); + int i, j, b; + real scale = 1.0 / Hout.N; /* scale factor to normalize FFTs */ + + if (d->mu_inv == NULL) { + if (Bin.data != Hout.data) + evectmatrix_copy_slice(Hout, Bin, Hout_band_start, Bin_band_start, cur_num_bands); + return; + } + + maxwell_compute_h_from_H(d, Bin, hfield, Bin_band_start, cur_num_bands); + maxwell_compute_e_from_d_(d, hfield, cur_num_bands, d->mu_inv); + + /* convert back to Fourier space */ + maxwell_compute_fft(-1, d, fft_data, fft_data_out, cur_num_bands * 3, cur_num_bands * 3, 1); + + /* then, compute Hout = (transverse component)(fft_data) * scale factor */ + for (i = 0; i < d->other_dims; ++i) + for (j = 0; j < d->last_dim; ++j) { + int ij = i * d->last_dim + j; + int ij2 = i * d->last_dim_size + j; + k_data cur_k = d->k_plus_G[ij]; + for (b = 0; b < cur_num_bands; ++b) + project_c2t(&Hout.data[ij * 2 * Hout.p + b + Hout_band_start], Hout.p, cur_k, + &fft_data_out[3 * (ij2 * cur_num_bands + b)], scale); + } } - /**************************************************************************/ /* The following functions take a complex or real vector field @@ -634,403 +548,439 @@ void maxwell_compute_H_from_B(maxwell_data *d, evectmatrix Bin, other "half." phase{x,y,z} is the phase k*R{x,y,z}, in "units" of 2*pi. (Equivalently, phase{x,y,z} is the k vector in the reciprocal lattice basis.) */ -void maxwell_vectorfield_otherhalf(maxwell_data *d, scalar_complex *field, - real phasex, real phasey, real phasez) -{ +void maxwell_vectorfield_otherhalf(maxwell_data *d, scalar_complex *field, real phasex, real phasey, + real phasez) { #ifndef SCALAR_COMPLEX - int i, j, jmin = 1; - int rank, n_other, n_last, n_last_stored, n_last_new, nx, ny, nz, nxmax; -# ifdef HAVE_MPI - int local_x_start; -# endif - scalar_complex pz, pxz, pyz, pxyz; - - nxmax = nx = d->nx; ny = d->ny; nz = d->nz; - n_other = d->other_dims; - n_last = d->last_dim; - n_last_stored = d->last_dim_size / 2; - n_last_new = n_last - n_last_stored; /* < n_last_stored always */ - rank = (nz == 1) ? (ny == 1 ? 1 : 2) : 3; - -# ifdef HAVE_MPI - local_x_start = d->local_y_start; - CHECK(rank == 2 || rank == 3, "unsupported rfftwnd_mpi rank!"); - if (rank == 2) { - n_other = nx; - n_last_new = ny = d->local_ny; - if (local_x_start == 0) - --n_last_new; /* DC frequency should not be in other half */ - else - jmin = 0; - if (local_x_start + ny == n_last_stored && n_last % 2 == 0) - --n_last_new; /* Nyquist freq. should not be in other half */ - n_last_stored = ny; - } - else { /* rank == 3 */ - ny = nx; - nx = d->local_ny; - nxmax = local_x_start ? nx - 1 : nx; - n_other = nx * ny; - } -# endif /* HAVE_MPI */ - - /* compute p = exp(i*phase) factors: */ - phasex *= -TWOPI; phasey *= -TWOPI; phasez *= -TWOPI; - switch (rank) { /* treat z as the last/truncated dimension always */ - case 3: break; -# if defined(HAVE_MPI) && ! defined(SCALAR_COMPLEX) - case 2: phasez = phasex; phasex = phasey; phasey = 0; break; -# else - case 2: phasez = phasey; phasey = 0; break; -# endif - case 1: phasez = phasex; phasex = phasey = 0; break; - } - CASSIGN_SCALAR(pz, cos(phasez), sin(phasez)); - phasex += phasez; - CASSIGN_SCALAR(pxz, cos(phasex), sin(phasex)); - phasex += phasey; - CASSIGN_SCALAR(pxyz, cos(phasex), sin(phasex)); - phasey += phasez; - CASSIGN_SCALAR(pyz, cos(phasey), sin(phasey)); - -/* convenience macros to copy vectors, vectors times phases, + int i, j, jmin = 1; + int rank, n_other, n_last, n_last_stored, n_last_new, nx, ny, nz, nxmax; +#ifdef HAVE_MPI + int local_x_start; +#endif + scalar_complex pz, pxz, pyz, pxyz; + + nxmax = nx = d->nx; + ny = d->ny; + nz = d->nz; + n_other = d->other_dims; + n_last = d->last_dim; + n_last_stored = d->last_dim_size / 2; + n_last_new = n_last - n_last_stored; /* < n_last_stored always */ + rank = (nz == 1) ? (ny == 1 ? 1 : 2) : 3; + +#ifdef HAVE_MPI + local_x_start = d->local_y_start; + CHECK(rank == 2 || rank == 3, "unsupported rfftwnd_mpi rank!"); + if (rank == 2) { + n_other = nx; + n_last_new = ny = d->local_ny; + if (local_x_start == 0) + --n_last_new; /* DC frequency should not be in other half */ + else + jmin = 0; + if (local_x_start + ny == n_last_stored && n_last % 2 == 0) + --n_last_new; /* Nyquist freq. should not be in other half */ + n_last_stored = ny; + } + else { /* rank == 3 */ + ny = nx; + nx = d->local_ny; + nxmax = local_x_start ? nx - 1 : nx; + n_other = nx * ny; + } +#endif /* HAVE_MPI */ + + /* compute p = exp(i*phase) factors: */ + phasex *= -TWOPI; + phasey *= -TWOPI; + phasez *= -TWOPI; + switch (rank) { /* treat z as the last/truncated dimension always */ + case 3: break; +#if defined(HAVE_MPI) && !defined(SCALAR_COMPLEX) + case 2: + phasez = phasex; + phasex = phasey; + phasey = 0; + break; +#else + case 2: + phasez = phasey; + phasey = 0; + break; +#endif + case 1: + phasez = phasex; + phasex = phasey = 0; + break; + } + CASSIGN_SCALAR(pz, cos(phasez), sin(phasez)); + phasex += phasez; + CASSIGN_SCALAR(pxz, cos(phasex), sin(phasex)); + phasex += phasey; + CASSIGN_SCALAR(pxyz, cos(phasex), sin(phasex)); + phasey += phasez; + CASSIGN_SCALAR(pyz, cos(phasey), sin(phasey)); + +/* convenience macros to copy vectors, vectors times phases, and conjugated vectors: */ -# define ASSIGN_V(f,k,f2,k2) { f[3*(k)+0] = f2[3*(k2)+0]; \ - f[3*(k)+1] = f2[3*(k2)+1]; \ - f[3*(k)+2] = f2[3*(k2)+2]; } -# define ASSIGN_VP(f,k,f2,k2,p) { CASSIGN_MULT(f[3*(k)+0], f2[3*(k2)+0], p); \ - CASSIGN_MULT(f[3*(k)+1], f2[3*(k2)+1], p); \ - CASSIGN_MULT(f[3*(k)+2], f2[3*(k2)+2], p); } -# define ASSIGN_CV(f,k,f2,k2) { CASSIGN_CONJ(f[3*(k)+0], f2[3*(k2)+0]); \ - CASSIGN_CONJ(f[3*(k)+1], f2[3*(k2)+1]); \ - CASSIGN_CONJ(f[3*(k)+2], f2[3*(k2)+2]); } - - /* First, swap the order of elements and multiply by exp(ikR) - phase factors. We have to be careful here not to double-swap - any element pair; this is prevented by never swapping with a - "conjugated" point that is earlier in the array. */ - - if (rank == 3) { - int ix, iy; - for (ix = 0; 2*ix <= nxmax; ++ix) { - int xdiff, ixc; -# ifdef HAVE_MPI - if (local_x_start == 0) { - xdiff = ix != 0; ixc = (nx - ix) % nx; - } - else { - xdiff = 1; ixc = nx-1 - ix; - } -# else - xdiff = ix != 0; ixc = (nx - ix) % nx; -# endif - for (iy = 0; iy < ny; ++iy) { - int ydiff = iy != 0; - int i = ix * ny + iy, ic = ixc * ny + (ny - iy) % ny, jmax; - if (ic < i) - continue; - jmax = n_last_new; - if (ic == i) - jmax = (jmax + 1) / 2; - for (j = 1; j <= jmax; ++j) { - int jc = n_last_new + 1 - j; - int ij = i*n_last_stored + j; - int ijc = ic*n_last_stored + jc; - scalar_complex f_tmp[3]; - switch (xdiff*2 + ydiff) { /* pick exp(-ikR) phase */ - case 3: /* xdiff && ydiff */ - ASSIGN_VP(f_tmp, 0, field, ijc, pxyz); - ASSIGN_VP(field, ijc, field, ij, pxyz); - ASSIGN_V(field, ij, f_tmp, 0); - break; - case 2: /* xdiff && !ydiff */ - ASSIGN_VP(f_tmp, 0, field, ijc, pxz); - ASSIGN_VP(field, ijc, field, ij, pxz); - ASSIGN_V(field, ij, f_tmp, 0); - break; - case 1: /* !xdiff && ydiff */ - ASSIGN_VP(f_tmp, 0, field, ijc, pyz); - ASSIGN_VP(field, ijc, field, ij, pyz); - ASSIGN_V(field, ij, f_tmp, 0); - break; - case 0: /* !xdiff && !ydiff */ - ASSIGN_VP(f_tmp, 0, field, ijc, pz); - ASSIGN_VP(field, ijc, field, ij, pz); - ASSIGN_V(field, ij, f_tmp, 0); - break; - } - } - } - } - - /* Next, conjugate, and remove the holes from the array - corresponding to the DC and Nyquist frequencies (which were in - the first half already): */ - for (i = 0; i < n_other; ++i) - for (j = 1; j < n_last_new + 1; ++j) { - int ij = i*n_last_stored + j, ijnew = i*n_last_new + j-1; - ASSIGN_CV(field, ijnew, field, ij); - } - } - else /* if (rank <= 2) */ { - int i; - if (rank == 1) /* (note that 1d MPI transforms are not allowed) */ - nx = 1; /* x dimension is handled by j (last dimension) loop */ - -# ifdef HAVE_MPI - for (i = 0; i < nx; ++i) -# else - for (i = 0; 2*i <= nx; ++i) -# endif - { - int xdiff = i != 0, ic = (nx - i) % nx; - int jmax = n_last_new + (jmin - 1); -# ifndef HAVE_MPI - if (ic == i) - jmax = (jmax + 1) / 2; -# endif - for (j = jmin; j <= jmax; ++j) { - scalar_complex f_tmp[3]; -# ifdef HAVE_MPI - int jc = jmax + jmin - j; - int ij = j * nx + i; - int ijc = jc * nx + ic; - if (ijc < ij) - break; -# else /* ! HAVE_MPI */ - int jc = n_last_new + 1 - j; - int ij = i*n_last_stored + j; - int ijc = ic*n_last_stored + jc; -# endif /* ! HAVE_MPI */ - if (xdiff) { - ASSIGN_VP(f_tmp, 0, field, ijc, pxz); - ASSIGN_VP(field, ijc, field, ij, pxz); - ASSIGN_V(field, ij, f_tmp, 0); - } - else { - ASSIGN_VP(f_tmp, 0, field, ijc, pz); - ASSIGN_VP(field, ijc, field, ij, pz); - ASSIGN_V(field, ij, f_tmp, 0); - } - } - } - - /* Next, conjugate, and remove the holes from the array - corresponding to the DC and Nyquist frequencies (which were in - the first half already): */ - for (i = 0; i < nx; ++i) - for (j = jmin; j < n_last_new + jmin; ++j) { -# ifdef HAVE_MPI - int ij = j*nx + i, ijnew = (j-jmin)*nx + i; -# else - int ij = i*n_last_stored + j, ijnew = i*n_last_new + j-1; -# endif - ASSIGN_CV(field, ijnew, field, ij); - } - } - -# undef ASSIGN_V -# undef ASSIGN_VP -# undef ASSIGN_CV +#define ASSIGN_V(f, k, f2, k2) \ + { \ + f[3 * (k) + 0] = f2[3 * (k2) + 0]; \ + f[3 * (k) + 1] = f2[3 * (k2) + 1]; \ + f[3 * (k) + 2] = f2[3 * (k2) + 2]; \ + } +#define ASSIGN_VP(f, k, f2, k2, p) \ + { \ + CASSIGN_MULT(f[3 * (k) + 0], f2[3 * (k2) + 0], p); \ + CASSIGN_MULT(f[3 * (k) + 1], f2[3 * (k2) + 1], p); \ + CASSIGN_MULT(f[3 * (k) + 2], f2[3 * (k2) + 2], p); \ + } +#define ASSIGN_CV(f, k, f2, k2) \ + { \ + CASSIGN_CONJ(f[3 * (k) + 0], f2[3 * (k2) + 0]); \ + CASSIGN_CONJ(f[3 * (k) + 1], f2[3 * (k2) + 1]); \ + CASSIGN_CONJ(f[3 * (k) + 2], f2[3 * (k2) + 2]); \ + } + + /* First, swap the order of elements and multiply by exp(ikR) + phase factors. We have to be careful here not to double-swap + any element pair; this is prevented by never swapping with a + "conjugated" point that is earlier in the array. */ + + if (rank == 3) { + int ix, iy; + for (ix = 0; 2 * ix <= nxmax; ++ix) { + int xdiff, ixc; +#ifdef HAVE_MPI + if (local_x_start == 0) { + xdiff = ix != 0; + ixc = (nx - ix) % nx; + } + else { + xdiff = 1; + ixc = nx - 1 - ix; + } +#else + xdiff = ix != 0; + ixc = (nx - ix) % nx; +#endif + for (iy = 0; iy < ny; ++iy) { + int ydiff = iy != 0; + int i = ix * ny + iy, ic = ixc * ny + (ny - iy) % ny, jmax; + if (ic < i) continue; + jmax = n_last_new; + if (ic == i) jmax = (jmax + 1) / 2; + for (j = 1; j <= jmax; ++j) { + int jc = n_last_new + 1 - j; + int ij = i * n_last_stored + j; + int ijc = ic * n_last_stored + jc; + scalar_complex f_tmp[3]; + switch (xdiff * 2 + ydiff) { /* pick exp(-ikR) phase */ + case 3: /* xdiff && ydiff */ + ASSIGN_VP(f_tmp, 0, field, ijc, pxyz); + ASSIGN_VP(field, ijc, field, ij, pxyz); + ASSIGN_V(field, ij, f_tmp, 0); + break; + case 2: /* xdiff && !ydiff */ + ASSIGN_VP(f_tmp, 0, field, ijc, pxz); + ASSIGN_VP(field, ijc, field, ij, pxz); + ASSIGN_V(field, ij, f_tmp, 0); + break; + case 1: /* !xdiff && ydiff */ + ASSIGN_VP(f_tmp, 0, field, ijc, pyz); + ASSIGN_VP(field, ijc, field, ij, pyz); + ASSIGN_V(field, ij, f_tmp, 0); + break; + case 0: /* !xdiff && !ydiff */ + ASSIGN_VP(f_tmp, 0, field, ijc, pz); + ASSIGN_VP(field, ijc, field, ij, pz); + ASSIGN_V(field, ij, f_tmp, 0); + break; + } + } + } + } + + /* Next, conjugate, and remove the holes from the array + corresponding to the DC and Nyquist frequencies (which were in + the first half already): */ + for (i = 0; i < n_other; ++i) + for (j = 1; j < n_last_new + 1; ++j) { + int ij = i * n_last_stored + j, ijnew = i * n_last_new + j - 1; + ASSIGN_CV(field, ijnew, field, ij); + } + } + else /* if (rank <= 2) */ { + int i; + if (rank == 1) /* (note that 1d MPI transforms are not allowed) */ + nx = 1; /* x dimension is handled by j (last dimension) loop */ + +#ifdef HAVE_MPI + for (i = 0; i < nx; ++i) +#else + for (i = 0; 2 * i <= nx; ++i) +#endif + { + int xdiff = i != 0, ic = (nx - i) % nx; + int jmax = n_last_new + (jmin - 1); +#ifndef HAVE_MPI + if (ic == i) jmax = (jmax + 1) / 2; +#endif + for (j = jmin; j <= jmax; ++j) { + scalar_complex f_tmp[3]; +#ifdef HAVE_MPI + int jc = jmax + jmin - j; + int ij = j * nx + i; + int ijc = jc * nx + ic; + if (ijc < ij) break; +#else /* ! HAVE_MPI */ + int jc = n_last_new + 1 - j; + int ij = i * n_last_stored + j; + int ijc = ic * n_last_stored + jc; +#endif /* ! HAVE_MPI */ + if (xdiff) { + ASSIGN_VP(f_tmp, 0, field, ijc, pxz); + ASSIGN_VP(field, ijc, field, ij, pxz); + ASSIGN_V(field, ij, f_tmp, 0); + } + else { + ASSIGN_VP(f_tmp, 0, field, ijc, pz); + ASSIGN_VP(field, ijc, field, ij, pz); + ASSIGN_V(field, ij, f_tmp, 0); + } + } + } + + /* Next, conjugate, and remove the holes from the array + corresponding to the DC and Nyquist frequencies (which were in + the first half already): */ + for (i = 0; i < nx; ++i) + for (j = jmin; j < n_last_new + jmin; ++j) { +#ifdef HAVE_MPI + int ij = j * nx + i, ijnew = (j - jmin) * nx + i; +#else + int ij = i * n_last_stored + j, ijnew = i * n_last_new + j - 1; +#endif + ASSIGN_CV(field, ijnew, field, ij); + } + } + +#undef ASSIGN_V +#undef ASSIGN_VP +#undef ASSIGN_CV #endif /* ! SCALAR_COMPLEX */ } -/* as vectorfield_otherhalf, but operates on a complex scalar field +/* as vectorfield_otherhalf, but operates on a complex scalar field ... ugh, copy & paste job */ -void maxwell_cscalarfield_otherhalf(maxwell_data *d, scalar_complex *field, - real phasex, real phasey, real phasez) -{ +void maxwell_cscalarfield_otherhalf(maxwell_data *d, scalar_complex *field, real phasex, + real phasey, real phasez) { #ifndef SCALAR_COMPLEX - int i, j, jmin = 1; - int rank, n_other, n_last, n_last_stored, n_last_new, nx, ny, nz, nxmax; -# ifdef HAVE_MPI - int local_x_start; -# endif - scalar_complex pz, pxz, pyz, pxyz; - - nxmax = nx = d->nx; ny = d->ny; nz = d->nz; - n_other = d->other_dims; - n_last = d->last_dim; - n_last_stored = d->last_dim_size / 2; - n_last_new = n_last - n_last_stored; /* < n_last_stored always */ - rank = (nz == 1) ? (ny == 1 ? 1 : 2) : 3; - -# ifdef HAVE_MPI - local_x_start = d->local_y_start; - CHECK(rank == 2 || rank == 3, "unsupported rfftwnd_mpi rank!"); - if (rank == 2) { - n_other = nx; - n_last_new = ny = d->local_ny; - if (local_x_start == 0) - --n_last_new; /* DC frequency should not be in other half */ - else - jmin = 0; - if (local_x_start + ny == n_last_stored && n_last % 2 == 0) - --n_last_new; /* Nyquist freq. should not be in other half */ - n_last_stored = ny; - } - else { /* rank == 3 */ - ny = nx; - nx = d->local_ny; - nxmax = local_x_start ? nx - 1 : nx; - n_other = nx * ny; - } -# endif /* HAVE_MPI */ - - /* compute p = exp(i*phase) factors: */ - phasex *= -TWOPI; phasey *= -TWOPI; phasez *= -TWOPI; - switch (rank) { /* treat z as the last/truncated dimension always */ - case 3: break; -# if defined(HAVE_MPI) && ! defined(SCALAR_COMPLEX) - case 2: phasez = phasex; phasex = phasey; phasey = 0; break; -# else - case 2: phasez = phasey; phasey = 0; break; -# endif - case 1: phasez = phasex; phasex = phasey = 0; break; - } - CASSIGN_SCALAR(pz, cos(phasez), sin(phasez)); - phasex += phasez; - CASSIGN_SCALAR(pxz, cos(phasex), sin(phasex)); - phasex += phasey; - CASSIGN_SCALAR(pxyz, cos(phasex), sin(phasex)); - phasey += phasez; - CASSIGN_SCALAR(pyz, cos(phasey), sin(phasey)); - -/* convenience macros to copy cscalars, cscalars times phases, + int i, j, jmin = 1; + int rank, n_other, n_last, n_last_stored, n_last_new, nx, ny, nz, nxmax; +#ifdef HAVE_MPI + int local_x_start; +#endif + scalar_complex pz, pxz, pyz, pxyz; + + nxmax = nx = d->nx; + ny = d->ny; + nz = d->nz; + n_other = d->other_dims; + n_last = d->last_dim; + n_last_stored = d->last_dim_size / 2; + n_last_new = n_last - n_last_stored; /* < n_last_stored always */ + rank = (nz == 1) ? (ny == 1 ? 1 : 2) : 3; + +#ifdef HAVE_MPI + local_x_start = d->local_y_start; + CHECK(rank == 2 || rank == 3, "unsupported rfftwnd_mpi rank!"); + if (rank == 2) { + n_other = nx; + n_last_new = ny = d->local_ny; + if (local_x_start == 0) + --n_last_new; /* DC frequency should not be in other half */ + else + jmin = 0; + if (local_x_start + ny == n_last_stored && n_last % 2 == 0) + --n_last_new; /* Nyquist freq. should not be in other half */ + n_last_stored = ny; + } + else { /* rank == 3 */ + ny = nx; + nx = d->local_ny; + nxmax = local_x_start ? nx - 1 : nx; + n_other = nx * ny; + } +#endif /* HAVE_MPI */ + + /* compute p = exp(i*phase) factors: */ + phasex *= -TWOPI; + phasey *= -TWOPI; + phasez *= -TWOPI; + switch (rank) { /* treat z as the last/truncated dimension always */ + case 3: break; +#if defined(HAVE_MPI) && !defined(SCALAR_COMPLEX) + case 2: + phasez = phasex; + phasex = phasey; + phasey = 0; + break; +#else + case 2: + phasez = phasey; + phasey = 0; + break; +#endif + case 1: + phasez = phasex; + phasex = phasey = 0; + break; + } + CASSIGN_SCALAR(pz, cos(phasez), sin(phasez)); + phasex += phasez; + CASSIGN_SCALAR(pxz, cos(phasex), sin(phasex)); + phasex += phasey; + CASSIGN_SCALAR(pxyz, cos(phasex), sin(phasex)); + phasey += phasez; + CASSIGN_SCALAR(pyz, cos(phasey), sin(phasey)); + +/* convenience macros to copy cscalars, cscalars times phases, and conjugated cscalars (THIS IS THE ONLY CODE THAT WAS CHANGED COMPARED TO vectorfield_otherhalf): */ -# define ASSIGN_V(f,k,f2,k2) { f[k] = f2[k2]; } -# define ASSIGN_VP(f,k,f2,k2,p) { CASSIGN_MULT(f[k], f2[k2], p); } -# define ASSIGN_CV(f,k,f2,k2) { CASSIGN_CONJ(f[k], f2[k2]); } - - /* First, swap the order of elements and multiply by exp(ikR) - phase factors. We have to be careful here not to double-swap - any element pair; this is prevented by never swapping with a - "conjugated" point that is earlier in the array. */ - - if (rank == 3) { - int ix, iy; - for (ix = 0; 2*ix <= nxmax; ++ix) { - int xdiff, ixc; -# ifdef HAVE_MPI - if (local_x_start == 0) { - xdiff = ix != 0; ixc = (nx - ix) % nx; - } - else { - xdiff = 1; ixc = nx-1 - ix; - } -# else - xdiff = ix != 0; ixc = (nx - ix) % nx; -# endif - for (iy = 0; iy < ny; ++iy) { - int ydiff = iy != 0; - int i = ix * ny + iy, ic = ixc * ny + (ny - iy) % ny, jmax; - if (ic < i) - continue; - jmax = n_last_new; - if (ic == i) - jmax = (jmax + 1) / 2; - for (j = 1; j <= jmax; ++j) { - int jc = n_last_new + 1 - j; - int ij = i*n_last_stored + j; - int ijc = ic*n_last_stored + jc; - scalar_complex f_tmp[3]; - switch (xdiff*2 + ydiff) { /* pick exp(-ikR) phase */ - case 3: /* xdiff && ydiff */ - ASSIGN_VP(f_tmp, 0, field, ijc, pxyz); - ASSIGN_VP(field, ijc, field, ij, pxyz); - ASSIGN_V(field, ij, f_tmp, 0); - break; - case 2: /* xdiff && !ydiff */ - ASSIGN_VP(f_tmp, 0, field, ijc, pxz); - ASSIGN_VP(field, ijc, field, ij, pxz); - ASSIGN_V(field, ij, f_tmp, 0); - break; - case 1: /* !xdiff && ydiff */ - ASSIGN_VP(f_tmp, 0, field, ijc, pyz); - ASSIGN_VP(field, ijc, field, ij, pyz); - ASSIGN_V(field, ij, f_tmp, 0); - break; - case 0: /* !xdiff && !ydiff */ - ASSIGN_VP(f_tmp, 0, field, ijc, pz); - ASSIGN_VP(field, ijc, field, ij, pz); - ASSIGN_V(field, ij, f_tmp, 0); - break; - } - } - } - } - - /* Next, conjugate, and remove the holes from the array - corresponding to the DC and Nyquist frequencies (which were in - the first half already): */ - for (i = 0; i < n_other; ++i) - for (j = 1; j < n_last_new + 1; ++j) { - int ij = i*n_last_stored + j, ijnew = i*n_last_new + j-1; - ASSIGN_CV(field, ijnew, field, ij); - } - } - else /* if (rank <= 2) */ { - int i; - if (rank == 1) /* (note that 1d MPI transforms are not allowed) */ - nx = 1; /* x dimension is handled by j (last dimension) loop */ - -# ifdef HAVE_MPI - for (i = 0; i < nx; ++i) -# else - for (i = 0; 2*i <= nx; ++i) -# endif - { - int xdiff = i != 0, ic = (nx - i) % nx; - int jmax = n_last_new + (jmin - 1); -# ifndef HAVE_MPI - if (ic == i) - jmax = (jmax + 1) / 2; -# endif - for (j = jmin; j <= jmax; ++j) { - scalar_complex f_tmp[3]; -# ifdef HAVE_MPI - int jc = jmax + jmin - j; - int ij = j * nx + i; - int ijc = jc * nx + ic; - if (ijc < ij) - break; -# else /* ! HAVE_MPI */ - int jc = n_last_new + 1 - j; - int ij = i*n_last_stored + j; - int ijc = ic*n_last_stored + jc; -# endif /* ! HAVE_MPI */ - if (xdiff) { - ASSIGN_VP(f_tmp, 0, field, ijc, pxz); - ASSIGN_VP(field, ijc, field, ij, pxz); - ASSIGN_V(field, ij, f_tmp, 0); - } - else { - ASSIGN_VP(f_tmp, 0, field, ijc, pz); - ASSIGN_VP(field, ijc, field, ij, pz); - ASSIGN_V(field, ij, f_tmp, 0); - } - } - } - - /* Next, conjugate, and remove the holes from the array - corresponding to the DC and Nyquist frequencies (which were in - the first half already): */ - for (i = 0; i < nx; ++i) - for (j = jmin; j < n_last_new + jmin; ++j) { -# ifdef HAVE_MPI - int ij = j*nx + i, ijnew = (j-jmin)*nx + i; -# else - int ij = i*n_last_stored + j, ijnew = i*n_last_new + j-1; -# endif - ASSIGN_CV(field, ijnew, field, ij); - } - } - -# undef ASSIGN_V -# undef ASSIGN_VP -# undef ASSIGN_CV +#define ASSIGN_V(f, k, f2, k2) \ + { f[k] = f2[k2]; } +#define ASSIGN_VP(f, k, f2, k2, p) \ + { CASSIGN_MULT(f[k], f2[k2], p); } +#define ASSIGN_CV(f, k, f2, k2) \ + { CASSIGN_CONJ(f[k], f2[k2]); } + + /* First, swap the order of elements and multiply by exp(ikR) + phase factors. We have to be careful here not to double-swap + any element pair; this is prevented by never swapping with a + "conjugated" point that is earlier in the array. */ + + if (rank == 3) { + int ix, iy; + for (ix = 0; 2 * ix <= nxmax; ++ix) { + int xdiff, ixc; +#ifdef HAVE_MPI + if (local_x_start == 0) { + xdiff = ix != 0; + ixc = (nx - ix) % nx; + } + else { + xdiff = 1; + ixc = nx - 1 - ix; + } +#else + xdiff = ix != 0; + ixc = (nx - ix) % nx; +#endif + for (iy = 0; iy < ny; ++iy) { + int ydiff = iy != 0; + int i = ix * ny + iy, ic = ixc * ny + (ny - iy) % ny, jmax; + if (ic < i) continue; + jmax = n_last_new; + if (ic == i) jmax = (jmax + 1) / 2; + for (j = 1; j <= jmax; ++j) { + int jc = n_last_new + 1 - j; + int ij = i * n_last_stored + j; + int ijc = ic * n_last_stored + jc; + scalar_complex f_tmp[3]; + switch (xdiff * 2 + ydiff) { /* pick exp(-ikR) phase */ + case 3: /* xdiff && ydiff */ + ASSIGN_VP(f_tmp, 0, field, ijc, pxyz); + ASSIGN_VP(field, ijc, field, ij, pxyz); + ASSIGN_V(field, ij, f_tmp, 0); + break; + case 2: /* xdiff && !ydiff */ + ASSIGN_VP(f_tmp, 0, field, ijc, pxz); + ASSIGN_VP(field, ijc, field, ij, pxz); + ASSIGN_V(field, ij, f_tmp, 0); + break; + case 1: /* !xdiff && ydiff */ + ASSIGN_VP(f_tmp, 0, field, ijc, pyz); + ASSIGN_VP(field, ijc, field, ij, pyz); + ASSIGN_V(field, ij, f_tmp, 0); + break; + case 0: /* !xdiff && !ydiff */ + ASSIGN_VP(f_tmp, 0, field, ijc, pz); + ASSIGN_VP(field, ijc, field, ij, pz); + ASSIGN_V(field, ij, f_tmp, 0); + break; + } + } + } + } + + /* Next, conjugate, and remove the holes from the array + corresponding to the DC and Nyquist frequencies (which were in + the first half already): */ + for (i = 0; i < n_other; ++i) + for (j = 1; j < n_last_new + 1; ++j) { + int ij = i * n_last_stored + j, ijnew = i * n_last_new + j - 1; + ASSIGN_CV(field, ijnew, field, ij); + } + } + else /* if (rank <= 2) */ { + int i; + if (rank == 1) /* (note that 1d MPI transforms are not allowed) */ + nx = 1; /* x dimension is handled by j (last dimension) loop */ + +#ifdef HAVE_MPI + for (i = 0; i < nx; ++i) +#else + for (i = 0; 2 * i <= nx; ++i) +#endif + { + int xdiff = i != 0, ic = (nx - i) % nx; + int jmax = n_last_new + (jmin - 1); +#ifndef HAVE_MPI + if (ic == i) jmax = (jmax + 1) / 2; +#endif + for (j = jmin; j <= jmax; ++j) { + scalar_complex f_tmp[3]; +#ifdef HAVE_MPI + int jc = jmax + jmin - j; + int ij = j * nx + i; + int ijc = jc * nx + ic; + if (ijc < ij) break; +#else /* ! HAVE_MPI */ + int jc = n_last_new + 1 - j; + int ij = i * n_last_stored + j; + int ijc = ic * n_last_stored + jc; +#endif /* ! HAVE_MPI */ + if (xdiff) { + ASSIGN_VP(f_tmp, 0, field, ijc, pxz); + ASSIGN_VP(field, ijc, field, ij, pxz); + ASSIGN_V(field, ij, f_tmp, 0); + } + else { + ASSIGN_VP(f_tmp, 0, field, ijc, pz); + ASSIGN_VP(field, ijc, field, ij, pz); + ASSIGN_V(field, ij, f_tmp, 0); + } + } + } + + /* Next, conjugate, and remove the holes from the array + corresponding to the DC and Nyquist frequencies (which were in + the first half already): */ + for (i = 0; i < nx; ++i) + for (j = jmin; j < n_last_new + jmin; ++j) { +#ifdef HAVE_MPI + int ij = j * nx + i, ijnew = (j - jmin) * nx + i; +#else + int ij = i * n_last_stored + j, ijnew = i * n_last_new + j - 1; +#endif + ASSIGN_CV(field, ijnew, field, ij); + } + } + +#undef ASSIGN_V +#undef ASSIGN_VP +#undef ASSIGN_CV #endif /* ! SCALAR_COMPLEX */ } @@ -1039,211 +989,195 @@ void maxwell_cscalarfield_otherhalf(maxwell_data *d, scalar_complex *field, a real scalar field, which is assumed to have come from e.g. the absolute values of a complex field (and thus no phase factors or conjugations are necessary). */ -void maxwell_scalarfield_otherhalf(maxwell_data *d, real *field) -{ +void maxwell_scalarfield_otherhalf(maxwell_data *d, real *field) { #ifndef SCALAR_COMPLEX - int i, j, jmin = 1; - int rank, n_other, n_last, n_last_stored, n_last_new, nx, ny, nz, nxmax; -# ifdef HAVE_MPI - int local_x_start; -# endif - - nxmax = nx = d->nx; ny = d->ny; nz = d->nz; - n_other = d->other_dims; - n_last = d->last_dim; - n_last_stored = d->last_dim_size / 2; - n_last_new = n_last - n_last_stored; /* < n_last_stored always */ - rank = (nz == 1) ? (ny == 1 ? 1 : 2) : 3; - -# ifdef HAVE_MPI - local_x_start = d->local_y_start; - CHECK(rank == 2 || rank == 3, "unsupported rfftwnd_mpi rank!"); - if (rank == 2) { - n_other = nx; - n_last_new = ny = d->local_ny; - if (local_x_start == 0) - --n_last_new; /* DC frequency should not be in other half */ - else - jmin = 0; - if (local_x_start + ny == n_last_stored && n_last % 2 == 0) - --n_last_new; /* Nyquist freq. should not be in other half */ - n_last_stored = ny; - } - else { /* rank == 3 */ - ny = nx; - nx = d->local_ny; - nxmax = local_x_start ? nx - 1 : nx; - n_other = nx * ny; - } -# endif /* HAVE_MPI */ - - /* First, swap the order of elements and multiply by exp(ikR) - phase factors. We have to be careful here not to double-swap - any element pair; this is prevented by never swapping with a - "conjugated" point that is earlier in the array. */ - - if (rank == 3) { - int ix, iy; - for (ix = 0; 2*ix <= nxmax; ++ix) { - int ixc; -# ifdef HAVE_MPI - if (local_x_start == 0) - ixc = (nx - ix) % nx; - else - ixc = nx-1 - ix; -# else - ixc = (nx - ix) % nx; -# endif - for (iy = 0; iy < ny; ++iy) { - int i = ix * ny + iy, ic = ixc * ny + (ny - iy) % ny, jmax; - if (ic < i) - continue; - jmax = n_last_new; - if (ic == i) - jmax = (jmax + 1) / 2; - for (j = 1; j <= jmax; ++j) { - int jc = n_last_new + 1 - j; - int ij = i*n_last_stored + j; - int ijc = ic*n_last_stored + jc; - real f_tmp; - f_tmp = field[ijc]; - field[ijc] = field[ij]; - field[ij] = f_tmp; - } - } - } - - /* Next, conjugate, and remove the holes from the array - corresponding to the DC and Nyquist frequencies (which were in - the first half already): */ - for (i = 0; i < n_other; ++i) - for (j = 1; j < n_last_new + 1; ++j) { - int ij = i*n_last_stored + j, ijnew = i*n_last_new + j-1; - field[ijnew] = field[ij]; - } - } - else /* if (rank <= 2) */ { - int i; - if (rank == 1) /* (note that 1d MPI transforms are not allowed) */ - nx = 1; /* x dimension is handled by j (last dimension) loop */ - -# ifdef HAVE_MPI - for (i = 0; i < nx; ++i) -# else - for (i = 0; 2*i <= nx; ++i) -# endif - { - int ic = (nx - i) % nx; - int jmax = n_last_new + (jmin - 1); -# ifndef HAVE_MPI - if (ic == i) - jmax = (jmax + 1) / 2; -# endif - for (j = jmin; j <= jmax; ++j) { - real f_tmp; -# ifdef HAVE_MPI - int jc = jmax + jmin - j; - int ij = j * nx + i; - int ijc = jc * nx + ic; - if (ijc < ij) - break; -# else /* ! HAVE_MPI */ - int jc = n_last_new + 1 - j; - int ij = i*n_last_stored + j; - int ijc = ic*n_last_stored + jc; -# endif /* ! HAVE_MPI */ - f_tmp = field[ijc]; - field[ijc] = field[ij]; - field[ij] = f_tmp; - } - } - - /* Next, remove the holes from the array corresponding to - the DC and Nyquist frequencies (which were in the first - half already): */ - for (i = 0; i < nx; ++i) - for (j = jmin; j < n_last_new + jmin; ++j) { -# ifdef HAVE_MPI - int ij = j*nx + i, ijnew = (j-jmin)*nx + i; -# else - int ij = i*n_last_stored + j, ijnew = i*n_last_new + j-1; -# endif - field[ijnew] = field[ij]; - } - } + int i, j, jmin = 1; + int rank, n_other, n_last, n_last_stored, n_last_new, nx, ny, nz, nxmax; +#ifdef HAVE_MPI + int local_x_start; +#endif + + nxmax = nx = d->nx; + ny = d->ny; + nz = d->nz; + n_other = d->other_dims; + n_last = d->last_dim; + n_last_stored = d->last_dim_size / 2; + n_last_new = n_last - n_last_stored; /* < n_last_stored always */ + rank = (nz == 1) ? (ny == 1 ? 1 : 2) : 3; + +#ifdef HAVE_MPI + local_x_start = d->local_y_start; + CHECK(rank == 2 || rank == 3, "unsupported rfftwnd_mpi rank!"); + if (rank == 2) { + n_other = nx; + n_last_new = ny = d->local_ny; + if (local_x_start == 0) + --n_last_new; /* DC frequency should not be in other half */ + else + jmin = 0; + if (local_x_start + ny == n_last_stored && n_last % 2 == 0) + --n_last_new; /* Nyquist freq. should not be in other half */ + n_last_stored = ny; + } + else { /* rank == 3 */ + ny = nx; + nx = d->local_ny; + nxmax = local_x_start ? nx - 1 : nx; + n_other = nx * ny; + } +#endif /* HAVE_MPI */ + + /* First, swap the order of elements and multiply by exp(ikR) + phase factors. We have to be careful here not to double-swap + any element pair; this is prevented by never swapping with a + "conjugated" point that is earlier in the array. */ + + if (rank == 3) { + int ix, iy; + for (ix = 0; 2 * ix <= nxmax; ++ix) { + int ixc; +#ifdef HAVE_MPI + if (local_x_start == 0) + ixc = (nx - ix) % nx; + else + ixc = nx - 1 - ix; +#else + ixc = (nx - ix) % nx; +#endif + for (iy = 0; iy < ny; ++iy) { + int i = ix * ny + iy, ic = ixc * ny + (ny - iy) % ny, jmax; + if (ic < i) continue; + jmax = n_last_new; + if (ic == i) jmax = (jmax + 1) / 2; + for (j = 1; j <= jmax; ++j) { + int jc = n_last_new + 1 - j; + int ij = i * n_last_stored + j; + int ijc = ic * n_last_stored + jc; + real f_tmp; + f_tmp = field[ijc]; + field[ijc] = field[ij]; + field[ij] = f_tmp; + } + } + } + + /* Next, conjugate, and remove the holes from the array + corresponding to the DC and Nyquist frequencies (which were in + the first half already): */ + for (i = 0; i < n_other; ++i) + for (j = 1; j < n_last_new + 1; ++j) { + int ij = i * n_last_stored + j, ijnew = i * n_last_new + j - 1; + field[ijnew] = field[ij]; + } + } + else /* if (rank <= 2) */ { + int i; + if (rank == 1) /* (note that 1d MPI transforms are not allowed) */ + nx = 1; /* x dimension is handled by j (last dimension) loop */ + +#ifdef HAVE_MPI + for (i = 0; i < nx; ++i) +#else + for (i = 0; 2 * i <= nx; ++i) +#endif + { + int ic = (nx - i) % nx; + int jmax = n_last_new + (jmin - 1); +#ifndef HAVE_MPI + if (ic == i) jmax = (jmax + 1) / 2; +#endif + for (j = jmin; j <= jmax; ++j) { + real f_tmp; +#ifdef HAVE_MPI + int jc = jmax + jmin - j; + int ij = j * nx + i; + int ijc = jc * nx + ic; + if (ijc < ij) break; +#else /* ! HAVE_MPI */ + int jc = n_last_new + 1 - j; + int ij = i * n_last_stored + j; + int ijc = ic * n_last_stored + jc; +#endif /* ! HAVE_MPI */ + f_tmp = field[ijc]; + field[ijc] = field[ij]; + field[ij] = f_tmp; + } + } + + /* Next, remove the holes from the array corresponding to + the DC and Nyquist frequencies (which were in the first + half already): */ + for (i = 0; i < nx; ++i) + for (j = jmin; j < n_last_new + jmin; ++j) { +#ifdef HAVE_MPI + int ij = j * nx + i, ijnew = (j - jmin) * nx + i; +#else + int ij = i * n_last_stored + j, ijnew = i * n_last_new + j - 1; +#endif + field[ijnew] = field[ij]; + } + } #endif /* ! SCALAR_COMPLEX */ } /**************************************************************************/ -#define MIN2(a,b) ((a) < (b) ? (a) : (b)) +#define MIN2(a, b) ((a) < (b) ? (a) : (b)) /* Compute Xout = 1/mu curl(1/epsilon * curl(Xin)) 1/mu */ -void maxwell_operator(evectmatrix Xin, evectmatrix Xout, void *data, - int is_current_eigenvector, evectmatrix Work) -{ - maxwell_data *d = (maxwell_data *) data; - int cur_band_start; - scalar_complex *cdata; - real scale; - - CHECK(d, "null maxwell data pointer!"); - CHECK(Xin.c == 2, "fields don't have 2 components!"); - - (void) is_current_eigenvector; /* unused */ - (void) Work; - - cdata = (scalar_complex *) d->fft_data; - scale = -1.0 / Xout.N; /* scale factor to normalize FFT; - negative sign comes from 2 i's from curls */ - - /* compute the operator, num_fft_bands at a time: */ - for (cur_band_start = 0; cur_band_start < Xin.p; - cur_band_start += d->num_fft_bands) { - int cur_num_bands = MIN2(d->num_fft_bands, Xin.p - cur_band_start); - - if (d->mu_inv == NULL) - maxwell_compute_d_from_H(d, Xin, cdata, - cur_band_start, cur_num_bands); - else { - maxwell_compute_H_from_B(d, Xin, Xout, cdata, - cur_band_start, cur_band_start, - cur_num_bands); - maxwell_compute_d_from_H(d, Xout, cdata, - cur_band_start, cur_num_bands); - } - maxwell_compute_e_from_d(d, cdata, cur_num_bands); - maxwell_compute_H_from_e(d, Xout, cdata, - cur_band_start, cur_num_bands, scale); - maxwell_compute_H_from_B(d, Xout, Xout, cdata, - cur_band_start, cur_band_start, - cur_num_bands); - } +void maxwell_operator(evectmatrix Xin, evectmatrix Xout, void *data, int is_current_eigenvector, + evectmatrix Work) { + maxwell_data *d = (maxwell_data *)data; + int cur_band_start; + scalar_complex *cdata; + real scale; + + CHECK(d, "null maxwell data pointer!"); + CHECK(Xin.c == 2, "fields don't have 2 components!"); + + (void)is_current_eigenvector; /* unused */ + (void)Work; + + cdata = (scalar_complex *)d->fft_data; + scale = -1.0 / Xout.N; /* scale factor to normalize FFT; + negative sign comes from 2 i's from curls */ + + /* compute the operator, num_fft_bands at a time: */ + for (cur_band_start = 0; cur_band_start < Xin.p; cur_band_start += d->num_fft_bands) { + int cur_num_bands = MIN2(d->num_fft_bands, Xin.p - cur_band_start); + + if (d->mu_inv == NULL) + maxwell_compute_d_from_H(d, Xin, cdata, cur_band_start, cur_num_bands); + else { + maxwell_compute_H_from_B(d, Xin, Xout, cdata, cur_band_start, cur_band_start, cur_num_bands); + maxwell_compute_d_from_H(d, Xout, cdata, cur_band_start, cur_num_bands); + } + maxwell_compute_e_from_d(d, cdata, cur_num_bands); + maxwell_compute_H_from_e(d, Xout, cdata, cur_band_start, cur_num_bands, scale); + maxwell_compute_H_from_B(d, Xout, Xout, cdata, cur_band_start, cur_band_start, cur_num_bands); + } } void maxwell_muinv_operator(evectmatrix Xin, evectmatrix Xout, void *data, - int is_current_eigenvector, evectmatrix Work) -{ - maxwell_data *d = (maxwell_data *) data; - int cur_band_start; - scalar_complex *cdata; - - CHECK(d, "null maxwell data pointer!"); - CHECK(Xin.c == 2, "fields don't have 2 components!"); - - (void) is_current_eigenvector; /* unused */ - (void) Work; - - cdata = (scalar_complex *) d->fft_data; - - /* compute the operator, num_fft_bands at a time: */ - for (cur_band_start = 0; cur_band_start < Xout.p; - cur_band_start += d->num_fft_bands) { - int cur_num_bands = MIN2(d->num_fft_bands, Xout.p - cur_band_start); - maxwell_compute_H_from_B(d, Xin, Xout, cdata, - cur_band_start, cur_band_start, - cur_num_bands); - } + int is_current_eigenvector, evectmatrix Work) { + maxwell_data *d = (maxwell_data *)data; + int cur_band_start; + scalar_complex *cdata; + + CHECK(d, "null maxwell data pointer!"); + CHECK(Xin.c == 2, "fields don't have 2 components!"); + + (void)is_current_eigenvector; /* unused */ + (void)Work; + + cdata = (scalar_complex *)d->fft_data; + + /* compute the operator, num_fft_bands at a time: */ + for (cur_band_start = 0; cur_band_start < Xout.p; cur_band_start += d->num_fft_bands) { + int cur_num_bands = MIN2(d->num_fft_bands, Xout.p - cur_band_start); + maxwell_compute_H_from_B(d, Xin, Xout, cdata, cur_band_start, cur_band_start, cur_num_bands); + } } /* Compute the operation Xout = (M - w^2) Xin, where M is the Maxwell @@ -1253,13 +1187,12 @@ void maxwell_muinv_operator(evectmatrix Xin, evectmatrix Xout, void *data, operator is not positive-definite), and the smallest eigenvectors (not taking the absolute value) are the same as those of M. */ void maxwell_target_operator1(evectmatrix Xin, evectmatrix Xout, void *data, - int is_current_eigenvector, evectmatrix Work) -{ - maxwell_target_data *d = (maxwell_target_data *) data; - real omega_sqr = d->target_frequency * d->target_frequency; + int is_current_eigenvector, evectmatrix Work) { + maxwell_target_data *d = (maxwell_target_data *)data; + real omega_sqr = d->target_frequency * d->target_frequency; - maxwell_operator(Xin, Xout, d->d, is_current_eigenvector, Work); - evectmatrix_aXpbY(1.0, Xout, -omega_sqr, Xin); + maxwell_operator(Xin, Xout, d->d, is_current_eigenvector, Work); + evectmatrix_aXpbY(1.0, Xout, -omega_sqr, Xin); } /* Compute the operation Xout = (M - w^2)^2 Xin, where M is the @@ -1267,68 +1200,58 @@ void maxwell_target_operator1(evectmatrix Xin, evectmatrix Xout, void *data, eigenvalue spectrum so that the smallest eigenvalues are those nearest to w. */ void maxwell_target_operator(evectmatrix Xin, evectmatrix Xout, void *data, - int is_current_eigenvector, evectmatrix Work) -{ - if (Xin.n != 0) - CHECK(Work.data && Work.data != Xin.data && Work.data != Xout.data, - "maxwell_target_operator must have distinct workspace!"); - - maxwell_target_operator1(Xin, Work, data, is_current_eigenvector, Xout); - - /* N.B. maxwell_operator(), and thus maxwell_target_operator1(), - doesn't actually need the workspace, so we can safely pass - Work here for the scratch parameter: */ - maxwell_target_operator1(Work, Xout, data, is_current_eigenvector, Work); + int is_current_eigenvector, evectmatrix Work) { + if (Xin.n != 0) + CHECK(Work.data && Work.data != Xin.data && Work.data != Xout.data, + "maxwell_target_operator must have distinct workspace!"); + + maxwell_target_operator1(Xin, Work, data, is_current_eigenvector, Xout); + + /* N.B. maxwell_operator(), and thus maxwell_target_operator1(), + doesn't actually need the workspace, so we can safely pass + Work here for the scratch parameter: */ + maxwell_target_operator1(Work, Xout, data, is_current_eigenvector, Work); } -/* Compute the operation Xout = curl 1/epsilon * i u x Xin, which +/* Compute the operation Xout = curl 1/epsilon * i u x Xin, which is useful operation in computing the group velocity (derivative of the maxwell operator). u is a vector in cartesian coordinates. */ -void maxwell_ucross_op(evectmatrix Xin, evectmatrix Xout, - maxwell_data *d, const real u[3]) -{ - scalar *fft_data, *fft_data_in; - scalar_complex *cdata; - real scale; - int cur_band_start; - int i, j, b; - - CHECK(d, "null maxwell data pointer!"); - CHECK(Xin.c == 2, "fields don't have 2 components!"); - - cdata = (scalar_complex *) (fft_data = d->fft_data); - fft_data_in = d->fft_data2; - - scale = -1.0 / Xout.N; /* scale factor to normalize FFT; - negative sign comes from 2 i's from curls */ - - /* compute the operator, num_fft_bands at a time: */ - for (cur_band_start = 0; cur_band_start < Xin.p; - cur_band_start += d->num_fft_bands) { - int cur_num_bands = MIN2(d->num_fft_bands, Xin.p - cur_band_start); - - /* first, compute fft_data = u x Xin: */ - for (i = 0; i < d->other_dims; ++i) - for (j = 0; j < d->last_dim; ++j) { - int ij = i * d->last_dim + j; - int ij2 = i * d->last_dim_size + j; - k_data cur_k = d->k_plus_G[ij]; - - for (b = 0; b < cur_num_bands; ++b) - assign_ucross_t2c(&fft_data_in[3 * (ij2*cur_num_bands - + b)], - u, cur_k, - &Xin.data[ij * 2 * Xin.p + - b + cur_band_start], - Xin.p); - } - - /* now, convert to position space via FFT: */ - maxwell_compute_fft(+1, d, fft_data_in, fft_data, - cur_num_bands*3, cur_num_bands*3, 1); - - maxwell_compute_e_from_d(d, cdata, cur_num_bands); - maxwell_compute_H_from_e(d, Xout, cdata, - cur_band_start, cur_num_bands, scale); - } +void maxwell_ucross_op(evectmatrix Xin, evectmatrix Xout, maxwell_data *d, const real u[3]) { + scalar *fft_data, *fft_data_in; + scalar_complex *cdata; + real scale; + int cur_band_start; + int i, j, b; + + CHECK(d, "null maxwell data pointer!"); + CHECK(Xin.c == 2, "fields don't have 2 components!"); + + cdata = (scalar_complex *)(fft_data = d->fft_data); + fft_data_in = d->fft_data2; + + scale = -1.0 / Xout.N; /* scale factor to normalize FFT; + negative sign comes from 2 i's from curls */ + + /* compute the operator, num_fft_bands at a time: */ + for (cur_band_start = 0; cur_band_start < Xin.p; cur_band_start += d->num_fft_bands) { + int cur_num_bands = MIN2(d->num_fft_bands, Xin.p - cur_band_start); + + /* first, compute fft_data = u x Xin: */ + for (i = 0; i < d->other_dims; ++i) + for (j = 0; j < d->last_dim; ++j) { + int ij = i * d->last_dim + j; + int ij2 = i * d->last_dim_size + j; + k_data cur_k = d->k_plus_G[ij]; + + for (b = 0; b < cur_num_bands; ++b) + assign_ucross_t2c(&fft_data_in[3 * (ij2 * cur_num_bands + b)], u, cur_k, + &Xin.data[ij * 2 * Xin.p + b + cur_band_start], Xin.p); + } + + /* now, convert to position space via FFT: */ + maxwell_compute_fft(+1, d, fft_data_in, fft_data, cur_num_bands * 3, cur_num_bands * 3, 1); + + maxwell_compute_e_from_d(d, cdata, cur_num_bands); + maxwell_compute_H_from_e(d, Xout, cdata, cur_band_start, cur_num_bands, scale); + } } diff --git a/src/maxwell/maxwell_pre.c b/src/maxwell/maxwell_pre.c index 8eddd96e..27589f51 100644 --- a/src/maxwell/maxwell_pre.c +++ b/src/maxwell/maxwell_pre.c @@ -28,109 +28,96 @@ #define PRECOND_SUBTR_EIGS 0 #define PRECOND_MIN_DENOM 5e-3 -#define MIN2(a,b) ((a) < (b) ? (a) : (b)) -#define MAX2(a,b) ((a) > (b) ? (a) : (b)) +#define MIN2(a, b) ((a) < (b) ? (a) : (b)) +#define MAX2(a, b) ((a) > (b) ? (a) : (b)) /* used to be: MAX2(x, PRECOND_MIN_DENOM) but now zero k-point is handled specially */ #define FIX_DENOM(x) ((x) == 0 ? 1.0 : (x)) -void maxwell_simple_precondition(evectmatrix X, void *data, real *eigenvals) -{ - maxwell_data *d = (maxwell_data *) data; - int i, c, b; - real *kpGn2 = d->k_plus_G_normsqr; +void maxwell_simple_precondition(evectmatrix X, void *data, real *eigenvals) { + maxwell_data *d = (maxwell_data *)data; + int i, c, b; + real *kpGn2 = d->k_plus_G_normsqr; #if !PRECOND_SUBTR_EIGS - (void) eigenvals; /* unused */ + (void)eigenvals; /* unused */ #endif - for (i = 0; i < X.localN; ++i) { - for (c = 0; c < X.c; ++c) { - for (b = 0; b < X.p; ++b) { - int index = (i * X.c + c) * X.p + b; - real scale = kpGn2[i] * d->eps_inv_mean; + for (i = 0; i < X.localN; ++i) { + for (c = 0; c < X.c; ++c) { + for (b = 0; b < X.p; ++b) { + int index = (i * X.c + c) * X.p + b; + real scale = kpGn2[i] * d->eps_inv_mean; #if PRECOND_SUBTR_EIGS - if (eigenvals) { - scale -= eigenvals[b]; - scale = 1.0 / (scale + copysign(PRECOND_MIN_DENOM, - scale)); - } - else + if (eigenvals) { + scale -= eigenvals[b]; + scale = 1.0 / (scale + copysign(PRECOND_MIN_DENOM, scale)); + } + else #else - { - scale = 1.0 / FIX_DENOM(scale); - } + { scale = 1.0 / FIX_DENOM(scale); } #endif - ASSIGN_SCALAR(X.data[index], - scale * SCALAR_RE(X.data[index]), - scale * SCALAR_IM(X.data[index])); - } - } - } + ASSIGN_SCALAR(X.data[index], scale * SCALAR_RE(X.data[index]), + scale * SCALAR_IM(X.data[index])); + } + } + } } -void maxwell_preconditioner(evectmatrix Xin, evectmatrix Xout, void *data, - evectmatrix Y, real *eigenvals, sqmatrix YtY) -{ - (void) Y; /* unused */ - evectmatrix_XeYS(Xout, Xin, YtY, 1); - maxwell_simple_precondition(Xout, data, eigenvals); +void maxwell_preconditioner(evectmatrix Xin, evectmatrix Xout, void *data, evectmatrix Y, + real *eigenvals, sqmatrix YtY) { + (void)Y; /* unused */ + evectmatrix_XeYS(Xout, Xin, YtY, 1); + maxwell_simple_precondition(Xout, data, eigenvals); } -void maxwell_target_preconditioner(evectmatrix Xin, evectmatrix Xout, - void *data, - evectmatrix Y, real *eigenvals, - sqmatrix YtY) -{ - maxwell_target_data *td = (maxwell_target_data *) data; - maxwell_data *d = td->d; +void maxwell_target_preconditioner(evectmatrix Xin, evectmatrix Xout, void *data, evectmatrix Y, + real *eigenvals, sqmatrix YtY) { + maxwell_target_data *td = (maxwell_target_data *)data; + maxwell_data *d = td->d; #if PRECOND_SUBTR_EIGS - real omega_sqr = td->target_frequency * td->target_frequency; + real omega_sqr = td->target_frequency * td->target_frequency; #endif - int i, c, b; - real *kpGn2 = d->k_plus_G_normsqr; + int i, c, b; + real *kpGn2 = d->k_plus_G_normsqr; - (void) Y; /* unused */ + (void)Y; /* unused */ #if !PRECOND_SUBTR_EIGS - (void) eigenvals; /* unused */ + (void)eigenvals; /* unused */ #endif - evectmatrix_XeYS(Xout, Xin, YtY, 1); + evectmatrix_XeYS(Xout, Xin, YtY, 1); - for (i = 0; i < Xout.localN; ++i) { - for (c = 0; c < Xout.c; ++c) { - for (b = 0; b < Xout.p; ++b) { - int index = (i * Xout.c + c) * Xout.p + b; - real scale = kpGn2[i] * d->eps_inv_mean; + for (i = 0; i < Xout.localN; ++i) { + for (c = 0; c < Xout.c; ++c) { + for (b = 0; b < Xout.p; ++b) { + int index = (i * Xout.c + c) * Xout.p + b; + real scale = kpGn2[i] * d->eps_inv_mean; #if PRECOND_SUBTR_EIGS - scale -= omega_sqr; + scale -= omega_sqr; #endif - scale = scale * scale; + scale = scale * scale; #if PRECOND_SUBTR_EIGS - if (eigenvals) { - scale -= eigenvals[b]; - scale = 1.0 / (scale + copysign(PRECOND_MIN_DENOM, - scale)); - } - else + if (eigenvals) { + scale -= eigenvals[b]; + scale = 1.0 / (scale + copysign(PRECOND_MIN_DENOM, scale)); + } + else #else - { - scale = 1.0 / FIX_DENOM(scale); - } + { scale = 1.0 / FIX_DENOM(scale); } #endif - ASSIGN_SCALAR(Xout.data[index], - scale * SCALAR_RE(Xout.data[index]), - scale * SCALAR_IM(Xout.data[index])); - } - } - } + ASSIGN_SCALAR(Xout.data[index], scale * SCALAR_RE(Xout.data[index]), + scale * SCALAR_IM(Xout.data[index])); + } + } + } } /**************************************************************************/ @@ -141,181 +128,150 @@ void maxwell_target_preconditioner(evectmatrix Xin, evectmatrix Xout, Here, a = (a[0],a[1],a[2]) and k = (k.kx,k.ky,k.kz) are in cartesian coordinates. (v[0],v[vstride]) is in the transverse basis of - k.m and k.n. + k.m and k.n. We can't compute 'a' exactly, since there is no way to find the component of a parallel to k. So, we only compute the transverse component of 'a'--this is the main approximation in our preconditioner. */ -static void assign_crossinv_t2c(scalar *a, const k_data k, - const scalar *v, int vstride) -{ - /* k x v = k x (k x a) = (k*a)k - k^2 a - = -(a_transverse) * k^2 - ==> a_transverse = -1/k^2 * k x v */ - - /* Thus, we just do the same thing as assign_cross_t2c - in maxwell_op.c, except that we divide by -k^2: */ - - scalar v0 = v[0], v1 = v[vstride]; - real kmag_inv = -1.0 / FIX_DENOM(k.kmag); - - ASSIGN_SCALAR(a[0], - (SCALAR_RE(v0)*k.nx - SCALAR_RE(v1)*k.mx) * kmag_inv, - (SCALAR_IM(v0)*k.nx - SCALAR_IM(v1)*k.mx) * kmag_inv); - ASSIGN_SCALAR(a[1], - (SCALAR_RE(v0)*k.ny - SCALAR_RE(v1)*k.my) * kmag_inv, - (SCALAR_IM(v0)*k.ny - SCALAR_IM(v1)*k.my) * kmag_inv); - ASSIGN_SCALAR(a[2], - (SCALAR_RE(v0)*k.nz - SCALAR_RE(v1)*k.mz) * kmag_inv, - (SCALAR_IM(v0)*k.nz - SCALAR_IM(v1)*k.mz) * kmag_inv); +static void assign_crossinv_t2c(scalar *a, const k_data k, const scalar *v, int vstride) { + /* k x v = k x (k x a) = (k*a)k - k^2 a + = -(a_transverse) * k^2 + ==> a_transverse = -1/k^2 * k x v */ + + /* Thus, we just do the same thing as assign_cross_t2c + in maxwell_op.c, except that we divide by -k^2: */ + + scalar v0 = v[0], v1 = v[vstride]; + real kmag_inv = -1.0 / FIX_DENOM(k.kmag); + + ASSIGN_SCALAR(a[0], (SCALAR_RE(v0) * k.nx - SCALAR_RE(v1) * k.mx) * kmag_inv, + (SCALAR_IM(v0) * k.nx - SCALAR_IM(v1) * k.mx) * kmag_inv); + ASSIGN_SCALAR(a[1], (SCALAR_RE(v0) * k.ny - SCALAR_RE(v1) * k.my) * kmag_inv, + (SCALAR_IM(v0) * k.ny - SCALAR_IM(v1) * k.my) * kmag_inv); + ASSIGN_SCALAR(a[2], (SCALAR_RE(v0) * k.nz - SCALAR_RE(v1) * k.mz) * kmag_inv, + (SCALAR_IM(v0) * k.nz - SCALAR_IM(v1) * k.mz) * kmag_inv); } /* Compute 'v' * scale, where a = k x v, going from cartesian to transverse coordinates. Since v is tranvserse to k, we can compute this inverse exactly. */ -static void assign_crossinv_c2t(scalar *v, int vstride, - const k_data k, const scalar *a, - real scale) -{ - /* As in assign_crossinv_t2c above, we find: - - v = v_transverse = -1/k^2 * k x a - - So, we do the same thing as in assign_cross_c2t of maxwell_op.c, - with the additional -1/k^2 factor. */ - - scalar a0 = a[0], a1 = a[1], a2 = a[2]; - scalar at0, at1; - - ASSIGN_SCALAR(at0, - SCALAR_RE(a0)*k.mx + SCALAR_RE(a1)*k.my + SCALAR_RE(a2)*k.mz, - SCALAR_IM(a0)*k.mx + SCALAR_IM(a1)*k.my + SCALAR_IM(a2)*k.mz); - ASSIGN_SCALAR(at1, - SCALAR_RE(a0)*k.nx + SCALAR_RE(a1)*k.ny + SCALAR_RE(a2)*k.nz, - SCALAR_IM(a0)*k.nx + SCALAR_IM(a1)*k.ny + SCALAR_IM(a2)*k.nz); - - /* combine scale factor and k * (-1/k^2) */ - scale = -scale / FIX_DENOM(k.kmag); - - ASSIGN_SCALAR(v[0], - - scale * SCALAR_RE(at1), - - scale * SCALAR_IM(at1)); - ASSIGN_SCALAR(v[vstride], - scale * SCALAR_RE(at0), - scale * SCALAR_IM(at0)); +static void assign_crossinv_c2t(scalar *v, int vstride, const k_data k, const scalar *a, + real scale) { + /* As in assign_crossinv_t2c above, we find: + + v = v_transverse = -1/k^2 * k x a + + So, we do the same thing as in assign_cross_c2t of maxwell_op.c, + with the additional -1/k^2 factor. */ + + scalar a0 = a[0], a1 = a[1], a2 = a[2]; + scalar at0, at1; + + ASSIGN_SCALAR(at0, SCALAR_RE(a0) * k.mx + SCALAR_RE(a1) * k.my + SCALAR_RE(a2) * k.mz, + SCALAR_IM(a0) * k.mx + SCALAR_IM(a1) * k.my + SCALAR_IM(a2) * k.mz); + ASSIGN_SCALAR(at1, SCALAR_RE(a0) * k.nx + SCALAR_RE(a1) * k.ny + SCALAR_RE(a2) * k.nz, + SCALAR_IM(a0) * k.nx + SCALAR_IM(a1) * k.ny + SCALAR_IM(a2) * k.nz); + + /* combine scale factor and k * (-1/k^2) */ + scale = -scale / FIX_DENOM(k.kmag); + + ASSIGN_SCALAR(v[0], -scale * SCALAR_RE(at1), -scale * SCALAR_IM(at1)); + ASSIGN_SCALAR(v[vstride], scale * SCALAR_RE(at0), scale * SCALAR_IM(at0)); } /* Fancy preconditioner. This is very similar to maxwell_op, except that the steps are (approximately) inverted: */ -void maxwell_preconditioner2(evectmatrix Xin, evectmatrix Xout, void *data, - evectmatrix Y, real *eigenvals, - sqmatrix YtY) -{ - maxwell_data *d = (maxwell_data *) data; - int cur_band_start; - scalar *fft_data, *fft_data2; - scalar_complex *cdata; - real scale; - int i, j, b; - - (void) Y; /* unused */ - (void) eigenvals; /* unused */ - - CHECK(d, "null maxwell data pointer!"); - CHECK(Xin.c == 2, "fields don't have 2 components!"); - - if (Xout.data != Xin.data) - evectmatrix_XeYS(Xout, Xin, YtY, 1); - - fft_data = d->fft_data; - fft_data2 = d->fft_data2; - cdata = (scalar_complex *) fft_data; - - scale = -1.0 / Xout.N; /* scale factor to normalize FFT; - negative sign comes from 2 i's from curls */ - - for (cur_band_start = 0; cur_band_start < Xout.p; - cur_band_start += d->num_fft_bands) { - int cur_num_bands = MIN2(d->num_fft_bands, Xout.p - cur_band_start); - - /********************************************/ - /* Compute approx. inverse of curl (inverse cross product with k): */ - - for (i = 0; i < d->other_dims; ++i) - for (j = 0; j < d->last_dim; ++j) { - int ij = i * d->last_dim + j; - int ij2 = i * d->last_dim_size + j; - k_data cur_k = d->k_plus_G[ij]; - - for (b = 0; b < cur_num_bands; ++b) - assign_crossinv_t2c(&fft_data2[3 * (ij2*cur_num_bands - + b)], - cur_k, - &Xout.data[ij * 2 * Xout.p + - b + cur_band_start], - Xout.p); - } - - /********************************************/ - /* Multiply by epsilon: */ - - /* convert to position space via FFT: */ - maxwell_compute_fft(+1, d, fft_data2, fft_data, - cur_num_bands*3, cur_num_bands*3, 1); - - /* multiply by epsilon in position space. Don't bother to - invert the whole epsilon-inverse tensor; just take the - inverse of the average epsilon-inverse (= trace / 3). */ - for (i = 0; i < d->fft_output_size; ++i) { - symmetric_matrix eps_inv = d->eps_inv[i]; - real eps = 3.0 / (eps_inv.m00 + eps_inv.m11 + eps_inv.m22); - for (b = 0; b < cur_num_bands; ++b) { - int ib = 3 * (i * cur_num_bands + b); - cdata[ib].re *= eps; - cdata[ib].im *= eps; - cdata[ib+1].re *= eps; - cdata[ib+1].im *= eps; - cdata[ib+2].re *= eps; - cdata[ib+2].im *= eps; - } - } - - /* convert back to Fourier space */ - maxwell_compute_fft(-1, d, fft_data, fft_data2, - cur_num_bands*3, cur_num_bands*3, 1); - - /********************************************/ - /* Finally, do second inverse curl (inverse cross product with k): */ - - for (i = 0; i < d->other_dims; ++i) - for (j = 0; j < d->last_dim; ++j) { - int ij = i * d->last_dim + j; - int ij2 = i * d->last_dim_size + j; - k_data cur_k = d->k_plus_G[ij]; - - for (b = 0; b < cur_num_bands; ++b) - assign_crossinv_c2t(&Xout.data[ij * 2 * Xout.p + - b + cur_band_start], - Xout.p, - cur_k, - &fft_data2[3 * (ij2*cur_num_bands - + b)], - scale); - } - - /********************************************/ - - } /* end of cur_band_start loop */ +void maxwell_preconditioner2(evectmatrix Xin, evectmatrix Xout, void *data, evectmatrix Y, + real *eigenvals, sqmatrix YtY) { + maxwell_data *d = (maxwell_data *)data; + int cur_band_start; + scalar *fft_data, *fft_data2; + scalar_complex *cdata; + real scale; + int i, j, b; + + (void)Y; /* unused */ + (void)eigenvals; /* unused */ + + CHECK(d, "null maxwell data pointer!"); + CHECK(Xin.c == 2, "fields don't have 2 components!"); + + if (Xout.data != Xin.data) evectmatrix_XeYS(Xout, Xin, YtY, 1); + + fft_data = d->fft_data; + fft_data2 = d->fft_data2; + cdata = (scalar_complex *)fft_data; + + scale = -1.0 / Xout.N; /* scale factor to normalize FFT; + negative sign comes from 2 i's from curls */ + + for (cur_band_start = 0; cur_band_start < Xout.p; cur_band_start += d->num_fft_bands) { + int cur_num_bands = MIN2(d->num_fft_bands, Xout.p - cur_band_start); + + /********************************************/ + /* Compute approx. inverse of curl (inverse cross product with k): */ + + for (i = 0; i < d->other_dims; ++i) + for (j = 0; j < d->last_dim; ++j) { + int ij = i * d->last_dim + j; + int ij2 = i * d->last_dim_size + j; + k_data cur_k = d->k_plus_G[ij]; + + for (b = 0; b < cur_num_bands; ++b) + assign_crossinv_t2c(&fft_data2[3 * (ij2 * cur_num_bands + b)], cur_k, + &Xout.data[ij * 2 * Xout.p + b + cur_band_start], Xout.p); + } + + /********************************************/ + /* Multiply by epsilon: */ + + /* convert to position space via FFT: */ + maxwell_compute_fft(+1, d, fft_data2, fft_data, cur_num_bands * 3, cur_num_bands * 3, 1); + + /* multiply by epsilon in position space. Don't bother to + invert the whole epsilon-inverse tensor; just take the + inverse of the average epsilon-inverse (= trace / 3). */ + for (i = 0; i < d->fft_output_size; ++i) { + symmetric_matrix eps_inv = d->eps_inv[i]; + real eps = 3.0 / (eps_inv.m00 + eps_inv.m11 + eps_inv.m22); + for (b = 0; b < cur_num_bands; ++b) { + int ib = 3 * (i * cur_num_bands + b); + cdata[ib].re *= eps; + cdata[ib].im *= eps; + cdata[ib + 1].re *= eps; + cdata[ib + 1].im *= eps; + cdata[ib + 2].re *= eps; + cdata[ib + 2].im *= eps; + } + } + + /* convert back to Fourier space */ + maxwell_compute_fft(-1, d, fft_data, fft_data2, cur_num_bands * 3, cur_num_bands * 3, 1); + + /********************************************/ + /* Finally, do second inverse curl (inverse cross product with k): */ + + for (i = 0; i < d->other_dims; ++i) + for (j = 0; j < d->last_dim; ++j) { + int ij = i * d->last_dim + j; + int ij2 = i * d->last_dim_size + j; + k_data cur_k = d->k_plus_G[ij]; + + for (b = 0; b < cur_num_bands; ++b) + assign_crossinv_c2t(&Xout.data[ij * 2 * Xout.p + b + cur_band_start], Xout.p, cur_k, + &fft_data2[3 * (ij2 * cur_num_bands + b)], scale); + } + + /********************************************/ + + } /* end of cur_band_start loop */ } -void maxwell_target_preconditioner2(evectmatrix Xin, evectmatrix Xout, - void *data, - evectmatrix Y, real *eigenvals, - sqmatrix YtY) -{ - maxwell_target_data *d = (maxwell_target_data *) data; - maxwell_preconditioner2(Xin, Xout, d->d, Y, eigenvals, YtY); - maxwell_preconditioner2(Xout, Xout, d->d, Y, eigenvals, YtY); +void maxwell_target_preconditioner2(evectmatrix Xin, evectmatrix Xout, void *data, evectmatrix Y, + real *eigenvals, sqmatrix YtY) { + maxwell_target_data *d = (maxwell_target_data *)data; + maxwell_preconditioner2(Xin, Xout, d->d, Y, eigenvals, YtY); + maxwell_preconditioner2(Xout, Xout, d->d, Y, eigenvals, YtY); } diff --git a/src/maxwell/xyz_loop.h b/src/maxwell/xyz_loop.h index 7b28f84f..30b637b9 100644 --- a/src/maxwell/xyz_loop.h +++ b/src/maxwell/xyz_loop.h @@ -1,64 +1,79 @@ - /* This is a semi-horrifying macro to perform we have different loops over - the coordinates, depending upon whether we are using complex or real and serial or - parallel transforms. Each loop will define, in its body, - variables (i1,i2,i3) describing the global coordinate of the current - point, and xyz_index describing the corresponding index in - the array md->eps_inv[] or similar. +/* This is a semi-horrifying macro to perform we have different loops over + the coordinates, depending upon whether we are using complex or real and serial or + parallel transforms. Each loop will define, in its body, + variables (i1,i2,i3) describing the global coordinate of the current + point, and xyz_index describing the corresponding index in + the array md->eps_inv[] or similar. - We use it like this: +We use it like this: - LOOP_XYZ(md) { // {{ implied open braces (yuck!) - body... - }}} +LOOP_XYZ(md) { // {{ implied open braces (yuck!) + body... +}}} - where md is the maxwell_data pointer. - */ +where md is the maxwell_data pointer. + */ #ifdef SCALAR_COMPLEX -# ifndef HAVE_MPI - #define LOOP_XYZ(md) { \ - int n1 = md->nx, n2 = md->ny, n3 = md->nz, i1, i2, i3; \ - for (i1 = 0; i1 < n1; ++i1) \ - for (i2 = 0; i2 < n2; ++i2) \ - for (i3 = 0; i3 < n3; ++i3) { \ - int xyz_index = ((i1 * n2 + i2) * n3 + i3); -# else /* HAVE_MPI */ - /* first two dimensions are transposed in MPI output: */ - #define LOOP_XYZ(md) { \ - int n1 = md->nx, n2 = md->ny, n3 = md->nz, i1, i2_, i3; \ - int local_n2 = md->local_ny, local_y_start = md->local_y_start; \ - for (i2_ = 0; i2_ < local_n2; ++i2_) \ - for (i1 = 0; i1 < n1; ++i1) \ - for (i3 = 0; i3 < n3; ++i3) { \ - int i2 = i2_ + local_y_start; \ - int xyz_index = ((i2_ * n1 + i1) * n3 + i3); -# endif /* HAVE_MPI */ -#else /* not SCALAR_COMPLEX */ -# ifndef HAVE_MPI - #define LOOP_XYZ(md) { \ - int n1 = md->nx, n2 = md->ny, n3 = md->nz, i1_, i2_, i1, i2, i3; \ - int n_other = md->other_dims; \ - int n_last = md->last_dim_size / 2; \ - int rank = (n3 == 1) ? (n2 == 1 ? 1 : 2) : 3; \ - for (i1_ = 0; i1_ < n_other; ++i1_) \ - for (i2_ = 0; i2_ < n_last; ++i2_) { \ - int xyz_index = i1_ * n_last + i2_; \ - switch (rank) { \ - case 2: i1 = i1_; i2 = i2_; i3 = 0; break; \ - case 3: i1 = i1_ / n2; i2 = i1_ % n2; i3 = i2_; break; \ - default: i1 = i2_; i2 = i3 = 0; break; \ - } +#ifndef HAVE_MPI +#define LOOP_XYZ(md) \ + { \ + int n1 = md->nx, n2 = md->ny, n3 = md->nz, i1, i2, i3; \ + for (i1 = 0; i1 < n1; ++i1) \ + for (i2 = 0; i2 < n2; ++i2) \ + for (i3 = 0; i3 < n3; ++i3) { \ + int xyz_index = ((i1 * n2 + i2) * n3 + i3); +#else /* HAVE_MPI */ +/* first two dimensions are transposed in MPI output: */ +#define LOOP_XYZ(md) \ + { \ + int n1 = md->nx, n2 = md->ny, n3 = md->nz, i1, i2_, i3; \ + int local_n2 = md->local_ny, local_y_start = md->local_y_start; \ + for (i2_ = 0; i2_ < local_n2; ++i2_) \ + for (i1 = 0; i1 < n1; ++i1) \ + for (i3 = 0; i3 < n3; ++i3) { \ + int i2 = i2_ + local_y_start; \ + int xyz_index = ((i2_ * n1 + i1) * n3 + i3); +#endif /* HAVE_MPI */ +#else /* not SCALAR_COMPLEX */ +#ifndef HAVE_MPI +#define LOOP_XYZ(md) \ + { \ + int n1 = md->nx, n2 = md->ny, n3 = md->nz, i1_, i2_, i1, i2, i3; \ + int n_other = md->other_dims; \ + int n_last = md->last_dim_size / 2; \ + int rank = (n3 == 1) ? (n2 == 1 ? 1 : 2) : 3; \ + for (i1_ = 0; i1_ < n_other; ++i1_) \ + for (i2_ = 0; i2_ < n_last; ++i2_) { \ + int xyz_index = i1_ * n_last + i2_; \ + switch (rank) { \ + case 2: \ + i1 = i1_; \ + i2 = i2_; \ + i3 = 0; \ + break; \ + case 3: \ + i1 = i1_ / n2; \ + i2 = i1_ % n2; \ + i3 = i2_; \ + break; \ + default: \ + i1 = i2_; \ + i2 = i3 = 0; \ + break; \ + } -# else /* HAVE_MPI */ - #define LOOP_XYZ(md) { \ - int n1 = md->nx, n2 = md->ny, n3 = md->nz, i1, i2_, i3; \ - int local_n2 = md->local_ny, local_y_start = md->local_y_start; \ - int local_n3 = n3 > 1 ? md->last_dim_size / 2 : 1; \ - for (i2_ = 0; i2_ < local_n2; ++i2_) \ - for (i1 = 0; i1 < n1; ++i1) \ - for (i3 = 0; i3 < local_n3; ++i3) { \ - int i2 = i2_ + local_y_start; \ - int xyz_index = ((i2_ * n1 + i1) * local_n3 + i3); -# endif /* HAVE_MPI */ +#else /* HAVE_MPI */ +#define LOOP_XYZ(md) \ + { \ + int n1 = md->nx, n2 = md->ny, n3 = md->nz, i1, i2_, i3; \ + int local_n2 = md->local_ny, local_y_start = md->local_y_start; \ + int local_n3 = n3 > 1 ? md->last_dim_size / 2 : 1; \ + for (i2_ = 0; i2_ < local_n2; ++i2_) \ + for (i1 = 0; i1 < n1; ++i1) \ + for (i3 = 0; i3 < local_n3; ++i3) { \ + int i2 = i2_ + local_y_start; \ + int xyz_index = ((i2_ * n1 + i1) * local_n3 + i3); +#endif /* HAVE_MPI */ #endif /* not SCALAR_COMPLEX */ diff --git a/src/util/check.h b/src/util/check.h index 6cf4d29f..d0547737 100644 --- a/src/util/check.h +++ b/src/util/check.h @@ -33,21 +33,21 @@ extern void check_breakpoint(void); #define CHECK_BREAKPOINT /* nothing */ #endif -#define CHECK(condition, message) do { \ - if (!(condition)) { \ - CHECK_BREAKPOINT; \ - mpi_die("CHECK failure on line %d of " __FILE__ ": " \ - message "\n", __LINE__); \ - } \ -} while (0) +#define CHECK(condition, message) \ + do { \ + if (!(condition)) { \ + CHECK_BREAKPOINT; \ + mpi_die("CHECK failure on line %d of " __FILE__ ": " message "\n", __LINE__); \ + } \ + } while (0) #endif /* not CHECK_DISABLE */ #ifdef DEBUG_MALLOC extern void *debug_malloc(size_t n); extern void debug_free(void *p); -# define malloc debug_malloc -# define free debug_free +#define malloc debug_malloc +#define free debug_free #endif /* Macro to check whether a floating-point number contains a ridiculous @@ -57,10 +57,11 @@ extern void debug_free(void *p); extern void debug_output_malloc_count(void); extern void debug_check_memory_leaks(void); -#define CHK_MALLOC(p, t, n) { \ - size_t CHK_MALLOC_n_tmp = (n); \ - (p) = (t *) malloc(sizeof(t) * CHK_MALLOC_n_tmp); \ - CHECK((p) || CHK_MALLOC_n_tmp == 0, "out of memory!"); \ -} +#define CHK_MALLOC(p, t, n) \ + { \ + size_t CHK_MALLOC_n_tmp = (n); \ + (p) = (t *)malloc(sizeof(t) * CHK_MALLOC_n_tmp); \ + CHECK((p) || CHK_MALLOC_n_tmp == 0, "out of memory!"); \ + } #endif /* CHECK_H */ diff --git a/src/util/debug_malloc.c b/src/util/debug_malloc.c index 7c872e14..80a16ebe 100644 --- a/src/util/debug_malloc.c +++ b/src/util/debug_malloc.c @@ -32,7 +32,7 @@ #include "mpi_utils.h" #include "check.h" -void* (*my_malloc_hook) (size_t) = 0; +void *(*my_malloc_hook)(size_t) = 0; /********************************************************** * DEBUGGING CODE @@ -40,18 +40,17 @@ void* (*my_malloc_hook) (size_t) = 0; #ifdef DEBUG -void check_breakpoint(void) -{ - /* this function is only here so that we can drop into a breakpoint - in a debugger when a CHECK macro fails... */ +void check_breakpoint(void) { + /* this function is only here so that we can drop into a breakpoint + in a debugger when a CHECK macro fails... */ } #endif #ifdef DEBUG_MALLOC -# undef malloc -# undef free +#undef malloc +#undef free /* * debugging malloc/free. Initialize every malloced and freed area to @@ -65,7 +64,7 @@ static int debug_malloc_cnt = 0; static int debug_malloc_total = 0; #define MAGIC 0xABadCafe -#define MMAGIC (((int) MAGIC) < 0 ? ((int) MAGIC) : -((int) MAGIC)) +#define MMAGIC (((int)MAGIC) < 0 ? ((int)MAGIC) : -((int)MAGIC)) #define PAD_FACTOR 2 #define TWOINTS (2 * sizeof(int)) @@ -77,93 +76,86 @@ static int debug_malloc_total = 0; #define WHEN_VERBOSE(a) #endif -void *debug_malloc(size_t n) -{ - char *p; - int i; +void *debug_malloc(size_t n) { + char *p; + int i; - WHEN_VERBOSE(mpi_one_fprintf(stderr,"DEBUG_MALLOC %d\n", n)); + WHEN_VERBOSE(mpi_one_fprintf(stderr, "DEBUG_MALLOC %d\n", n)); - if (n == 0) - mpi_one_fprintf(stderr, "(Allocating a block of zero size.)\n"); + if (n == 0) mpi_one_fprintf(stderr, "(Allocating a block of zero size.)\n"); - debug_malloc_total += n; + debug_malloc_total += n; - p = (char *) malloc(PAD_FACTOR * n + TWOINTS); - CHECK(p, "debug_malloc: out of memory\n"); + p = (char *)malloc(PAD_FACTOR * n + TWOINTS); + CHECK(p, "debug_malloc: out of memory\n"); - /* store the size in a known position */ - ((int *) p)[0] = n; - ((int *) p)[1] = MAGIC; - for (i = 0; i < PAD_FACTOR * n; ++i) - p[i + TWOINTS] = (char) (i ^ 0xDEADBEEF); + /* store the size in a known position */ + ((int *)p)[0] = n; + ((int *)p)[1] = MAGIC; + for (i = 0; i < PAD_FACTOR * n; ++i) + p[i + TWOINTS] = (char)(i ^ 0xDEADBEEF); - ++debug_malloc_cnt; + ++debug_malloc_cnt; - /* skip the size we stored previously */ - return (void *) (p + TWOINTS); + /* skip the size we stored previously */ + return (void *)(p + TWOINTS); } -void debug_free(void *p) -{ - char *q = ((char *) p) - TWOINTS; +void debug_free(void *p) { + char *q = ((char *)p) - TWOINTS; - CHECK(p, "debug_free: tried to free NULL pointer!\n"); - CHECK(q, "debug_free: tried to free NULL+TWOINTS pointer!\n"); + CHECK(p, "debug_free: tried to free NULL pointer!\n"); + CHECK(q, "debug_free: tried to free NULL+TWOINTS pointer!\n"); - { - int n = ((int *) q)[0]; - int magic = ((int *) q)[1]; - int i; + { + int n = ((int *)q)[0]; + int magic = ((int *)q)[1]; + int i; - WHEN_VERBOSE(mpi_one_fprintf(stderr,"DEBUG_FREE %d\n", n)); - CHECK(n != MMAGIC, "Tried to free a freed pointer!\n"); - *((int *) q) = MMAGIC; /* to detect duplicate free's */ + WHEN_VERBOSE(mpi_one_fprintf(stderr, "DEBUG_FREE %d\n", n)); + CHECK(n != MMAGIC, "Tried to free a freed pointer!\n"); + *((int *)q) = MMAGIC; /* to detect duplicate free's */ - CHECK(magic == MAGIC, "Wrong magic in debug_free()!\n"); - ((int *) q)[1] = ~MAGIC; + CHECK(magic == MAGIC, "Wrong magic in debug_free()!\n"); + ((int *)q)[1] = ~MAGIC; - CHECK(n >= 0, "Tried to free block with corrupt size descriptor!\n"); + CHECK(n >= 0, "Tried to free block with corrupt size descriptor!\n"); - debug_malloc_total -= n; + debug_malloc_total -= n; - CHECK(debug_malloc_total >= 0, - "debug_malloc_total went negative!\n"); + CHECK(debug_malloc_total >= 0, "debug_malloc_total went negative!\n"); - /* check for writing past end of array: */ - for (i = n; i < PAD_FACTOR * n; ++i) - if (q[i + TWOINTS] != (char) (i ^ 0xDEADBEEF)) - mpi_die("Byte %d past end of array has changed!\n" - "Array bounds overwritten!\n", - i - n + 1); - for (i = 0; i < PAD_FACTOR * n; ++i) - q[i + TWOINTS] = (char) (i ^ 0xBEEFDEAD); + /* check for writing past end of array: */ + for (i = n; i < PAD_FACTOR * n; ++i) + if (q[i + TWOINTS] != (char)(i ^ 0xDEADBEEF)) + mpi_die("Byte %d past end of array has changed!\n" + "Array bounds overwritten!\n", + i - n + 1); + for (i = 0; i < PAD_FACTOR * n; ++i) + q[i + TWOINTS] = (char)(i ^ 0xBEEFDEAD); - --debug_malloc_cnt; - free(q); - } + --debug_malloc_cnt; + free(q); + } } #endif /* DEBUG */ /* output current memory usage: */ -void debug_output_malloc_count(void) -{ +void debug_output_malloc_count(void) { #ifdef DEBUG_MALLOC - mpi_one_fprintf(stderr, "malloc: %d blocks, %g kB\n", - debug_malloc_cnt, debug_malloc_total / 1024.0); + mpi_one_fprintf(stderr, "malloc: %d blocks, %g kB\n", debug_malloc_cnt, + debug_malloc_total / 1024.0); #endif } /* check for memory leaks when debugging */ -void debug_check_memory_leaks(void) -{ +void debug_check_memory_leaks(void) { #ifdef DEBUG_MALLOC - if (debug_malloc_cnt || debug_malloc_total) - mpi_die("MEMORY LEAK!!!\n" - "number of unbalanced malloc calls = %d\n" - "total leaked bytes = %d\n", - debug_malloc_cnt, - debug_malloc_total); + if (debug_malloc_cnt || debug_malloc_total) + mpi_die("MEMORY LEAK!!!\n" + "number of unbalanced malloc calls = %d\n" + "total leaked bytes = %d\n", + debug_malloc_cnt, debug_malloc_total); #endif } diff --git a/src/util/mpi_utils.c b/src/util/mpi_utils.c index 75180f54..73979e71 100644 --- a/src/util/mpi_utils.c +++ b/src/util/mpi_utils.c @@ -16,7 +16,7 @@ */ #if !defined(_GNU_SOURCE) - #define _GNU_SOURCE +#define _GNU_SOURCE #endif #include @@ -61,125 +61,116 @@ int mpb_comm = 0; static MPI_Comm mpb_comm_save = MPI_COMM_WORLD; #endif -void end_divide_parallel(void) -{ +void end_divide_parallel(void) { #ifdef HAVE_MPI - if (mpb_comm != MPI_COMM_WORLD) MPI_Comm_free(&mpb_comm); - if (mpb_comm_save != MPI_COMM_WORLD) MPI_Comm_free(&mpb_comm_save); - mpb_comm = mpb_comm_save = MPI_COMM_WORLD; + if (mpb_comm != MPI_COMM_WORLD) MPI_Comm_free(&mpb_comm); + if (mpb_comm_save != MPI_COMM_WORLD) MPI_Comm_free(&mpb_comm_save); + mpb_comm = mpb_comm_save = MPI_COMM_WORLD; #endif } -int divide_parallel_processes(int numgroups) -{ +int divide_parallel_processes(int numgroups) { #ifdef HAVE_MPI - int sz, rank, mygroup; - end_divide_parallel(); - MPI_Comm_size(mpb_comm, &sz); - CHECK(numgroups > 0, "numgroups must be > 0"); - CHECK(numgroups <= sz, "tried to split into more groups than processes"); - MPI_Comm_rank(mpb_comm, &rank); - mygroup = (rank * numgroups) / sz; - MPI_Comm_split(MPI_COMM_WORLD, mygroup, rank, &mpb_comm); - return mygroup; + int sz, rank, mygroup; + end_divide_parallel(); + MPI_Comm_size(mpb_comm, &sz); + CHECK(numgroups > 0, "numgroups must be > 0"); + CHECK(numgroups <= sz, "tried to split into more groups than processes"); + MPI_Comm_rank(mpb_comm, &rank); + mygroup = (rank * numgroups) / sz; + MPI_Comm_split(MPI_COMM_WORLD, mygroup, rank, &mpb_comm); + return mygroup; #else - CHECK(numgroups != 1, "tried to split into more groups than processes"); - return 0; + CHECK(numgroups != 1, "tried to split into more groups than processes"); + return 0; #endif } -void begin_global_communications(void) -{ +void begin_global_communications(void) { #ifdef HAVE_MPI - mpb_comm_save = mpb_comm; - mpb_comm = MPI_COMM_WORLD; + mpb_comm_save = mpb_comm; + mpb_comm = MPI_COMM_WORLD; #endif } -void end_global_communications(void) -{ +void end_global_communications(void) { #ifdef HAVE_MPI - mpb_comm = mpb_comm_save; - mpb_comm_save = MPI_COMM_WORLD; + mpb_comm = mpb_comm_save; + mpb_comm_save = MPI_COMM_WORLD; #endif } int my_global_rank() { #ifdef HAVE_MPI - int rank; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - return rank; + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + return rank; #else - return 0; + return 0; #endif } /* die when fatal errors occur */ -void mpi_die(const char *template, ...) -{ - va_list ap; +void mpi_die(const char *template, ...) { + va_list ap; - va_start(ap, template); - vfprintf(stderr, template, ap); - va_end(ap); + va_start(ap, template); + vfprintf(stderr, template, ap); + va_end(ap); - MPI_Abort(mpb_comm, EXIT_FAILURE); + MPI_Abort(mpb_comm, EXIT_FAILURE); } void (*mpb_printf_callback)(const char *s) = NULL; /* Like printf, except only does anything on master process. */ -void mpi_one_printf(const char *template, ...) -{ - if (mpi_is_master()) { - va_list ap; - va_start(ap, template); - if (mpb_printf_callback) { - char *s; - vasprintf(&s, template, ap); - mpb_printf_callback(s); - free(s); - } - else { - vprintf(template, ap); - } - va_end(ap); - fflush(stdout); - } +void mpi_one_printf(const char *template, ...) { + if (mpi_is_master()) { + va_list ap; + va_start(ap, template); + if (mpb_printf_callback) { + char *s; + vasprintf(&s, template, ap); + mpb_printf_callback(s); + free(s); + } + else { + vprintf(template, ap); + } + va_end(ap); + fflush(stdout); + } } /* Like fprintf, except only does anything on master process. */ -void mpi_one_fprintf(FILE *f, const char *template, ...) -{ - if (mpi_is_master()) { - va_list ap; - va_start(ap, template); - vfprintf(f, template, ap); - va_end(ap); - fflush(f); - } +void mpi_one_fprintf(FILE *f, const char *template, ...) { + if (mpi_is_master()) { + va_list ap; + va_start(ap, template); + vfprintf(f, template, ap); + va_end(ap); + fflush(f); + } } /* Return whether we are the master process (rank == 0). */ -int mpi_is_master(void) -{ - int process_rank; - MPI_Comm_rank(mpb_comm, &process_rank); - return (process_rank == 0); +int mpi_is_master(void) { + int process_rank; + MPI_Comm_rank(mpb_comm, &process_rank); + return (process_rank == 0); } /* When debugging, checks to see that x is the same over all processes, and abort the program if it is not. */ -void mpi_assert_equal(double x) -{ +void mpi_assert_equal(double x) { #ifdef DEBUG - double xmin, xmax; + double xmin, xmax; - mpi_allreduce(&x, &xmin, 1, double, MPI_DOUBLE, MPI_MIN, mpb_comm); - mpi_allreduce(&x, &xmax, 1, double, MPI_DOUBLE, MPI_MAX, mpb_comm); - CHECK(xmin == x && xmax == x, "mpi_assert_equal failure"); + mpi_allreduce(&x, &xmin, 1, double, MPI_DOUBLE, MPI_MIN, mpb_comm); + mpi_allreduce(&x, &xmax, 1, double, MPI_DOUBLE, MPI_MAX, mpb_comm); + CHECK(xmin == x && xmax == x, "mpi_assert_equal failure"); #else - (void) x; /* unused */ + (void)x; /* unused */ #endif } @@ -187,33 +178,29 @@ void mpi_assert_equal(double x) of code that should be executed by only one process at a time. They work by having each process wait for a message from the - previous process before starting. + previous process before starting. Each critical section is passed an integer "tag"...ideally, this should be a unique identifier for each critical section so that messages from different critical sections don't get mixed up somehow. */ -void mpi_begin_critical_section(int tag) -{ - int process_rank; - MPI_Comm_rank(mpb_comm, &process_rank); - if (process_rank > 0) { /* wait for a message before continuing */ - MPI_Status status; - int recv_tag = tag - 1; /* initialize to wrong value */ - MPI_Recv(&recv_tag, 1, MPI_INT, process_rank - 1, tag, - mpb_comm, &status); - CHECK(recv_tag == tag, "invalid tag received"); - } +void mpi_begin_critical_section(int tag) { + int process_rank; + MPI_Comm_rank(mpb_comm, &process_rank); + if (process_rank > 0) { /* wait for a message before continuing */ + MPI_Status status; + int recv_tag = tag - 1; /* initialize to wrong value */ + MPI_Recv(&recv_tag, 1, MPI_INT, process_rank - 1, tag, mpb_comm, &status); + CHECK(recv_tag == tag, "invalid tag received"); + } } -void mpi_end_critical_section(int tag) -{ - int process_rank, num_procs; - MPI_Comm_rank(mpb_comm, &process_rank); - MPI_Comm_size(mpb_comm, &num_procs); - if (process_rank != num_procs - 1) { /* send a message to next process */ - MPI_Send(&tag, 1, MPI_INT, process_rank + 1, tag, - mpb_comm); - } +void mpi_end_critical_section(int tag) { + int process_rank, num_procs; + MPI_Comm_rank(mpb_comm, &process_rank); + MPI_Comm_size(mpb_comm, &num_procs); + if (process_rank != num_procs - 1) { /* send a message to next process */ + MPI_Send(&tag, 1, MPI_INT, process_rank + 1, tag, mpb_comm); + } } diff --git a/src/util/mpi_utils.h b/src/util/mpi_utils.h index dd1877e9..786f99b4 100644 --- a/src/util/mpi_utils.h +++ b/src/util/mpi_utils.h @@ -21,7 +21,7 @@ #include #ifdef HAVE_MPI -# include +#include extern MPI_Comm mpb_comm; #else extern int mpb_comm; @@ -35,21 +35,21 @@ extern int my_global_rank(); extern void mpi_die(const char *template, ...) #ifdef __GNUC__ - __attribute__ ((format (printf, 1, 2))) + __attribute__((format(printf, 1, 2))) #endif -; + ; extern void mpi_one_fprintf(FILE *f, const char *template, ...) #ifdef __GNUC__ - __attribute__ ((format (printf, 2, 3))) + __attribute__((format(printf, 2, 3))) #endif -; + ; extern void mpi_one_printf(const char *template, ...) #ifdef __GNUC__ - __attribute__ ((format (printf, 1, 2))) + __attribute__((format(printf, 1, 2))) #endif -; + ; extern int mpi_is_master(void); @@ -59,9 +59,10 @@ extern void mpi_begin_critical_section(int tag); extern void mpi_end_critical_section(int tag); /* "in-place" Allreduce wrapper for reducing a single value */ -#define mpi_allreduce_1(b, ctype, t, op, comm) { \ - ctype bbbb = *(b); \ - mpi_allreduce(&bbbb, (b), 1, ctype, t, op, comm); \ -} +#define mpi_allreduce_1(b, ctype, t, op, comm) \ + { \ + ctype bbbb = *(b); \ + mpi_allreduce(&bbbb, (b), 1, ctype, t, op, comm); \ + } #endif /* MPI_UTILS_H */ diff --git a/src/util/mpiglue.h b/src/util/mpiglue.h index ea742891..d83e7b8e 100644 --- a/src/util/mpiglue.h +++ b/src/util/mpiglue.h @@ -19,7 +19,7 @@ #define MPIGLUE_H /* This header file stands between our code and MPI. If MPI is present, - we just #include . Otherwise, we use no-op stubs for + we just #include . Otherwise, we use no-op stubs for MPI routines. */ #ifdef HAVE_MPI @@ -30,8 +30,7 @@ typedef double mpiglue_clock_t; #define MPIGLUE_CLOCK MPI_Wtime() #define MPIGLUE_CLOCK_DIFF(t2, t1) ((t2) - (t1)) -#define mpi_allreduce(sb, rb, n, ctype, t, op, comm) \ - MPI_Allreduce(sb,rb,n,t,op,comm) +#define mpi_allreduce(sb, rb, n, ctype, t, op, comm) MPI_Allreduce(sb, rb, n, t, op, comm) #else /* don't have MPI */ @@ -39,7 +38,7 @@ typedef double mpiglue_clock_t; #include #include -#define MPI_Init(argc,argv) 0 +#define MPI_Init(argc, argv) 0 #define MPI_Finalize() 0 /* Stub for the MPI_Allreduce function, differing only by the addition @@ -50,10 +49,11 @@ typedef double mpiglue_clock_t; but it seems that MPI implementations do not allow this. The MPI 2.0 specification supports an MPI_IN_PLACE constant that you can pass for sb in order to be in-place, but I don't want to require that. */ -#define mpi_allreduce(sb, rb, n, ctype, t, op, comm) { \ - CHECK((sb) != (rb), "MPI_Allreduce doesn't work for sendbuf == recvbuf");\ - memcpy((rb), (sb), (n) * sizeof(ctype)); \ -} +#define mpi_allreduce(sb, rb, n, ctype, t, op, comm) \ + { \ + CHECK((sb) != (rb), "MPI_Allreduce doesn't work for sendbuf == recvbuf"); \ + memcpy((rb), (sb), (n) * sizeof(ctype)); \ + } #define MPI_Bcast(b, n, t, root, comm) 0 @@ -64,11 +64,9 @@ typedef double mpiglue_clock_t; #define MPI_Comm_rank(comm, rankp) *(rankp) = 0 #define MPI_Comm_size(comm, sizep) *(sizep) = 1 -#define MPI_Send(sb,n,t, r,tag, comm) \ -CHECK(0, "MPI_Send stub is non-functional"); +#define MPI_Send(sb, n, t, r, tag, comm) CHECK(0, "MPI_Send stub is non-functional"); -#define MPI_Recv(sb,n,t, r,tag, comm,statp) \ -CHECK(0, "MPI_Recv stub is non-functional"); +#define MPI_Recv(sb, n, t, r, tag, comm, statp) CHECK(0, "MPI_Recv stub is non-functional"); typedef int mpiglue_status_t; #define MPI_Status mpiglue_status_t diff --git a/src/util/sphere-quad.c b/src/util/sphere-quad.c index 381aa048..f9217fb3 100644 --- a/src/util/sphere-quad.c +++ b/src/util/sphere-quad.c @@ -23,153 +23,181 @@ #include #include -#define SHIFT3(x,y,z) {double SHIFT3_dummy = z; z = y; y = x; x = SHIFT3_dummy;} +#define SHIFT3(x, y, z) \ + { \ + double SHIFT3_dummy = z; \ + z = y; \ + y = x; \ + x = SHIFT3_dummy; \ + } -#define CHECK(condition, message) do { \ - if (!(condition)) { \ - fprintf(stderr, "CHECK failure on line %d of " __FILE__ ": " \ - message "\n", __LINE__); exit(EXIT_FAILURE); \ - } \ -} while (0) +#define CHECK(condition, message) \ + do { \ + if (!(condition)) { \ + fprintf(stderr, "CHECK failure on line %d of " __FILE__ ": " message "\n", __LINE__); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) /* Compute quadrature points and weights for integrating on the unit sphere. x, y, z, and weight should be arrays of num_sq_pts values to hold the coordinates and weights of the quadrature points on output. Currently, num_sq_pts = 12, 50, and 72 are supported. */ -void spherical_quadrature_points(double *x, double *y, double *z, - double *weight, int num_sq_pts) -{ - int i,j,k,l, n = 0; - double x0, y0, z0, w; - - if (num_sq_pts == 50) { - /* Computes quadrature points and weights for 50-point 11th degree - integration formula on a unit sphere. This particular quadrature - formula has the advantage, for our purposes, of preserving the - symmetry group of an octahedron (i.e. simple cubic symmetry, with - respect to the Cartesian xyz axes). References: - - A. D. McLaren, "Optimal Numerical Integration on a Sphere," - Math. Comp. 17, pp. 361-383 (1963). - - Also in: Arthur H. Stroud, "Approximate Calculation of Multiple - Integrals" (Prentice Hall, 1971) (formula number U3:11-1). - - This code was written with the help of example code by - John Burkardt: - http://www.psc.edu/~burkardt/src_pt/stroud/stroud.html */ - - x0 = 1; y0 = z0 = 0; - w = 9216 / 725760.0; - for (i = 0; i < 2; ++i) { - x0 = -x0; - for (j = 0; j < 3; ++j) { - SHIFT3(x0,y0,z0); - x[n] = x0; y[n] = y0; z[n] = z0; weight[n++] = w; - } - } - - x0 = y0 = sqrt(0.5); z0 = 0; - w = 16384 / 725760.0; - for (i = 0; i < 2; ++i) { - x0 = -x0; - for (j = 0; j < 2; ++j) { - y0 = -y0; - for (k = 0; k < 3; ++k) { - SHIFT3(x0,y0,z0); - x[n] = x0; y[n] = y0; z[n] = z0; weight[n++] = w; - } - } - } - - x0 = y0 = z0 = sqrt(1.0 / 3.0); - w = 15309 / 725760.0; - for (i = 0; i < 2; ++i) { - x0 = -x0; - for (j = 0; j < 2; ++j) { - y0 = -y0; - for (k = 0; k < 2; ++k) { - z0 = -z0; - x[n] = x0; y[n] = y0; z[n] = z0; weight[n++] = w; - } - } - } - - x0 = y0 = sqrt(1.0 / 11.0); z0 = 3 * x0; - w = 14641 / 725760.0; - for (i = 0; i < 2; ++i) { - x0 = -x0; - for (j = 0; j < 2; ++j) { - y0 = -y0; - for (k = 0; k < 2; ++k) { - z0 = -z0; - for (l = 0; l < 3; ++l) { - SHIFT3(x0,y0,z0); - x[n] = x0; y[n] = y0; z[n] = z0; weight[n++] = w; - } - } - } - } - } - else if (num_sq_pts == 72 || num_sq_pts == 12) { - /* As above (same references), but with a 72-point 14th degree - formula, this time with the symmetry group of an icosohedron. - (Stroud formula number U3:14-1.) Alternatively, just use - the 12-point 5th degree formula consisting of the vertices - of a regular icosohedron. */ - - /* first, the vertices of an icosohedron: */ - x0 = sqrt(0.5 - sqrt(0.05)); - y0 = sqrt(0.5 + sqrt(0.05)); - z0 = 0; - if (num_sq_pts == 72) - w = 125 / 10080.0; - else - w = 1 / 12.0; - for (i = 0; i < 2; ++i) { - x0 = -x0; - for (j = 0; j < 2; ++j) { - y0 = -y0; - for (k = 0; k < 3; ++k) { - SHIFT3(x0,y0,z0); - x[n] = x0; y[n] = y0; z[n] = z0; weight[n++] = w; - } - } - } - - if (num_sq_pts == 72) { - /* it would be nice, for completeness, to have more - digits here: */ - double coords[3][5] = { - { -0.151108275, 0.315838353, 0.346307112, -0.101808787, -0.409228403 }, - { 0.155240600, 0.257049387, 0.666277790, 0.817386065, 0.501547712 }, - { 0.976251323, 0.913330032, 0.660412970, 0.567022920, 0.762221757 } - }; - - w = 143 / 10080.0; - for (l = 0; l < 5; ++l) { - x0 = coords[0][l]; y0 = coords[1][l]; z0 = coords[2][l]; - for (i = 0; i < 3; ++i) { - double dummy = x0; - x0 = z0; - z0 = -y0; - y0 = -dummy; - for (j = 0; j < 3; ++j) { - SHIFT3(x0,y0,z0); - x[n] = x0; y[n] = y0; z[n] = z0; weight[n++] = w; - } - y0 = -y0; - z0 = -z0; - x[n] = x0; y[n] = y0; z[n] = z0; weight[n++] = w; - } - } - } - } - else - CHECK(0, "spherical_quadrature_points: passed unknown # points!"); - - CHECK(n == num_sq_pts, - "bug in spherical_quadrature_points: wrong number of points!"); +void spherical_quadrature_points(double *x, double *y, double *z, double *weight, int num_sq_pts) { + int i, j, k, l, n = 0; + double x0, y0, z0, w; + + if (num_sq_pts == 50) { + /* Computes quadrature points and weights for 50-point 11th degree + integration formula on a unit sphere. This particular quadrature + formula has the advantage, for our purposes, of preserving the + symmetry group of an octahedron (i.e. simple cubic symmetry, with + respect to the Cartesian xyz axes). References: + + A. D. McLaren, "Optimal Numerical Integration on a Sphere," + Math. Comp. 17, pp. 361-383 (1963). + + Also in: Arthur H. Stroud, "Approximate Calculation of Multiple + Integrals" (Prentice Hall, 1971) (formula number U3:11-1). + + This code was written with the help of example code by + John Burkardt: + http://www.psc.edu/~burkardt/src_pt/stroud/stroud.html */ + + x0 = 1; + y0 = z0 = 0; + w = 9216 / 725760.0; + for (i = 0; i < 2; ++i) { + x0 = -x0; + for (j = 0; j < 3; ++j) { + SHIFT3(x0, y0, z0); + x[n] = x0; + y[n] = y0; + z[n] = z0; + weight[n++] = w; + } + } + + x0 = y0 = sqrt(0.5); + z0 = 0; + w = 16384 / 725760.0; + for (i = 0; i < 2; ++i) { + x0 = -x0; + for (j = 0; j < 2; ++j) { + y0 = -y0; + for (k = 0; k < 3; ++k) { + SHIFT3(x0, y0, z0); + x[n] = x0; + y[n] = y0; + z[n] = z0; + weight[n++] = w; + } + } + } + + x0 = y0 = z0 = sqrt(1.0 / 3.0); + w = 15309 / 725760.0; + for (i = 0; i < 2; ++i) { + x0 = -x0; + for (j = 0; j < 2; ++j) { + y0 = -y0; + for (k = 0; k < 2; ++k) { + z0 = -z0; + x[n] = x0; + y[n] = y0; + z[n] = z0; + weight[n++] = w; + } + } + } + + x0 = y0 = sqrt(1.0 / 11.0); + z0 = 3 * x0; + w = 14641 / 725760.0; + for (i = 0; i < 2; ++i) { + x0 = -x0; + for (j = 0; j < 2; ++j) { + y0 = -y0; + for (k = 0; k < 2; ++k) { + z0 = -z0; + for (l = 0; l < 3; ++l) { + SHIFT3(x0, y0, z0); + x[n] = x0; + y[n] = y0; + z[n] = z0; + weight[n++] = w; + } + } + } + } + } + else if (num_sq_pts == 72 || num_sq_pts == 12) { + /* As above (same references), but with a 72-point 14th degree + formula, this time with the symmetry group of an icosohedron. + (Stroud formula number U3:14-1.) Alternatively, just use + the 12-point 5th degree formula consisting of the vertices + of a regular icosohedron. */ + + /* first, the vertices of an icosohedron: */ + x0 = sqrt(0.5 - sqrt(0.05)); + y0 = sqrt(0.5 + sqrt(0.05)); + z0 = 0; + if (num_sq_pts == 72) + w = 125 / 10080.0; + else + w = 1 / 12.0; + for (i = 0; i < 2; ++i) { + x0 = -x0; + for (j = 0; j < 2; ++j) { + y0 = -y0; + for (k = 0; k < 3; ++k) { + SHIFT3(x0, y0, z0); + x[n] = x0; + y[n] = y0; + z[n] = z0; + weight[n++] = w; + } + } + } + + if (num_sq_pts == 72) { + /* it would be nice, for completeness, to have more + digits here: */ + double coords[3][5] = {{-0.151108275, 0.315838353, 0.346307112, -0.101808787, -0.409228403}, + {0.155240600, 0.257049387, 0.666277790, 0.817386065, 0.501547712}, + {0.976251323, 0.913330032, 0.660412970, 0.567022920, 0.762221757}}; + + w = 143 / 10080.0; + for (l = 0; l < 5; ++l) { + x0 = coords[0][l]; + y0 = coords[1][l]; + z0 = coords[2][l]; + for (i = 0; i < 3; ++i) { + double dummy = x0; + x0 = z0; + z0 = -y0; + y0 = -dummy; + for (j = 0; j < 3; ++j) { + SHIFT3(x0, y0, z0); + x[n] = x0; + y[n] = y0; + z[n] = z0; + weight[n++] = w; + } + y0 = -y0; + z0 = -z0; + x[n] = x0; + y[n] = y0; + z[n] = z0; + weight[n++] = w; + } + } + } + } + else + CHECK(0, "spherical_quadrature_points: passed unknown # points!"); + + CHECK(n == num_sq_pts, "bug in spherical_quadrature_points: wrong number of points!"); } #define NQUAD3 50 /* use 50-point quadrature formula by default */ @@ -180,17 +208,14 @@ void spherical_quadrature_points(double *x, double *y, double *z, /**********************************************************************/ double sqr(double x) { return x * x; } -double dist2(double x1, double y1, double z1, - double x2, double y2, double z2) -{ - return sqr(x1-x2) + sqr(y1-y2) + sqr(z1-z2); +double dist2(double x1, double y1, double z1, double x2, double y2, double z2) { + return sqr(x1 - x2) + sqr(y1 - y2) + sqr(z1 - z2); } double min2(double a, double b) { return a < b ? a : b; } /* sort the array to maximize the spacing of each point with the previous points */ -void sort_by_distance(int n, double x[], double y[], double z[], double w[]) -{ +void sort_by_distance(int n, double x[], double y[], double z[], double w[]) { int i, j, k; for (i = 1; i < n; ++i) { double xi, yi, zi, wi; @@ -200,76 +225,88 @@ void sort_by_distance(int n, double x[], double y[], double z[], double w[]) for (j = i; j < n; ++j) { double d2min = 1e20, d2sum = 0; for (k = 0; k < i; ++k) { - double d2 = (float) (dist2(x[k],y[k],z[k], x[j],y[j],z[j])); - d2min = min2(d2min, d2); - d2sum += d2; + double d2 = (float)(dist2(x[k], y[k], z[k], x[j], y[j], z[j])); + d2min = min2(d2min, d2); + d2sum += d2; } - if (d2min > d2max || - (d2min == d2max && d2sum > d2maxsum)) { - d2max = d2min; - d2maxsum = d2sum; - jmax = j; + if (d2min > d2max || (d2min == d2max && d2sum > d2maxsum)) { + d2max = d2min; + d2maxsum = d2sum; + jmax = j; } } xi = x[i], yi = y[i], zi = z[i], wi = w[i]; - x[i] = x[jmax]; y[i] = y[jmax]; z[i] = z[jmax]; w[i] = w[jmax]; - x[jmax] = xi; y[jmax] = yi; z[jmax] = zi; w[jmax] = wi; + x[i] = x[jmax]; + y[i] = y[jmax]; + z[i] = z[jmax]; + w[i] = w[jmax]; + x[jmax] = xi; + y[jmax] = yi; + z[jmax] = zi; + w[jmax] = wi; } } -#define MAX(a,b) ((a) > (b) ? (a) : (b)) - -int main(void) -{ - int i; - double x2[NQUAD2], y2[NQUAD2], z2[NQUAD2], w2[NQUAD2]; - double x3[NQUAD3], y3[NQUAD3], z3[NQUAD3], w3[NQUAD3]; - - printf( -"/* For 1d, 2d, and 3d, quadrature points and weights on a unit sphere.\n" -" There are num_sphere_quad[dim-1] points i, with the i-th point at\n" -" (x,y,z) = (sphere_quad[dim-1][i][ 0, 1, 2 ]), and with a quadrature\n" -" weight sphere_quad[dim-1][i][3]. */\n\n"); - - printf("#define NQUAD %d\n", MAX(NQUAD3, MAX(2, NQUAD2))); - - printf("static const int num_sphere_quad[3] = { %d, %d, %d };\n\n", - 2, NQUAD2, NQUAD3); - - printf("static const double sphere_quad[3][NQUAD][4] = {\n"); - - printf(" { {1,0,0,0.5}, {-1,0,0,0.5} },\n"); - - for (i = 0; i < NQUAD2; ++i) { - if (i == 0) { x2[i] = 1; y2[i] = 0; } - else if (2*i == NQUAD2) { x2[i] = -1; y2[i] = 0; } - else if (4*i == NQUAD2) { y2[i] = 1; x2[i] = 0; } - else if (4*i == 3*NQUAD2) { y2[i] = -1; x2[i] = 0; } - else { - x2[i] = cos(2*i * K_PI / NQUAD2); - y2[i] = sin(2*i * K_PI / NQUAD2); - } - z2[i] = 0.0; - w2[i] = 1.0 / NQUAD2; - } - sort_by_distance(NQUAD2,x2,y2,z2,w2); - printf(" {\n"); - for (i = 0; i < NQUAD2; ++i) { - printf(" { %0.20g, %0.20g, %0.20g, %0.20g },\n", - x2[i], y2[i], z2[i], w2[i]); - } - printf(" },\n"); - - printf(" {\n"); - spherical_quadrature_points(x3,y3,z3, w3, NQUAD3); - sort_by_distance(NQUAD3,x3,y3,z3,w3); - for (i = 0; i < NQUAD3; ++i) { - printf(" { %0.20g, %0.20g, %0.20g, %0.20g },\n", - x3[i], y3[i], z3[i], w3[i]); - } - printf(" }\n"); - - printf("};\n"); - - return 0; +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +int main(void) { + int i; + double x2[NQUAD2], y2[NQUAD2], z2[NQUAD2], w2[NQUAD2]; + double x3[NQUAD3], y3[NQUAD3], z3[NQUAD3], w3[NQUAD3]; + + printf("/* For 1d, 2d, and 3d, quadrature points and weights on a unit sphere.\n" + " There are num_sphere_quad[dim-1] points i, with the i-th point at\n" + " (x,y,z) = (sphere_quad[dim-1][i][ 0, 1, 2 ]), and with a quadrature\n" + " weight sphere_quad[dim-1][i][3]. */\n\n"); + + printf("#define NQUAD %d\n", MAX(NQUAD3, MAX(2, NQUAD2))); + + printf("static const int num_sphere_quad[3] = { %d, %d, %d };\n\n", 2, NQUAD2, NQUAD3); + + printf("static const double sphere_quad[3][NQUAD][4] = {\n"); + + printf(" { {1,0,0,0.5}, {-1,0,0,0.5} },\n"); + + for (i = 0; i < NQUAD2; ++i) { + if (i == 0) { + x2[i] = 1; + y2[i] = 0; + } + else if (2 * i == NQUAD2) { + x2[i] = -1; + y2[i] = 0; + } + else if (4 * i == NQUAD2) { + y2[i] = 1; + x2[i] = 0; + } + else if (4 * i == 3 * NQUAD2) { + y2[i] = -1; + x2[i] = 0; + } + else { + x2[i] = cos(2 * i * K_PI / NQUAD2); + y2[i] = sin(2 * i * K_PI / NQUAD2); + } + z2[i] = 0.0; + w2[i] = 1.0 / NQUAD2; + } + sort_by_distance(NQUAD2, x2, y2, z2, w2); + printf(" {\n"); + for (i = 0; i < NQUAD2; ++i) { + printf(" { %0.20g, %0.20g, %0.20g, %0.20g },\n", x2[i], y2[i], z2[i], w2[i]); + } + printf(" },\n"); + + printf(" {\n"); + spherical_quadrature_points(x3, y3, z3, w3, NQUAD3); + sort_by_distance(NQUAD3, x3, y3, z3, w3); + for (i = 0; i < NQUAD3; ++i) { + printf(" { %0.20g, %0.20g, %0.20g, %0.20g },\n", x3[i], y3[i], z3[i], w3[i]); + } + printf(" }\n"); + + printf("};\n"); + + return 0; } diff --git a/tests/blastest.c b/tests/blastest.c index 3d518c08..4f14163c 100644 --- a/tests/blastest.c +++ b/tests/blastest.c @@ -23,35 +23,33 @@ #include #include -extern void debug_check_memory_leaks(void); +extern void debug_check_memory_leaks(void); -void printmat(scalar *A, int m, int n) -{ +void printmat(scalar *A, int m, int n) { int i, j; for (i = 0; i < m; ++i) { for (j = 0; j < n; ++j) { #ifdef SCALAR_COMPLEX - printf(" (%6.3f,%6.3f)", (double)A[i*n + j].re, (double)A[i*n + j].im); + printf(" (%6.3f,%6.3f)", (double)A[i * n + j].re, (double)A[i * n + j].im); #else - printf(" %6.3f", (double)A[i*n + j]); + printf(" %6.3f", (double)A[i * n + j]); #endif } printf("\n"); } } -void printmat_matlab(scalar *A, int m, int n) -{ +void printmat_matlab(scalar *A, int m, int n) { int i, j; printf("["); for (i = 0; i < m; ++i) { for (j = 0; j < n; ++j) { #ifdef SCALAR_COMPLEX - printf(" %g+%gi", (double)A[i*n + j].re, (double)A[i*n + j].im); + printf(" %g+%gi", (double)A[i * n + j].re, (double)A[i * n + j].im); #else - printf(" %g", (double)A[i*n + j]); + printf(" %g", (double)A[i * n + j]); #endif } printf(";\n"); @@ -59,42 +57,35 @@ void printmat_matlab(scalar *A, int m, int n) printf("]\n"); } -int main(void) -{ +int main(void) { const int N = 4; - int i,j; + int i, j; #ifndef SCALAR_COMPLEX - scalar A[] = { 3.3, 6.2, 7.1, 9.1, - -2.3, 3.6, 0.3, 9.7, - 6.7, -0.1, 1.1, 4.8, - 8.4, 7.7, 5.9, -1.8 }; - scalar B[] = { 1.1, 2.2, 3.3, 4.4, - 8.8, 7.7, 6.6, 5.5, - 6.1, 8.2, 9.7, 3.6, - 6.3, 2.9, 5.5, 8.1 }; + scalar A[] = {3.3, 6.2, 7.1, 9.1, -2.3, 3.6, 0.3, 9.7, 6.7, -0.1, 1.1, 4.8, 8.4, 7.7, 5.9, -1.8}; + scalar B[] = {1.1, 2.2, 3.3, 4.4, 8.8, 7.7, 6.6, 5.5, 6.1, 8.2, 9.7, 3.6, 6.3, 2.9, 5.5, 8.1}; #else - scalar A[] = { {3.3, 6.2} , {7.1, 9.1}, {2.3, 8.2}, {-3.2, 6.6}, - {-2.3, 3.6}, {0.3, 9.7}, {1.9,-4.9}, {7.1, 7.1}, - {6.7, -0.1}, {1.1, 4.8}, {-9.7, 3.7}, {-0.01, -0.2}, - {8.4, 7.7}, {5.9, -1.8}, {8.8, 9.9}, {0.0, 0.1} }; - scalar B[] = { {1.1, 2.2}, {3.3, 4.4}, {1.2, 2.3}, {3.4, 4.5}, - {8.8, 7.7}, {6.6, 5.5}, {3.5, 7.2}, {-0.3, 6.1}, - {6.1, 8.2}, {9.7, 3.6}, {-5.1, 6.1}, {2.3, 8.1}, - {6.3, 2.9}, {5.5, 8.1}, {8.5, 6.7}, {9.0, 2.4} }; + scalar A[] = {{3.3, 6.2}, {7.1, 9.1}, {2.3, 8.2}, {-3.2, 6.6}, {-2.3, 3.6}, {0.3, 9.7}, + {1.9, -4.9}, {7.1, 7.1}, {6.7, -0.1}, {1.1, 4.8}, {-9.7, 3.7}, {-0.01, -0.2}, + {8.4, 7.7}, {5.9, -1.8}, {8.8, 9.9}, {0.0, 0.1}}; + scalar B[] = {{1.1, 2.2}, {3.3, 4.4}, {1.2, 2.3}, {3.4, 4.5}, {8.8, 7.7}, {6.6, 5.5}, + {3.5, 7.2}, {-0.3, 6.1}, {6.1, 8.2}, {9.7, 3.6}, {-5.1, 6.1}, {2.3, 8.1}, + {6.3, 2.9}, {5.5, 8.1}, {8.5, 6.7}, {9.0, 2.4}}; #endif scalar C[16], D[16], E[16]; real eigvals[4], wrk[20]; - printf("A = "); printmat_matlab(A,N,N); - printf("B = "); printmat_matlab(B,N,N); + printf("A = "); + printmat_matlab(A, N, N); + printf("B = "); + printmat_matlab(B, N, N); blasglue_gemm('N', 'N', N, N, N, 1.0, A, N, B, N, 0.0, C, N); printf("\nC = A * B\n"); - printmat(C,N,N); + printmat(C, N, N); blasglue_gemm('N', 'N', N, N, N, 1.0, C, N, B, N, 0.0, D, N); printf("\nC * B\n"); - printmat(D,N,N); + printmat(D, N, N); blasglue_herk('U', 'C', N, N, 1.0, A, N, 0.0, D, N); /* Now, copy the conjugate of the upper half @@ -104,7 +95,7 @@ int main(void) ASSIGN_CONJ(D[j * N + i], D[i * N + j]); } printf("\nD = A' * A\n"); - printmat(D,N,N); + printmat(D, N, N); lapackglue_potrf('U', N, D, N); lapackglue_potri('U', N, D, N); @@ -115,7 +106,7 @@ int main(void) ASSIGN_CONJ(D[j * N + i], D[i * N + j]); } printf("\ninverse(D)\n"); - printmat(D,N,N); + printmat(D, N, N); /* D = At * A, again */ blasglue_herk('U', 'C', N, N, 1.0, A, N, 0.0, D, N); @@ -124,46 +115,47 @@ int main(void) lapackglue_heev('V', 'U', N, D, N, eigvals, E, 16, wrk); /* Choose a deterministic phase for each row/eigenvector: */ for (i = 0; i < N; ++i) { - scalar phase; - real len; - for (j = 0; (len = sqrt(SCALAR_NORMSQR(D[i*N + j]))) < 1e-6; ++j) - ; - /* phase to make D[i*N+j] purely real: */ - ASSIGN_SCALAR(phase, SCALAR_RE(D[i*N+j])/len, -SCALAR_IM(D[i*N+j])/len); - ASSIGN_MULT(D[i*N+j], D[i*N+j], phase); - if (SCALAR_RE(D[i*N+j]) < 0) { /* pick deterministic (positive) sign */ - ASSIGN_SCALAR(phase, -SCALAR_RE(phase), -SCALAR_IM(phase)); - ASSIGN_SCALAR(D[i*N+j], -SCALAR_RE(D[i*N+j]),-SCALAR_IM(D[i*N+j])); - } - for (j = j + 1; j < N; ++j) - ASSIGN_MULT(D[i*N + j], D[i*N + j], phase); + scalar phase; + real len; + for (j = 0; (len = sqrt(SCALAR_NORMSQR(D[i * N + j]))) < 1e-6; ++j) + ; + /* phase to make D[i*N+j] purely real: */ + ASSIGN_SCALAR(phase, SCALAR_RE(D[i * N + j]) / len, -SCALAR_IM(D[i * N + j]) / len); + ASSIGN_MULT(D[i * N + j], D[i * N + j], phase); + if (SCALAR_RE(D[i * N + j]) < 0) { /* pick deterministic (positive) sign */ + ASSIGN_SCALAR(phase, -SCALAR_RE(phase), -SCALAR_IM(phase)); + ASSIGN_SCALAR(D[i * N + j], -SCALAR_RE(D[i * N + j]), -SCALAR_IM(D[i * N + j])); + } + for (j = j + 1; j < N; ++j) + ASSIGN_MULT(D[i * N + j], D[i * N + j], phase); } printf("\n[v,d] = eig(D);\n"); printf("\ndiag(d)\n "); - for (i = 0; i < 4; ++i) printf(" %6.3f", (double)eigvals[i]); + for (i = 0; i < 4; ++i) + printf(" %6.3f", (double)eigvals[i]); printf("\nv'\n"); - printmat(D,N,N); + printmat(D, N, N); blasglue_gemm('C', 'N', N, N, N, 1.0, D, N, D, N, 0.0, C, N); printf("\nv * v'\n"); - printmat(C,N,N); + printmat(C, N, N); /* Compute E = diag(sqrt(eigenvals)) * D; i.e. the rows of E become the rows of D times sqrt(corresponding eigenvalue) */ for (i = 0; i < N; ++i) { CHECK(eigvals[i] > 0, "non-positive eigenvalue"); - - blasglue_copy(N, D + i*N, 1, E + i*N, 1); - blasglue_rscal(N, sqrt(eigvals[i]), E + i*N, 1); + + blasglue_copy(N, D + i * N, 1, E + i * N, 1); + blasglue_rscal(N, sqrt(eigvals[i]), E + i * N, 1); } /* compute C = adjoint(D) * E == sqrt (At * A) */ blasglue_gemm('C', 'N', N, N, N, 1.0, D, N, E, N, 0.0, C, N); printf("\nsqrtm(D)\n"); - printmat(C,N,N); + printmat(C, N, N); blasglue_gemm('C', 'N', N, N, N, 1.0, E, N, E, N, 0.0, C, N); printf("\nsqrtm(D) * sqrtm(D)\n"); - printmat(C,N,N); + printmat(C, N, N); debug_check_memory_leaks(); diff --git a/tests/eigs_test.c b/tests/eigs_test.c index f02e9d1f..d0a5a7a2 100644 --- a/tests/eigs_test.c +++ b/tests/eigs_test.c @@ -28,391 +28,355 @@ static sqmatrix A, Ainv, B; -#define MAX(a,b) ((a) > (b) ? (a) : (b)) -#define MIN(a,b) ((a) < (b) ? (a) : (b)) - -extern void Aop(evectmatrix Xin, evectmatrix Xout, void *data, - int is_current_eigenvector, evectmatrix Work); -extern void Bop(evectmatrix Xin, evectmatrix Xout, void *data, - int is_current_eigenvector, evectmatrix Work); -extern void Ainvop(evectmatrix Xin, evectmatrix Xout, void *data, - evectmatrix Y, real *eigenvals, sqmatrix YtY); -extern void Cop(evectmatrix Xin, evectmatrix Xout, void *data, - evectmatrix Y, real *eigenvals, sqmatrix YtY); +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +extern void Aop(evectmatrix Xin, evectmatrix Xout, void *data, int is_current_eigenvector, + evectmatrix Work); +extern void Bop(evectmatrix Xin, evectmatrix Xout, void *data, int is_current_eigenvector, + evectmatrix Work); +extern void Ainvop(evectmatrix Xin, evectmatrix Xout, void *data, evectmatrix Y, real *eigenvals, + sqmatrix YtY); +extern void Cop(evectmatrix Xin, evectmatrix Xout, void *data, evectmatrix Y, real *eigenvals, + sqmatrix YtY); extern void printmat(scalar *A, int m, int n, int ldn); extern void printmat_matlab(scalar *A, int m, int n); extern void debug_check_memory_leaks(void); -real norm_diff(scalar *a, scalar *b, int n) -{ - real bmag = 0.0, diffmag = 0.0; - int i; - for (i = 0; i < n; ++i) { - scalar d; - ASSIGN_SCALAR(d, - SCALAR_RE(b[i]) - SCALAR_RE(a[i]), - SCALAR_IM(b[i]) - SCALAR_IM(a[i])); - bmag += SCALAR_NORMSQR(b[i]); - diffmag += SCALAR_NORMSQR(d); - } - return sqrt(diffmag / bmag); +real norm_diff(scalar *a, scalar *b, int n) { + real bmag = 0.0, diffmag = 0.0; + int i; + for (i = 0; i < n; ++i) { + scalar d; + ASSIGN_SCALAR(d, SCALAR_RE(b[i]) - SCALAR_RE(a[i]), SCALAR_IM(b[i]) - SCALAR_IM(a[i])); + bmag += SCALAR_NORMSQR(b[i]); + diffmag += SCALAR_NORMSQR(d); + } + return sqrt(diffmag / bmag); } #define NWORK 4 -void rand_posdef(sqmatrix A, sqmatrix X) -{ - int i, n = A.p; - - /* fill X with random data */ - for (i = 0; i < n*n; ++i) - ASSIGN_SCALAR(X.data[i], - rand() * 1.0 / RAND_MAX, - rand() * 1.0 / RAND_MAX); - - /* assign A = adjoint(X) * X to get a Hermitian matrix: */ - sqmatrix_AeBC(A, X, 1, X, 0); - - /* increase diagonal elements of A so that our preconditioner - has a chance of being useful: */ - for (i = 0; i < n; ++i) - ASSIGN_SCALAR(A.data[i * n + i], - n * SCALAR_RE(A.data[i * n + i]), - n * SCALAR_IM(A.data[i * n + i])); +void rand_posdef(sqmatrix A, sqmatrix X) { + int i, n = A.p; + + /* fill X with random data */ + for (i = 0; i < n * n; ++i) + ASSIGN_SCALAR(X.data[i], rand() * 1.0 / RAND_MAX, rand() * 1.0 / RAND_MAX); + + /* assign A = adjoint(X) * X to get a Hermitian matrix: */ + sqmatrix_AeBC(A, X, 1, X, 0); + + /* increase diagonal elements of A so that our preconditioner + has a chance of being useful: */ + for (i = 0; i < n; ++i) + ASSIGN_SCALAR(A.data[i * n + i], n * SCALAR_RE(A.data[i * n + i]), + n * SCALAR_IM(A.data[i * n + i])); } -int main(int argc, char **argv) -{ - int i, j, n = 0, p, trial; - sqmatrix X, U, YtY, Bcopy; - evectmatrix Y, Y2, Ystart, W[NWORK]; - real *eigvals, *eigvals_dense, sum = 0.0; - int num_iters, nWork = NWORK; - evectoperator bop = Bop; +int main(int argc, char **argv) { + int i, j, n = 0, p, trial; + sqmatrix X, U, YtY, Bcopy; + evectmatrix Y, Y2, Ystart, W[NWORK]; + real *eigvals, *eigvals_dense, sum = 0.0; + int num_iters, nWork = NWORK; + evectoperator bop = Bop; - if (argc >= 2) - n = atoi(argv[1]); + if (argc >= 2) n = atoi(argv[1]); - srand(argc >= 3 ? atoi(argv[2]) : time(NULL)); + srand(argc >= 3 ? atoi(argv[2]) : time(NULL)); - CHECK(n > 0, "illegal argument\nSyntax: eigs_test []"); + CHECK(n > 0, "illegal argument\nSyntax: eigs_test []"); - X = create_sqmatrix(n); - A = create_sqmatrix(n); - B = create_sqmatrix(n); - Bcopy = create_sqmatrix(n); - Ainv = create_sqmatrix(n); - U = create_sqmatrix(n); + X = create_sqmatrix(n); + A = create_sqmatrix(n); + B = create_sqmatrix(n); + Bcopy = create_sqmatrix(n); + Ainv = create_sqmatrix(n); + U = create_sqmatrix(n); - rand_posdef(A, X); - rand_posdef(B, X); + rand_posdef(A, X); + rand_posdef(B, X); #if 1 - for (i = 0; i < n; ++i) { - for (j = 0; j < n; ++j) { - ASSIGN_SCALAR(B.data[i * n + j], 0, 0); - } - ASSIGN_SCALAR(B.data[i * n + i], 1, 0); - } + for (i = 0; i < n; ++i) { + for (j = 0; j < n; ++j) { + ASSIGN_SCALAR(B.data[i * n + j], 0, 0); + } + ASSIGN_SCALAR(B.data[i * n + i], 1, 0); + } #endif - sqmatrix_copy(Ainv, A); - sqmatrix_invert(Ainv, 1, X); - - if (n <= 10) { - printf("Solving for eigenvalues of %d x %d matrix: \nA = ", n, n); - printmat_matlab(A.data, n, n); - printf("B = "); - printmat_matlab(B.data, n, n); - } - - CHK_MALLOC(eigvals_dense, real, n); - - p = MIN(MIN(5, MAX(n/4, 2)), n); - printf("\nSolving for %d eigenvals out of %d.\n", p, n); - YtY = create_sqmatrix(p); - Y = create_evectmatrix(n, 1, p, n, 0, n); - Y2 = create_evectmatrix(n, 1, p, n, 0, n); - Ystart = create_evectmatrix(n, 1, p, n, 0, n); - for (i = 0; i < nWork; ++i) - W[i] = create_evectmatrix(n, 1, p, n, 0, n); - CHK_MALLOC(eigvals, real, p); - - for (trial = 0; trial < 2; ++trial) { - sqmatrix_copy(U, A); - if (trial > 0) { - printf("=== Ordinary Eigenproblem ===\n"); - sqmatrix_eigensolve(U, eigvals_dense, X); - bop = NULL; - nWork = NWORK - 1; - } - else { - printf("=== Generalized Eigenproblem ===\n"); - sqmatrix_copy(Bcopy, B); - sqmatrix_gen_eigensolve(U, Bcopy, eigvals_dense, X); - } - - /* The eigenvectors are actually the columns of U'. Assign U = U': */ - for (i = 0; i < n; ++i) - for (j = i + 1; j < n; ++j) { - scalar dummy; - dummy = U.data[i*n + j]; - U.data[i*n + j] = U.data[j*n + i]; - U.data[j*n + i] = dummy; - } - for (i = 0; i < n * n; ++i) - ASSIGN_CONJ(U.data[i], U.data[i]); - - printf("\nSolved A by dense eigensolver.\nEigenvalues = "); - for (sum = 0.0, i = 0; i < p; ++i) { - sum += eigvals_dense[i]; - printf(" %f", eigvals_dense[i]); - } - printf("\nEigenvalue sum = %f\n", sum); - printf("\nEigenvectors are (by column): \n"); - printmat(U.data, n, p, n); - - for (i = 0; i < n*p; ++i) - ASSIGN_REAL(Ystart.data[i], rand() * 1.0 / RAND_MAX); - - /* Check inverse Ainvop: */ - Aop(Ystart, Y, NULL, 0, Y2); - Ainvop(Y, Y2, NULL, Ystart, NULL, U); - printf("\nDifference |Y - (1/A)*A*Y| / |Y| = %g\n", - norm_diff(Ystart.data, Y2.data, Y.n * Y.p)); - - evectmatrix_copy(Y, Ystart); - eigensolver(Y, eigvals, Aop,NULL, bop,NULL, Cop,NULL, NULL,NULL, - W, nWork, 1e-10,&num_iters, EIGS_DEFAULT_FLAGS); - printf("\nSolved for eigenvectors after %d iterations.\n", num_iters); - printf("\nEigenvalues = "); - for (sum = 0.0, i = 0; i < p; ++i) { - sum += eigvals[i]; - printf(" %f", eigvals[i]); - CHECK(fabs(eigvals[i]-eigvals_dense[i]) < 1e-5 * eigvals_dense[i], - "incorrect eigenvalue"); - } - printf("\nEigenvalue sum = %f\n", sum); - - /* Change phase of eigenvectors to match those solved for prev.: */ - for (i = 0; i < p; ++i) { - scalar phase; - - ASSIGN_DIV(phase, U.data[i], Y.data[i]); - - for (j = 0; j < n; ++j) { - ASSIGN_MULT(Y.data[j*p + i], Y.data[j*p + i], phase); - } - } - - printf("Eigenvectors are (by column): \n"); - printmat(Y.data, n, p, p); - evectmatrix_XtX(YtY, Y, U); - printf("adjoint(Y) * Y:\n"); - printmat(YtY.data, p, p, p); - - printf("\nSolving with exact inverse preconditioner...\n"); - evectmatrix_copy(Y, Ystart); - eigensolver(Y, eigvals, Aop,NULL, bop,NULL, Ainvop,NULL, NULL,NULL, - W, nWork, 1e-10, &num_iters, EIGS_DEFAULT_FLAGS); - printf("Solved for eigenvectors after %d iterations.\n", num_iters); - printf("\nEigenvalues = "); - for (sum = 0.0, i = 0; i < p; ++i) { - sum += eigvals[i]; - printf(" %f", eigvals[i]); - CHECK(fabs(eigvals[i]-eigvals_dense[i]) < 1e-5 * eigvals_dense[i], - "incorrect eigenvalue"); - } - printf("\nEigenvalue sum = %f\n", sum); - - printf("\nSolving without conjugate-gradient...\n"); - evectmatrix_copy(Y, Ystart); - eigensolver(Y, eigvals, Aop,NULL, bop,NULL, Cop,NULL, NULL,NULL, - W, nWork - 1, 1e-10, &num_iters, EIGS_DEFAULT_FLAGS); - printf("Solved for eigenvectors after %d iterations.\n", num_iters); - printf("\nEigenvalues = "); - for (sum = 0.0, i = 0; i < p; ++i) { - sum += eigvals[i]; - printf(" %f", eigvals[i]); - CHECK(fabs(eigvals[i]-eigvals_dense[i]) < 1e-5 * eigvals_dense[i], - "incorrect eigenvalue"); - } - printf("\nEigenvalue sum = %f\n", sum); - - printf("\nSolving without preconditioning...\n"); - evectmatrix_copy(Y, Ystart); - eigensolver(Y, eigvals, Aop,NULL, bop,NULL, NULL,NULL, NULL,NULL, - W, nWork, 1e-10, &num_iters, EIGS_DEFAULT_FLAGS); - printf("Solved for eigenvectors after %d iterations.\n", num_iters); - printf("\nEigenvalues = "); - for (sum = 0.0, i = 0; i < p; ++i) { - sum += eigvals[i]; - printf(" %f", eigvals[i]); - CHECK(fabs(eigvals[i]-eigvals_dense[i]) < 1e-5 * eigvals_dense[i], - "incorrect eigenvalue"); - } - printf("\nEigenvalue sum = %f\n", sum); - - printf("\nSolving without conjugate-gradient or preconditioning...\n"); - evectmatrix_copy(Y, Ystart); - eigensolver(Y, eigvals, Aop,NULL, bop,NULL, NULL,NULL, NULL,NULL, - W, nWork - 1, 1e-10, &num_iters, EIGS_DEFAULT_FLAGS); - printf("Solved for eigenvectors after %d iterations.\n", num_iters); - printf("\nEigenvalues = "); - for (sum = 0.0, i = 0; i < p; ++i) { - sum += eigvals[i]; - printf(" %f", eigvals[i]); - CHECK(fabs(eigvals[i]-eigvals_dense[i]) < 1e-5 * eigvals_dense[i], - "incorrect eigenvalue"); - } - printf("\nEigenvalue sum = %f\n", sum); - } - - destroy_sqmatrix(A); - destroy_sqmatrix(B); - destroy_sqmatrix(Bcopy); - destroy_sqmatrix(Ainv); - destroy_sqmatrix(X); - destroy_sqmatrix(U); - destroy_sqmatrix(YtY); - destroy_evectmatrix(Y); - destroy_evectmatrix(Y2); - destroy_evectmatrix(Ystart); - for (i = 0; i < nWork; ++i) - destroy_evectmatrix(W[i]); - - free(eigvals); - free(eigvals_dense); - - debug_check_memory_leaks(); - - return EXIT_SUCCESS; + sqmatrix_copy(Ainv, A); + sqmatrix_invert(Ainv, 1, X); + + if (n <= 10) { + printf("Solving for eigenvalues of %d x %d matrix: \nA = ", n, n); + printmat_matlab(A.data, n, n); + printf("B = "); + printmat_matlab(B.data, n, n); + } + + CHK_MALLOC(eigvals_dense, real, n); + + p = MIN(MIN(5, MAX(n / 4, 2)), n); + printf("\nSolving for %d eigenvals out of %d.\n", p, n); + YtY = create_sqmatrix(p); + Y = create_evectmatrix(n, 1, p, n, 0, n); + Y2 = create_evectmatrix(n, 1, p, n, 0, n); + Ystart = create_evectmatrix(n, 1, p, n, 0, n); + for (i = 0; i < nWork; ++i) + W[i] = create_evectmatrix(n, 1, p, n, 0, n); + CHK_MALLOC(eigvals, real, p); + + for (trial = 0; trial < 2; ++trial) { + sqmatrix_copy(U, A); + if (trial > 0) { + printf("=== Ordinary Eigenproblem ===\n"); + sqmatrix_eigensolve(U, eigvals_dense, X); + bop = NULL; + nWork = NWORK - 1; + } + else { + printf("=== Generalized Eigenproblem ===\n"); + sqmatrix_copy(Bcopy, B); + sqmatrix_gen_eigensolve(U, Bcopy, eigvals_dense, X); + } + + /* The eigenvectors are actually the columns of U'. Assign U = U': */ + for (i = 0; i < n; ++i) + for (j = i + 1; j < n; ++j) { + scalar dummy; + dummy = U.data[i * n + j]; + U.data[i * n + j] = U.data[j * n + i]; + U.data[j * n + i] = dummy; + } + for (i = 0; i < n * n; ++i) + ASSIGN_CONJ(U.data[i], U.data[i]); + + printf("\nSolved A by dense eigensolver.\nEigenvalues = "); + for (sum = 0.0, i = 0; i < p; ++i) { + sum += eigvals_dense[i]; + printf(" %f", eigvals_dense[i]); + } + printf("\nEigenvalue sum = %f\n", sum); + printf("\nEigenvectors are (by column): \n"); + printmat(U.data, n, p, n); + + for (i = 0; i < n * p; ++i) + ASSIGN_REAL(Ystart.data[i], rand() * 1.0 / RAND_MAX); + + /* Check inverse Ainvop: */ + Aop(Ystart, Y, NULL, 0, Y2); + Ainvop(Y, Y2, NULL, Ystart, NULL, U); + printf("\nDifference |Y - (1/A)*A*Y| / |Y| = %g\n", norm_diff(Ystart.data, Y2.data, Y.n * Y.p)); + + evectmatrix_copy(Y, Ystart); + eigensolver(Y, eigvals, Aop, NULL, bop, NULL, Cop, NULL, NULL, NULL, W, nWork, 1e-10, + &num_iters, EIGS_DEFAULT_FLAGS); + printf("\nSolved for eigenvectors after %d iterations.\n", num_iters); + printf("\nEigenvalues = "); + for (sum = 0.0, i = 0; i < p; ++i) { + sum += eigvals[i]; + printf(" %f", eigvals[i]); + CHECK(fabs(eigvals[i] - eigvals_dense[i]) < 1e-5 * eigvals_dense[i], "incorrect eigenvalue"); + } + printf("\nEigenvalue sum = %f\n", sum); + + /* Change phase of eigenvectors to match those solved for prev.: */ + for (i = 0; i < p; ++i) { + scalar phase; + + ASSIGN_DIV(phase, U.data[i], Y.data[i]); + + for (j = 0; j < n; ++j) { + ASSIGN_MULT(Y.data[j * p + i], Y.data[j * p + i], phase); + } + } + + printf("Eigenvectors are (by column): \n"); + printmat(Y.data, n, p, p); + evectmatrix_XtX(YtY, Y, U); + printf("adjoint(Y) * Y:\n"); + printmat(YtY.data, p, p, p); + + printf("\nSolving with exact inverse preconditioner...\n"); + evectmatrix_copy(Y, Ystart); + eigensolver(Y, eigvals, Aop, NULL, bop, NULL, Ainvop, NULL, NULL, NULL, W, nWork, 1e-10, + &num_iters, EIGS_DEFAULT_FLAGS); + printf("Solved for eigenvectors after %d iterations.\n", num_iters); + printf("\nEigenvalues = "); + for (sum = 0.0, i = 0; i < p; ++i) { + sum += eigvals[i]; + printf(" %f", eigvals[i]); + CHECK(fabs(eigvals[i] - eigvals_dense[i]) < 1e-5 * eigvals_dense[i], "incorrect eigenvalue"); + } + printf("\nEigenvalue sum = %f\n", sum); + + printf("\nSolving without conjugate-gradient...\n"); + evectmatrix_copy(Y, Ystart); + eigensolver(Y, eigvals, Aop, NULL, bop, NULL, Cop, NULL, NULL, NULL, W, nWork - 1, 1e-10, + &num_iters, EIGS_DEFAULT_FLAGS); + printf("Solved for eigenvectors after %d iterations.\n", num_iters); + printf("\nEigenvalues = "); + for (sum = 0.0, i = 0; i < p; ++i) { + sum += eigvals[i]; + printf(" %f", eigvals[i]); + CHECK(fabs(eigvals[i] - eigvals_dense[i]) < 1e-5 * eigvals_dense[i], "incorrect eigenvalue"); + } + printf("\nEigenvalue sum = %f\n", sum); + + printf("\nSolving without preconditioning...\n"); + evectmatrix_copy(Y, Ystart); + eigensolver(Y, eigvals, Aop, NULL, bop, NULL, NULL, NULL, NULL, NULL, W, nWork, 1e-10, + &num_iters, EIGS_DEFAULT_FLAGS); + printf("Solved for eigenvectors after %d iterations.\n", num_iters); + printf("\nEigenvalues = "); + for (sum = 0.0, i = 0; i < p; ++i) { + sum += eigvals[i]; + printf(" %f", eigvals[i]); + CHECK(fabs(eigvals[i] - eigvals_dense[i]) < 1e-5 * eigvals_dense[i], "incorrect eigenvalue"); + } + printf("\nEigenvalue sum = %f\n", sum); + + printf("\nSolving without conjugate-gradient or preconditioning...\n"); + evectmatrix_copy(Y, Ystart); + eigensolver(Y, eigvals, Aop, NULL, bop, NULL, NULL, NULL, NULL, NULL, W, nWork - 1, 1e-10, + &num_iters, EIGS_DEFAULT_FLAGS); + printf("Solved for eigenvectors after %d iterations.\n", num_iters); + printf("\nEigenvalues = "); + for (sum = 0.0, i = 0; i < p; ++i) { + sum += eigvals[i]; + printf(" %f", eigvals[i]); + CHECK(fabs(eigvals[i] - eigvals_dense[i]) < 1e-5 * eigvals_dense[i], "incorrect eigenvalue"); + } + printf("\nEigenvalue sum = %f\n", sum); + } + + destroy_sqmatrix(A); + destroy_sqmatrix(B); + destroy_sqmatrix(Bcopy); + destroy_sqmatrix(Ainv); + destroy_sqmatrix(X); + destroy_sqmatrix(U); + destroy_sqmatrix(YtY); + destroy_evectmatrix(Y); + destroy_evectmatrix(Y2); + destroy_evectmatrix(Ystart); + for (i = 0; i < nWork; ++i) + destroy_evectmatrix(W[i]); + + free(eigvals); + free(eigvals_dense); + + debug_check_memory_leaks(); + + return EXIT_SUCCESS; } -void Aop(evectmatrix Xin, evectmatrix Xout, void *data, - int is_current_eigenvector, evectmatrix Work) -{ - CHECK(A.p == Xin.n && A.p == Xout.n && Xin.p == Xout.p, - "matrices not conformant"); +void Aop(evectmatrix Xin, evectmatrix Xout, void *data, int is_current_eigenvector, + evectmatrix Work) { + CHECK(A.p == Xin.n && A.p == Xout.n && Xin.p == Xout.p, "matrices not conformant"); - blasglue_gemm('N', 'N', Xout.n, Xout.p, Xin.n, - 1.0, A.data, A.p, Xin.data, Xin.p, 0.0, Xout.data, Xout.p); + blasglue_gemm('N', 'N', Xout.n, Xout.p, Xin.n, 1.0, A.data, A.p, Xin.data, Xin.p, 0.0, Xout.data, + Xout.p); } -void Bop(evectmatrix Xin, evectmatrix Xout, void *data, - int is_current_eigenvector, evectmatrix Work) -{ - CHECK(B.p == Xin.n && B.p == Xout.n && Xin.p == Xout.p, - "matrices not conformant"); +void Bop(evectmatrix Xin, evectmatrix Xout, void *data, int is_current_eigenvector, + evectmatrix Work) { + CHECK(B.p == Xin.n && B.p == Xout.n && Xin.p == Xout.p, "matrices not conformant"); - blasglue_gemm('N', 'N', Xout.n, Xout.p, Xin.n, - 1.0, B.data, B.p, Xin.data, Xin.p, 0.0, Xout.data, Xout.p); + blasglue_gemm('N', 'N', Xout.n, Xout.p, Xin.n, 1.0, B.data, B.p, Xin.data, Xin.p, 0.0, Xout.data, + Xout.p); } -void Ainvop(evectmatrix Xin, evectmatrix Xout, void *data, - evectmatrix Y, real *eigenvals, sqmatrix YtY) -{ - CHECK(Ainv.p == Xin.n && Ainv.p == Xout.n && Xin.p == Xout.p, - "matrices not conformant"); +void Ainvop(evectmatrix Xin, evectmatrix Xout, void *data, evectmatrix Y, real *eigenvals, + sqmatrix YtY) { + CHECK(Ainv.p == Xin.n && Ainv.p == Xout.n && Xin.p == Xout.p, "matrices not conformant"); - blasglue_gemm('N', 'N', Xout.n, Xout.p, Xin.n, - 1.0, Ainv.data, Ainv.p, - Xin.data, Xin.p, 0.0, Xout.data, Xout.p); + blasglue_gemm('N', 'N', Xout.n, Xout.p, Xin.n, 1.0, Ainv.data, Ainv.p, Xin.data, Xin.p, 0.0, + Xout.data, Xout.p); } -void Cop_old(evectmatrix Xin, evectmatrix Xout, void *data, - evectmatrix Y, real *eigenvals, sqmatrix YtY) -{ - int in, ip; - - CHECK(A.p == Xin.n && A.p == Xout.n && Xin.p == Xout.p, - "matrices not conformant"); - - evectmatrix_XeYS(Xout, Xin, YtY, 1); - - for (in = 0; in < Xout.n; ++in) { - real diag; - - diag = SCALAR_NORMSQR(A.data[in * A.p + in]); - diag = (diag == 0.0) ? 1.0 : 1.0 / sqrt(diag); - - for (ip = 0; ip < Xout.p; ++ip) { - scalar xin = Xout.data[in * Xout.p + ip]; - ASSIGN_SCALAR(Xout.data[in * Xout.p + ip], - diag * SCALAR_RE(xin), - diag * SCALAR_IM(xin)); - } - } +void Cop_old(evectmatrix Xin, evectmatrix Xout, void *data, evectmatrix Y, real *eigenvals, + sqmatrix YtY) { + int in, ip; + + CHECK(A.p == Xin.n && A.p == Xout.n && Xin.p == Xout.p, "matrices not conformant"); + + evectmatrix_XeYS(Xout, Xin, YtY, 1); + + for (in = 0; in < Xout.n; ++in) { + real diag; + + diag = SCALAR_NORMSQR(A.data[in * A.p + in]); + diag = (diag == 0.0) ? 1.0 : 1.0 / sqrt(diag); + + for (ip = 0; ip < Xout.p; ++ip) { + scalar xin = Xout.data[in * Xout.p + ip]; + ASSIGN_SCALAR(Xout.data[in * Xout.p + ip], diag * SCALAR_RE(xin), diag * SCALAR_IM(xin)); + } + } } -void Cop(evectmatrix Xin, evectmatrix Xout, void *data, - evectmatrix Y, real *eigenvals, sqmatrix YtY) -{ - int in, ip; - - CHECK(A.p == Xin.n && A.p == Xout.n && Xin.p == Xout.p, - "matrices not conformant"); - - evectmatrix_XeYS(Xout, Xin, YtY, 1); - - for (in = 0; in < Xout.n; ++in) { - scalar diag = A.data[in * A.p + in]; - - for (ip = 0; ip < Xout.p; ++ip) { - scalar scale; - if (eigenvals) { - ASSIGN_SCALAR(scale, - SCALAR_RE(diag) - 0*eigenvals[ip], - SCALAR_IM(diag)); - } - else - scale = diag; - ASSIGN_DIV(Xout.data[in * Xout.p + ip], - Xout.data[in * Xout.p + ip], - scale); - } - } +void Cop(evectmatrix Xin, evectmatrix Xout, void *data, evectmatrix Y, real *eigenvals, + sqmatrix YtY) { + int in, ip; + + CHECK(A.p == Xin.n && A.p == Xout.n && Xin.p == Xout.p, "matrices not conformant"); + + evectmatrix_XeYS(Xout, Xin, YtY, 1); + + for (in = 0; in < Xout.n; ++in) { + scalar diag = A.data[in * A.p + in]; + + for (ip = 0; ip < Xout.p; ++ip) { + scalar scale; + if (eigenvals) { ASSIGN_SCALAR(scale, SCALAR_RE(diag) - 0 * eigenvals[ip], SCALAR_IM(diag)); } + else + scale = diag; + ASSIGN_DIV(Xout.data[in * Xout.p + ip], Xout.data[in * Xout.p + ip], scale); + } + } } -void printmat(scalar *A, int m, int n, int ldn) -{ +void printmat(scalar *A, int m, int n, int ldn) { int i, j; for (i = 0; i < m; ++i) { - for (j = 0; j < n; ++j) { + for (j = 0; j < n; ++j) { #ifdef SCALAR_COMPLEX - printf(" (%6.3f,%6.3f)", A[i*ldn + j].re, A[i*ldn + j].im); + printf(" (%6.3f,%6.3f)", A[i * ldn + j].re, A[i * ldn + j].im); #else - printf(" %6.3f", A[i*ldn + j]); + printf(" %6.3f", A[i * ldn + j]); #endif - if (j > 7) { - printf(" ..."); - break; - } - } - printf("\n"); - if (i > 7) { - printf(" ...\n"); - break; - } + if (j > 7) { + printf(" ..."); + break; + } + } + printf("\n"); + if (i > 7) { + printf(" ...\n"); + break; + } } } -void printmat_matlab(scalar *A, int m, int n) -{ +void printmat_matlab(scalar *A, int m, int n) { int i, j; printf("["); for (i = 0; i < m; ++i) { - for (j = 0; j < n; ++j) { + for (j = 0; j < n; ++j) { #ifdef SCALAR_COMPLEX - printf(" %g+%gi", A[i*n + j].re, A[i*n + j].im); + printf(" %g+%gi", A[i * n + j].re, A[i * n + j].im); #else - printf(" %g", A[i*n + j]); + printf(" %g", A[i * n + j]); #endif - } + } printf(";\n"); } printf("]\n"); diff --git a/tests/malloctest.c b/tests/malloctest.c index 165c7e81..20932500 100644 --- a/tests/malloctest.c +++ b/tests/malloctest.c @@ -26,52 +26,47 @@ #define MAX_SIZE 65536 #define NUM_MALLOCS 3000000 -int main(void) -{ - char *pointers[NUM_POINTERS]; - int i, iter; - clock_t start_time; - double elapsed_time; +int main(void) { + char *pointers[NUM_POINTERS]; + int i, iter; + clock_t start_time; + double elapsed_time; #ifdef DEBUG_MALLOC - printf("Using debug_malloc and debug_free routines.\n"); + printf("Using debug_malloc and debug_free routines.\n"); #else - fprintf(stderr, - "***** NOTE: malloctest is designed to be run when the package\n" - " is configured with --enable-debug, to test the\n" - " debugging malloc/free routines.\n"); + fprintf(stderr, "***** NOTE: malloctest is designed to be run when the package\n" + " is configured with --enable-debug, to test the\n" + " debugging malloc/free routines.\n"); #endif - srand(time(NULL)); + srand(time(NULL)); - for (i = 0; i < NUM_POINTERS; ++i) - pointers[i] = NULL; + for (i = 0; i < NUM_POINTERS; ++i) + pointers[i] = NULL; - printf("Doing %d malloc/free calls...\n", NUM_MALLOCS); - start_time = clock(); - for (iter = 0; iter < NUM_MALLOCS; ++iter) { - i = rand() % NUM_POINTERS; - if (pointers[i]) - free(pointers[i]); - CHK_MALLOC(pointers[i], char, rand() % MAX_SIZE + 1); - if ((iter + 1) % (NUM_MALLOCS / 20) == 0) - printf("...completed %d...\n", iter + 1); - } - elapsed_time = (clock() - start_time) * 1.0 / CLOCKS_PER_SEC; - printf("Done.\n"); + printf("Doing %d malloc/free calls...\n", NUM_MALLOCS); + start_time = clock(); + for (iter = 0; iter < NUM_MALLOCS; ++iter) { + i = rand() % NUM_POINTERS; + if (pointers[i]) free(pointers[i]); + CHK_MALLOC(pointers[i], char, rand() % MAX_SIZE + 1); + if ((iter + 1) % (NUM_MALLOCS / 20) == 0) printf("...completed %d...\n", iter + 1); + } + elapsed_time = (clock() - start_time) * 1.0 / CLOCKS_PER_SEC; + printf("Done.\n"); - printf("Total time = %g seconds, %g us per iteration\n", - elapsed_time, elapsed_time * 1e6 / NUM_MALLOCS); + printf("Total time = %g seconds, %g us per iteration\n", elapsed_time, + elapsed_time * 1e6 / NUM_MALLOCS); - for (i = 0; i < NUM_POINTERS; ++i) - if (pointers[i]) - free(pointers[i]); + for (i = 0; i < NUM_POINTERS; ++i) + if (pointers[i]) free(pointers[i]); #ifdef DEBUG_MALLOC - debug_check_memory_leaks(); + debug_check_memory_leaks(); #endif - - printf("Okay.\n"); - return EXIT_SUCCESS; + printf("Okay.\n"); + + return EXIT_SUCCESS; } diff --git a/tests/maxwell_test.c b/tests/maxwell_test.c index 8f8c0f23..4b12c6b7 100644 --- a/tests/maxwell_test.c +++ b/tests/maxwell_test.c @@ -28,20 +28,20 @@ #include #if defined(DEBUG) && defined(HAVE_FEENABLEEXCEPT) -# ifndef _GNU_SOURCE -# define _GNU_SOURCE 1 -# endif -# include -# if !HAVE_DECL_FEENABLEEXCEPT -int feenableexcept (int EXCEPTS); -# endif +#ifndef _GNU_SOURCE +#define _GNU_SOURCE 1 +#endif +#include +#if !HAVE_DECL_FEENABLEEXCEPT +int feenableexcept(int EXCEPTS); +#endif #endif #if defined(HAVE_GETOPT_H) -# include +#include #endif #if defined(HAVE_UNISTD_H) -# include +#include #endif #define NX 32 @@ -60,9 +60,9 @@ int feenableexcept (int EXCEPTS); #define ERROR_TOL 1e-4 #ifdef ENABLE_PROF -# define PROF_ITERS 10 +#define PROF_ITERS 10 #else -# define PROF_ITERS 1 +#define PROF_ITERS 1 #endif #define MESH_SIZE 7 @@ -70,40 +70,37 @@ int feenableexcept (int EXCEPTS); /*************************************************************************/ typedef struct { - real eps_high, eps_low, eps_high_x; + real eps_high, eps_low, eps_high_x; } epsilon_data; #define INVERSION_SYM 1 -static void epsilon(symmetric_matrix *eps, symmetric_matrix *eps_inv, - const real r[3], void *edata_v) -{ - epsilon_data *edata = (epsilon_data *) edata_v; - real eps_val; +static void epsilon(symmetric_matrix *eps, symmetric_matrix *eps_inv, const real r[3], + void *edata_v) { + epsilon_data *edata = (epsilon_data *)edata_v; + real eps_val; #if INVERSION_SYM - if (fabs(r[0]) < 0.5*edata->eps_high_x - || fabs(r[0]-1.0) < 0.5*edata->eps_high_x) - eps_val = edata->eps_high; + if (fabs(r[0]) < 0.5 * edata->eps_high_x || fabs(r[0] - 1.0) < 0.5 * edata->eps_high_x) + eps_val = edata->eps_high; #else - if ((r[0] < edata->eps_high_x && r[0] >= 0.0) || - (r[0] >= 1.0 && r[0] - 1.0 < edata->eps_high_x)) - eps_val = edata->eps_high; + if ((r[0] < edata->eps_high_x && r[0] >= 0.0) || (r[0] >= 1.0 && r[0] - 1.0 < edata->eps_high_x)) + eps_val = edata->eps_high; #endif - else - eps_val = edata->eps_low; - eps->m00 = eps->m11 = eps->m22 = eps_val; - eps_inv->m00 = eps_inv->m11 = eps_inv->m22 = 1.0 / eps_val; + else + eps_val = edata->eps_low; + eps->m00 = eps->m11 = eps->m22 = eps_val; + eps_inv->m00 = eps_inv->m11 = eps_inv->m22 = 1.0 / eps_val; #ifdef WITH_HERMITIAN_EPSILON - CASSIGN_ZERO(eps->m01); - CASSIGN_ZERO(eps->m02); - CASSIGN_ZERO(eps->m12); - CASSIGN_ZERO(eps_inv->m01); - CASSIGN_ZERO(eps_inv->m02); - CASSIGN_ZERO(eps_inv->m12); + CASSIGN_ZERO(eps->m01); + CASSIGN_ZERO(eps->m02); + CASSIGN_ZERO(eps->m12); + CASSIGN_ZERO(eps_inv->m01); + CASSIGN_ZERO(eps_inv->m02); + CASSIGN_ZERO(eps_inv->m12); #else - eps->m01 = eps->m02 = eps->m12 = 0.0; - eps_inv->m01 = eps_inv->m02 = eps_inv->m12 = 0.0; + eps->m01 = eps->m02 = eps->m12 = 0.0; + eps_inv->m01 = eps_inv->m02 = eps_inv->m12 = 0.0; #endif } @@ -120,463 +117,390 @@ static const double TWOPI = 6.2831853071795864769252867665590057683943388; n1 and n2 are the indices of the two dielectrics, and f1 and f2 are their thicknesses as a fraction of the lattice constant (we should have f1 + f2 == 1). */ -real analytic_bragg_k(real omega, real n1, real f1, real n2, real f2) -{ - real phase1, phase2, c1, s1, c2, s2, b; +real analytic_bragg_k(real omega, real n1, real f1, real n2, real f2) { + real phase1, phase2, c1, s1, c2, s2, b; - CHECK(fabs(f1 + f2 - 1) < 1e-6, "invalid params to analytic_bragg_k"); + CHECK(fabs(f1 + f2 - 1) < 1e-6, "invalid params to analytic_bragg_k"); - phase1 = TWOPI * n1 * f1 * omega; - phase2 = TWOPI * n2 * f2 * omega; - c1 = cos(phase1); s1 = sin(phase1); - c2 = cos(phase2); s2 = sin(phase2); + phase1 = TWOPI * n1 * f1 * omega; + phase2 = TWOPI * n2 * f2 * omega; + c1 = cos(phase1); + s1 = sin(phase1); + c2 = cos(phase2); + s2 = sin(phase2); - b = c1*c2 - 0.5 * (n1/n2 + n2/n1) * s1*s2; + b = c1 * c2 - 0.5 * (n1 / n2 + n2 / n1) * s1 * s2; - if (fabs(b) > 1) - return (-1.0); + if (fabs(b) > 1) return (-1.0); - return fabs(atan2(sqrt(1-b*b), b) / TWOPI); + return fabs(atan2(sqrt(1 - b * b), b) / TWOPI); } /* Solve for Bragg omega for the given k and other parameters, - using omega_guess as a starting guess. + using omega_guess as a starting guess. We can't do anything clever like Newton's method or even an ordinary bisection search because there are regions of omega in which analytic_bragg_k is not defined (i.e. in the band gap). */ -real bragg_omega(real omega_guess, - real k, - real n1, real f1, real n2, real f2, - real tolerance) -{ - real omega_guess_low = omega_guess - 0.2, - omega_guess_high = omega_guess + 0.2; - real k_cur; - real k_best = -1.0, omega_best = 0.0; - real tol; - - if (omega_guess_low < 0.0) - omega_guess_low = 0.0; - - for (tol = (omega_guess_high - omega_guess_low) / 10.0; - tol > tolerance; - tol *= 0.25) { - for (omega_guess = omega_guess_low + tol; - omega_guess < omega_guess_high; - omega_guess += tol) { - k_cur = analytic_bragg_k(omega_guess, n1, f1, n2, f2); - if (fabs(k_cur - k) < fabs(k_best - k)) { - k_best = k_cur; - omega_best = omega_guess; - } - } - - CHECK(k_best > 0.0, "No valid omega values in guess range!"); - - omega_guess_low = omega_best - tol; - omega_guess_high = omega_best + tol; - } - - return omega_best; +real bragg_omega(real omega_guess, real k, real n1, real f1, real n2, real f2, real tolerance) { + real omega_guess_low = omega_guess - 0.2, omega_guess_high = omega_guess + 0.2; + real k_cur; + real k_best = -1.0, omega_best = 0.0; + real tol; + + if (omega_guess_low < 0.0) omega_guess_low = 0.0; + + for (tol = (omega_guess_high - omega_guess_low) / 10.0; tol > tolerance; tol *= 0.25) { + for (omega_guess = omega_guess_low + tol; omega_guess < omega_guess_high; omega_guess += tol) { + k_cur = analytic_bragg_k(omega_guess, n1, f1, n2, f2); + if (fabs(k_cur - k) < fabs(k_best - k)) { + k_best = k_cur; + omega_best = omega_guess; + } + } + + CHECK(k_best > 0.0, "No valid omega values in guess range!"); + + omega_guess_low = omega_best - tol; + omega_guess_high = omega_best + tol; + } + + return omega_best; } - /*************************************************************************/ -real norm_diff(scalar *a, scalar *b, int n) -{ - real bmag = 0.0, diffmag = 0.0; - int i; - for (i = 0; i < n; ++i) { - scalar d; - ASSIGN_SCALAR(d, - SCALAR_RE(b[i]) - SCALAR_RE(a[i]), - SCALAR_IM(b[i]) - SCALAR_IM(a[i])); - bmag += SCALAR_NORMSQR(b[i]); - diffmag += SCALAR_NORMSQR(d); - } - return sqrt(diffmag / bmag); +real norm_diff(scalar *a, scalar *b, int n) { + real bmag = 0.0, diffmag = 0.0; + int i; + for (i = 0; i < n; ++i) { + scalar d; + ASSIGN_SCALAR(d, SCALAR_RE(b[i]) - SCALAR_RE(a[i]), SCALAR_IM(b[i]) - SCALAR_IM(a[i])); + bmag += SCALAR_NORMSQR(b[i]); + diffmag += SCALAR_NORMSQR(d); + } + return sqrt(diffmag / bmag); } /*************************************************************************/ -void usage(void) -{ - printf("Syntax: maxwell_test [options]\n" - "Options:\n" - " -h Print this help\n" - " -s Set random seed\n" - " -k Set kx wavevector component [dflt. = %f]\n" - " -b Compute n bands [default = %d]\n" - " -n Specify high-dielectric index [dflt. = %f]\n" - " -f Specify high-index fill fraction [dflt. = %f]\n" - " -x Use nx points in x direction [dflt. = %d]\n" - " -y Use ny points in y direction [dflt. = %d]\n" - " -z Use nz points in z direction [dflt. = %d]\n" - " -e Solve for TE polarization only.\n" - " -m Solve for TM polarization only.\n" - " -t Set target frequency [dflt. none].\n" - " -c Set convergence tolerance [dflt. %e].\n" - " -g Set mesh size [dflt. %d].\n" - " -1 Stop after first computation.\n" - " -p Use simple preconditioner.\n" - " -E Exit with error if the error exceeds \n" - " -v Verbose output.\n", - KX, NUM_BANDS, sqrt(EPS_HIGH), EPS_HIGH_X, NX, NY, NZ, - ERROR_TOL, MESH_SIZE); +void usage(void) { + printf("Syntax: maxwell_test [options]\n" + "Options:\n" + " -h Print this help\n" + " -s Set random seed\n" + " -k Set kx wavevector component [dflt. = %f]\n" + " -b Compute n bands [default = %d]\n" + " -n Specify high-dielectric index [dflt. = %f]\n" + " -f Specify high-index fill fraction [dflt. = %f]\n" + " -x Use nx points in x direction [dflt. = %d]\n" + " -y Use ny points in y direction [dflt. = %d]\n" + " -z Use nz points in z direction [dflt. = %d]\n" + " -e Solve for TE polarization only.\n" + " -m Solve for TM polarization only.\n" + " -t Set target frequency [dflt. none].\n" + " -c Set convergence tolerance [dflt. %e].\n" + " -g Set mesh size [dflt. %d].\n" + " -1 Stop after first computation.\n" + " -p Use simple preconditioner.\n" + " -E Exit with error if the error exceeds \n" + " -v Verbose output.\n", + KX, NUM_BANDS, sqrt(EPS_HIGH), EPS_HIGH_X, NX, NY, NZ, ERROR_TOL, MESH_SIZE); } /*************************************************************************/ -int main(int argc, char **argv) -{ - maxwell_data *mdata; - maxwell_target_data *mtdata = NULL; - int local_N, N_start, alloc_N; - real R[3][3] = { {1,0,0}, {0,0.01,0}, {0,0,0.01} }; - real G[3][3] = { {1,0,0}, {0,100,0}, {0,0,100} }; - real kvector[3] = {KX,0,0}; - evectmatrix H, Hstart, W[NWORK]; - real *eigvals; - int i, iters; - int num_iters; - int parity = NO_PARITY; - int nx = NX, ny = NY, nz = NZ; - int num_bands = NUM_BANDS; - real target_freq = 0.0; - int do_target = 0; - evectoperator op; - evectpreconditioner pre_op; - void *op_data, *pre_op_data; - real error_tol = ERROR_TOL; - int mesh_size = MESH_SIZE, mesh[3]; - epsilon_data ed; - int stop1 = 0; - int verbose = 0; - int which_preconditioner = 2; - double max_err = 1e20; - - srand(time(NULL)); +int main(int argc, char **argv) { + maxwell_data *mdata; + maxwell_target_data *mtdata = NULL; + int local_N, N_start, alloc_N; + real R[3][3] = {{1, 0, 0}, {0, 0.01, 0}, {0, 0, 0.01}}; + real G[3][3] = {{1, 0, 0}, {0, 100, 0}, {0, 0, 100}}; + real kvector[3] = {KX, 0, 0}; + evectmatrix H, Hstart, W[NWORK]; + real *eigvals; + int i, iters; + int num_iters; + int parity = NO_PARITY; + int nx = NX, ny = NY, nz = NZ; + int num_bands = NUM_BANDS; + real target_freq = 0.0; + int do_target = 0; + evectoperator op; + evectpreconditioner pre_op; + void *op_data, *pre_op_data; + real error_tol = ERROR_TOL; + int mesh_size = MESH_SIZE, mesh[3]; + epsilon_data ed; + int stop1 = 0; + int verbose = 0; + int which_preconditioner = 2; + double max_err = 1e20; + + srand(time(NULL)); #if defined(DEBUG) && defined(HAVE_FEENABLEEXCEPT) - feenableexcept(FE_INVALID | FE_OVERFLOW); /* crash on NaN/overflow */ + feenableexcept(FE_INVALID | FE_OVERFLOW); /* crash on NaN/overflow */ #endif - - ed.eps_high = EPS_HIGH; - ed.eps_low = EPS_LOW; - ed.eps_high_x = EPS_HIGH_X; + ed.eps_high = EPS_HIGH; + ed.eps_low = EPS_LOW; + ed.eps_high_x = EPS_HIGH_X; #ifdef HAVE_GETOPT - { - extern char *optarg; - extern int optind; - int c; - - while ((c = getopt(argc, argv, "hs:k:b:n:f:x:y:z:emt:c:g:1pvE:")) - != -1) - switch (c) { - case 'h': - usage(); - exit(EXIT_SUCCESS); - break; - case 's': - srand(atoi(optarg)); - break; - case 'k': - kvector[0] = atof(optarg); - break; - case 'b': - num_bands = atoi(optarg); - CHECK(num_bands > 0, "num_bands must be positive"); - break; - case 'n': - ed.eps_high = atof(optarg); - CHECK(ed.eps_high > 0.0, "index must be positive"); - ed.eps_high = ed.eps_high * ed.eps_high; - break; - case 'f': - ed.eps_high_x = atof(optarg); - CHECK(ed.eps_high_x > 0.0, "fill must be positive"); - break; - case 'x': - nx = atoi(optarg); - CHECK(nx > 0, "x size must be positive"); - break; - case 'y': - ny = atoi(optarg); - CHECK(ny > 0, "y size must be positive"); - break; - case 'z': - nz = atoi(optarg); - CHECK(nz > 0, "z size must be positive"); - break; - case 'e': - parity = EVEN_Z_PARITY; - break; - case 'm': - parity = ODD_Z_PARITY; - break; - case 't': - target_freq = fabs(atof(optarg)); - do_target = 1; - break; - case 'E': - max_err = fabs(atof(optarg)); - CHECK(max_err > 0, "maximum error must be positive"); - break; - case 'c': - error_tol = fabs(atof(optarg)); - break; - case 'g': - mesh_size = atoi(optarg); - CHECK(mesh_size > 0, "mesh size must be positive"); - break; - case '1': - stop1 = 1; - break; - case 'p': - which_preconditioner = 1; - break; - case 'v': - verbose = 1; - break; - default: - usage(); - exit(EXIT_FAILURE); - } - - if (argc != optind) { - usage(); - exit(EXIT_FAILURE); - } - } + { + extern char *optarg; + extern int optind; + int c; + + while ((c = getopt(argc, argv, "hs:k:b:n:f:x:y:z:emt:c:g:1pvE:")) != -1) + switch (c) { + case 'h': + usage(); + exit(EXIT_SUCCESS); + break; + case 's': srand(atoi(optarg)); break; + case 'k': kvector[0] = atof(optarg); break; + case 'b': + num_bands = atoi(optarg); + CHECK(num_bands > 0, "num_bands must be positive"); + break; + case 'n': + ed.eps_high = atof(optarg); + CHECK(ed.eps_high > 0.0, "index must be positive"); + ed.eps_high = ed.eps_high * ed.eps_high; + break; + case 'f': + ed.eps_high_x = atof(optarg); + CHECK(ed.eps_high_x > 0.0, "fill must be positive"); + break; + case 'x': + nx = atoi(optarg); + CHECK(nx > 0, "x size must be positive"); + break; + case 'y': + ny = atoi(optarg); + CHECK(ny > 0, "y size must be positive"); + break; + case 'z': + nz = atoi(optarg); + CHECK(nz > 0, "z size must be positive"); + break; + case 'e': parity = EVEN_Z_PARITY; break; + case 'm': parity = ODD_Z_PARITY; break; + case 't': + target_freq = fabs(atof(optarg)); + do_target = 1; + break; + case 'E': + max_err = fabs(atof(optarg)); + CHECK(max_err > 0, "maximum error must be positive"); + break; + case 'c': error_tol = fabs(atof(optarg)); break; + case 'g': + mesh_size = atoi(optarg); + CHECK(mesh_size > 0, "mesh size must be positive"); + break; + case '1': stop1 = 1; break; + case 'p': which_preconditioner = 1; break; + case 'v': verbose = 1; break; + default: usage(); exit(EXIT_FAILURE); + } + + if (argc != optind) { + usage(); + exit(EXIT_FAILURE); + } + } #endif #ifdef ENABLE_PROF - stop1 = 1; + stop1 = 1; #endif - mesh[0] = mesh[1] = mesh[2] = mesh_size; - - printf("Creating Maxwell data...\n"); - mdata = create_maxwell_data(nx, ny, nz, &local_N, &N_start, &alloc_N, - num_bands, NUM_FFT_BANDS); - CHECK(mdata, "NULL mdata"); - - set_maxwell_data_parity(mdata, parity); - - printf("Setting k vector to (%g, %g, %g)...\n", - kvector[0], kvector[1], kvector[2]); - update_maxwell_data_k(mdata, kvector, G[0], G[1], G[2]); - - printf("Initializing dielectric...\n"); - /* set up dielectric function (a simple Bragg mirror) */ - set_maxwell_dielectric(mdata, mesh, R, G, epsilon, 0, &ed); - - if (verbose && ny == 1 && nz == 1) { - printf("dielectric function:\n"); - for (i = 0; i < nx; ++i) { - if (mdata->eps_inv[i].m00 == mdata->eps_inv[i].m11) - printf(" eps(%g) = %g\n", i * 1.0 / nx, - 1.0/mdata->eps_inv[i].m00); - - else - printf(" eps(%g) = x: %g OR y: %g\n", i * 1.0 / nx, - 1.0/mdata->eps_inv[i].m00, - 1.0/mdata->eps_inv[i].m11); - } - printf("\n"); - } - - printf("Allocating fields...\n"); - H = create_evectmatrix(nx * ny * nz, 2, num_bands, - local_N, N_start, alloc_N); - Hstart = create_evectmatrix(nx * ny * nz, 2, num_bands, - local_N, N_start, alloc_N); - for (i = 0; i < NWORK; ++i) - W[i] = create_evectmatrix(nx * ny * nz, 2, num_bands, - local_N, N_start, alloc_N); - - CHK_MALLOC(eigvals, real, num_bands); - - for (iters = 0; iters < PROF_ITERS; ++iters) { - - printf("Initializing fields...\n"); - for (i = 0; i < H.n * H.p; ++i) - ASSIGN_REAL(Hstart.data[i], rand() * 1.0 / RAND_MAX); - - /*****************************************/ - if (do_target) { - printf("\nSolving for eigenvectors close to %f...\n", target_freq); - mtdata = create_maxwell_target_data(mdata, target_freq); - op = maxwell_target_operator; - if (which_preconditioner == 1) - pre_op = maxwell_target_preconditioner; - else - pre_op = maxwell_target_preconditioner2; - op_data = (void *) mtdata; - pre_op_data = (void *) mtdata; - } - else { - op = maxwell_operator; - if (which_preconditioner == 1) - pre_op = maxwell_preconditioner; - else - pre_op = maxwell_preconditioner2; - op_data = (void *) mdata; - pre_op_data = (void *) mdata; - } - - /*****************************************/ - printf("\nSolving for eigenvectors with preconditioning...\n"); - evectmatrix_copy(H, Hstart); - eigensolver(H, eigvals, - op, op_data, NULL,NULL, - pre_op, pre_op_data, - maxwell_parity_constraint, (void *) mdata, - W, NWORK, error_tol, &num_iters, EIGS_DEFAULT_FLAGS); - - if (do_target) - eigensolver_get_eigenvals(H, eigvals, maxwell_operator, mdata, - W[0], W[1]); - - printf("Solved for eigenvectors after %d iterations.\n", num_iters); - printf("%15s%15s%15s%15s\n","eigenval", "frequency", "exact freq.", - "error"); - for (i = 0; i < num_bands; ++i) { - double err; - real freq = sqrt(eigvals[i]); - real exact_freq = bragg_omega(freq, kvector[0], sqrt(ed.eps_high), - ed.eps_high_x, sqrt(ed.eps_low), - 1.0 - ed.eps_high_x, 1.0e-7); - printf("%15f%15f%15f%15e\n", eigvals[i], freq, exact_freq, - err = fabs(freq - exact_freq) / exact_freq); - CHECK(err <= max_err, "error exceeds tolerance"); - } - printf("\n"); - - for (i = 0; i < num_bands; ++i) { - real kdom[3]; - real k; - maxwell_dominant_planewave(mdata, H, i + 1, kdom); - if ((i + 1) % 2 == 1) - k = kvector[0] + (i + 1) / 2; - else - k = kvector[0] - (i + 1) / 2; - if (kvector[0] > 0 && kvector[0] < 0.5 && ed.eps_high == 1) { - printf("Expected kdom: %15f%15f%15f\n", k, kvector[1], kvector[2]); - printf("Got kdom: %15f%15f%15f\n", kdom[0], kdom[1], kdom[2]); - CHECK(k == kdom[0] && kvector[1] == kdom[1] && kvector[2] == kdom[2], - "unexpected result from maxwell_dominant_planewave"); - } - } - } - - if (!stop1) { - - /*****************************************/ - - printf("\nSolving for eigenvectors without preconditioning...\n"); - evectmatrix_copy(H, Hstart); - eigensolver(H, eigvals, - op, op_data, NULL,NULL, - NULL, NULL, - maxwell_parity_constraint, (void *) mdata, - W, NWORK, error_tol, &num_iters, EIGS_DEFAULT_FLAGS); - - if (do_target) - eigensolver_get_eigenvals(H, eigvals, maxwell_operator, mdata, - W[0], W[1]); - - printf("Solved for eigenvectors after %d iterations.\n", num_iters); - printf("%15s%15s%15s%15s\n","eigenval", "frequency", "exact freq.", - "error"); - for (i = 0; i < num_bands; ++i) { - double err; - real freq = sqrt(eigvals[i]); - real exact_freq = bragg_omega(freq, kvector[0], sqrt(ed.eps_high), - ed.eps_high_x, sqrt(ed.eps_low), - 1.0 - ed.eps_high_x, 1.0e-7); - printf("%15f%15f%15f%15e\n", eigvals[i], freq, exact_freq, - err = fabs(freq - exact_freq) / exact_freq); - CHECK(err <= max_err, "error exceeds tolerance"); - } - printf("\n"); - - /*****************************************/ - - printf("\nSolving for eigenvectors without conj. grad...\n"); - evectmatrix_copy(H, Hstart); - eigensolver(H, eigvals, - op, op_data, NULL,NULL, - pre_op, pre_op_data, - maxwell_parity_constraint, (void *) mdata, - W, NWORK - 1, error_tol, &num_iters, EIGS_DEFAULT_FLAGS); - - if (do_target) - eigensolver_get_eigenvals(H, eigvals, maxwell_operator, mdata, - W[0], W[1]); - - printf("Solved for eigenvectors after %d iterations.\n", num_iters); - printf("%15s%15s%15s%15s\n","eigenval", "frequency", "exact freq.", - "error"); - for (i = 0; i < num_bands; ++i) { - double err; - real freq = sqrt(eigvals[i]); - real exact_freq = bragg_omega(freq, kvector[0], sqrt(ed.eps_high), - ed.eps_high_x, sqrt(ed.eps_low), - 1.0 - ed.eps_high_x, 1.0e-7); - printf("%15f%15f%15f%15e\n", eigvals[i], freq, exact_freq, - err = fabs(freq - exact_freq) / exact_freq); - CHECK(err <= max_err, "error exceeds tolerance"); - } - printf("\n"); - - /*****************************************/ - printf("\nSolving for eigenvectors without precond. or conj. grad...\n"); - evectmatrix_copy(H, Hstart); - eigensolver(H, eigvals, - op, op_data, - NULL, NULL, NULL,NULL, - maxwell_parity_constraint, (void *) mdata, - W, NWORK - 1, error_tol, &num_iters, EIGS_DEFAULT_FLAGS); - - if (do_target) - eigensolver_get_eigenvals(H, eigvals, maxwell_operator, mdata, - W[0], W[1]); - - printf("Solved for eigenvectors after %d iterations.\n", num_iters); - printf("%15s%15s%15s%15s\n","eigenval", "frequency", "exact freq.", - "error"); - for (i = 0; i < num_bands; ++i) { - double err; - real freq = sqrt(eigvals[i]); - real exact_freq = bragg_omega(freq, kvector[0], sqrt(ed.eps_high), - ed.eps_high_x, sqrt(ed.eps_low), - 1.0 - ed.eps_high_x, 1.0e-7); - printf("%15f%15f%15f%15e\n", eigvals[i], freq, exact_freq, - err = fabs(freq - exact_freq) / exact_freq); - CHECK(err <= max_err, "error exceeds tolerance"); - } - printf("\n"); - - /*****************************************/ - - } - - destroy_evectmatrix(H); - destroy_evectmatrix(Hstart); - for (i = 0; i < NWORK; ++i) - destroy_evectmatrix(W[i]); - - destroy_maxwell_target_data(mtdata); - destroy_maxwell_data(mdata); - - free(eigvals); - - debug_check_memory_leaks(); - - return EXIT_SUCCESS; + mesh[0] = mesh[1] = mesh[2] = mesh_size; + + printf("Creating Maxwell data...\n"); + mdata = create_maxwell_data(nx, ny, nz, &local_N, &N_start, &alloc_N, num_bands, NUM_FFT_BANDS); + CHECK(mdata, "NULL mdata"); + + set_maxwell_data_parity(mdata, parity); + + printf("Setting k vector to (%g, %g, %g)...\n", kvector[0], kvector[1], kvector[2]); + update_maxwell_data_k(mdata, kvector, G[0], G[1], G[2]); + + printf("Initializing dielectric...\n"); + /* set up dielectric function (a simple Bragg mirror) */ + set_maxwell_dielectric(mdata, mesh, R, G, epsilon, 0, &ed); + + if (verbose && ny == 1 && nz == 1) { + printf("dielectric function:\n"); + for (i = 0; i < nx; ++i) { + if (mdata->eps_inv[i].m00 == mdata->eps_inv[i].m11) + printf(" eps(%g) = %g\n", i * 1.0 / nx, 1.0 / mdata->eps_inv[i].m00); + + else + printf(" eps(%g) = x: %g OR y: %g\n", i * 1.0 / nx, 1.0 / mdata->eps_inv[i].m00, + 1.0 / mdata->eps_inv[i].m11); + } + printf("\n"); + } + + printf("Allocating fields...\n"); + H = create_evectmatrix(nx * ny * nz, 2, num_bands, local_N, N_start, alloc_N); + Hstart = create_evectmatrix(nx * ny * nz, 2, num_bands, local_N, N_start, alloc_N); + for (i = 0; i < NWORK; ++i) + W[i] = create_evectmatrix(nx * ny * nz, 2, num_bands, local_N, N_start, alloc_N); + + CHK_MALLOC(eigvals, real, num_bands); + + for (iters = 0; iters < PROF_ITERS; ++iters) { + + printf("Initializing fields...\n"); + for (i = 0; i < H.n * H.p; ++i) + ASSIGN_REAL(Hstart.data[i], rand() * 1.0 / RAND_MAX); + + /*****************************************/ + if (do_target) { + printf("\nSolving for eigenvectors close to %f...\n", target_freq); + mtdata = create_maxwell_target_data(mdata, target_freq); + op = maxwell_target_operator; + if (which_preconditioner == 1) + pre_op = maxwell_target_preconditioner; + else + pre_op = maxwell_target_preconditioner2; + op_data = (void *)mtdata; + pre_op_data = (void *)mtdata; + } + else { + op = maxwell_operator; + if (which_preconditioner == 1) + pre_op = maxwell_preconditioner; + else + pre_op = maxwell_preconditioner2; + op_data = (void *)mdata; + pre_op_data = (void *)mdata; + } + + /*****************************************/ + printf("\nSolving for eigenvectors with preconditioning...\n"); + evectmatrix_copy(H, Hstart); + eigensolver(H, eigvals, op, op_data, NULL, NULL, pre_op, pre_op_data, maxwell_parity_constraint, + (void *)mdata, W, NWORK, error_tol, &num_iters, EIGS_DEFAULT_FLAGS); + + if (do_target) eigensolver_get_eigenvals(H, eigvals, maxwell_operator, mdata, W[0], W[1]); + + printf("Solved for eigenvectors after %d iterations.\n", num_iters); + printf("%15s%15s%15s%15s\n", "eigenval", "frequency", "exact freq.", "error"); + for (i = 0; i < num_bands; ++i) { + double err; + real freq = sqrt(eigvals[i]); + real exact_freq = bragg_omega(freq, kvector[0], sqrt(ed.eps_high), ed.eps_high_x, + sqrt(ed.eps_low), 1.0 - ed.eps_high_x, 1.0e-7); + printf("%15f%15f%15f%15e\n", eigvals[i], freq, exact_freq, + err = fabs(freq - exact_freq) / exact_freq); + CHECK(err <= max_err, "error exceeds tolerance"); + } + printf("\n"); + + for (i = 0; i < num_bands; ++i) { + real kdom[3]; + real k; + maxwell_dominant_planewave(mdata, H, i + 1, kdom); + if ((i + 1) % 2 == 1) + k = kvector[0] + (i + 1) / 2; + else + k = kvector[0] - (i + 1) / 2; + if (kvector[0] > 0 && kvector[0] < 0.5 && ed.eps_high == 1) { + printf("Expected kdom: %15f%15f%15f\n", k, kvector[1], kvector[2]); + printf("Got kdom: %15f%15f%15f\n", kdom[0], kdom[1], kdom[2]); + CHECK(k == kdom[0] && kvector[1] == kdom[1] && kvector[2] == kdom[2], + "unexpected result from maxwell_dominant_planewave"); + } + } + } + + if (!stop1) { + + /*****************************************/ + + printf("\nSolving for eigenvectors without preconditioning...\n"); + evectmatrix_copy(H, Hstart); + eigensolver(H, eigvals, op, op_data, NULL, NULL, NULL, NULL, maxwell_parity_constraint, + (void *)mdata, W, NWORK, error_tol, &num_iters, EIGS_DEFAULT_FLAGS); + + if (do_target) eigensolver_get_eigenvals(H, eigvals, maxwell_operator, mdata, W[0], W[1]); + + printf("Solved for eigenvectors after %d iterations.\n", num_iters); + printf("%15s%15s%15s%15s\n", "eigenval", "frequency", "exact freq.", "error"); + for (i = 0; i < num_bands; ++i) { + double err; + real freq = sqrt(eigvals[i]); + real exact_freq = bragg_omega(freq, kvector[0], sqrt(ed.eps_high), ed.eps_high_x, + sqrt(ed.eps_low), 1.0 - ed.eps_high_x, 1.0e-7); + printf("%15f%15f%15f%15e\n", eigvals[i], freq, exact_freq, + err = fabs(freq - exact_freq) / exact_freq); + CHECK(err <= max_err, "error exceeds tolerance"); + } + printf("\n"); + + /*****************************************/ + + printf("\nSolving for eigenvectors without conj. grad...\n"); + evectmatrix_copy(H, Hstart); + eigensolver(H, eigvals, op, op_data, NULL, NULL, pre_op, pre_op_data, maxwell_parity_constraint, + (void *)mdata, W, NWORK - 1, error_tol, &num_iters, EIGS_DEFAULT_FLAGS); + + if (do_target) eigensolver_get_eigenvals(H, eigvals, maxwell_operator, mdata, W[0], W[1]); + + printf("Solved for eigenvectors after %d iterations.\n", num_iters); + printf("%15s%15s%15s%15s\n", "eigenval", "frequency", "exact freq.", "error"); + for (i = 0; i < num_bands; ++i) { + double err; + real freq = sqrt(eigvals[i]); + real exact_freq = bragg_omega(freq, kvector[0], sqrt(ed.eps_high), ed.eps_high_x, + sqrt(ed.eps_low), 1.0 - ed.eps_high_x, 1.0e-7); + printf("%15f%15f%15f%15e\n", eigvals[i], freq, exact_freq, + err = fabs(freq - exact_freq) / exact_freq); + CHECK(err <= max_err, "error exceeds tolerance"); + } + printf("\n"); + + /*****************************************/ + printf("\nSolving for eigenvectors without precond. or conj. grad...\n"); + evectmatrix_copy(H, Hstart); + eigensolver(H, eigvals, op, op_data, NULL, NULL, NULL, NULL, maxwell_parity_constraint, + (void *)mdata, W, NWORK - 1, error_tol, &num_iters, EIGS_DEFAULT_FLAGS); + + if (do_target) eigensolver_get_eigenvals(H, eigvals, maxwell_operator, mdata, W[0], W[1]); + + printf("Solved for eigenvectors after %d iterations.\n", num_iters); + printf("%15s%15s%15s%15s\n", "eigenval", "frequency", "exact freq.", "error"); + for (i = 0; i < num_bands; ++i) { + double err; + real freq = sqrt(eigvals[i]); + real exact_freq = bragg_omega(freq, kvector[0], sqrt(ed.eps_high), ed.eps_high_x, + sqrt(ed.eps_low), 1.0 - ed.eps_high_x, 1.0e-7); + printf("%15f%15f%15f%15e\n", eigvals[i], freq, exact_freq, + err = fabs(freq - exact_freq) / exact_freq); + CHECK(err <= max_err, "error exceeds tolerance"); + } + printf("\n"); + + /*****************************************/ + } + + destroy_evectmatrix(H); + destroy_evectmatrix(Hstart); + for (i = 0; i < NWORK; ++i) + destroy_evectmatrix(W[i]); + + destroy_maxwell_target_data(mtdata); + destroy_maxwell_data(mdata); + + free(eigvals); + + debug_check_memory_leaks(); + + return EXIT_SUCCESS; } diff --git a/tests/normal_vectors.c b/tests/normal_vectors.c index db1eb128..cb942f4a 100644 --- a/tests/normal_vectors.c +++ b/tests/normal_vectors.c @@ -11,17 +11,13 @@ #include /* return a random number in [0,1]: */ -static double mydrand(void) -{ - double d = rand(); - return (d / (double) RAND_MAX); +static double mydrand(void) { + double d = rand(); + return (d / (double)RAND_MAX); } /* return a uniform random number in [a,b] */ -static double myurand(double a, double b) -{ - return ((b - a) * mydrand() + a); -} +static double myurand(double a, double b) { return ((b - a) * mydrand() + a); } #define MAX_NSQ_PTS 72 #define NUM_PLANES 100000 @@ -29,256 +25,230 @@ static double myurand(double a, double b) #define K_PI 3.141592653589793238462643383279502884197 -static double pin(double x, double a, double b) -{ - return (x < a ? a : (x > b ? b : x)); -} +static double pin(double x, double a, double b) { return (x < a ? a : (x > b ? b : x)); } /* return the angle, in degrees, between two unit-normalized vectors */ -static double angle(vector3 v1, vector3 v2) -{ - double theta = 180/K_PI * acos(pin(vector3_dot(v1,v2), -1,1)); - return (theta > 90 ? 180 - theta : theta); +static double angle(vector3 v1, vector3 v2) { + double theta = 180 / K_PI * acos(pin(vector3_dot(v1, v2), -1, 1)); + return (theta > 90 ? 180 - theta : theta); } /* return a random unit vector, uniformly distributed over a sphere */ -vector3 random_unit_vector3(void) -{ - double z, t, r; - vector3 v; - - z = 2*mydrand() - 1; - t = 2*K_PI*mydrand(); - r = sqrt(1 - z*z); - v.x = r * cos(t); - v.y = r * sin(t); - v.z = z; - return v; +vector3 random_unit_vector3(void) { + double z, t, r; + vector3 v; + + z = 2 * mydrand() - 1; + t = 2 * K_PI * mydrand(); + r = sqrt(1 - z * z); + v.x = r * cos(t); + v.y = r * sin(t); + v.z = z; + return v; } -double find_edge(geometric_object o, vector3 dir, double max, double tol) -{ - double min = 0; - CHECK(point_in_fixed_objectp(vector3_scale(min, dir), o) && - !point_in_fixed_objectp(vector3_scale(max, dir), o), - "object out of bounds in find_edge"); - do { - double d = (min + max) / 2; - if (point_in_fixed_objectp(vector3_scale(d, dir), o)) - min = d; - else - max = d; - } while (max - min > tol); - return (min + max) / 2; +double find_edge(geometric_object o, vector3 dir, double max, double tol) { + double min = 0; + CHECK(point_in_fixed_objectp(vector3_scale(min, dir), o) && + !point_in_fixed_objectp(vector3_scale(max, dir), o), + "object out of bounds in find_edge"); + do { + double d = (min + max) / 2; + if (point_in_fixed_objectp(vector3_scale(d, dir), o)) + min = d; + else + max = d; + } while (max - min > tol); + return (min + max) / 2; } -double normal_err_to_object(geometric_object o, double r, vector3 dir) -{ - int i; - double d; - vector3 c, nsum = {0,0,0}; - dir = unit_vector3(dir); - d = find_edge(o, dir, 2, r * 0.01); - c = vector3_scale(d, dir); - for (i = 0; i < num_sphere_quad[2]; ++i) { - vector3 v; - v.x = sphere_quad[2][i][0] * r; - v.y = sphere_quad[2][i][1] * r; - v.z = sphere_quad[2][i][2] * r; - if (point_in_fixed_objectp(vector3_plus(v, c), o)) - nsum = vector3_plus(nsum, - vector3_scale(sphere_quad[2][i][3], v)); - } - CHECK(vector3_norm(nsum) > 1e-6, "couldn't get normal vector"); - return angle(unit_vector3(normal_to_object(c, o)), unit_vector3(nsum)); +double normal_err_to_object(geometric_object o, double r, vector3 dir) { + int i; + double d; + vector3 c, nsum = {0, 0, 0}; + dir = unit_vector3(dir); + d = find_edge(o, dir, 2, r * 0.01); + c = vector3_scale(d, dir); + for (i = 0; i < num_sphere_quad[2]; ++i) { + vector3 v; + v.x = sphere_quad[2][i][0] * r; + v.y = sphere_quad[2][i][1] * r; + v.z = sphere_quad[2][i][2] * r; + if (point_in_fixed_objectp(vector3_plus(v, c), o)) + nsum = vector3_plus(nsum, vector3_scale(sphere_quad[2][i][3], v)); + } + CHECK(vector3_norm(nsum) > 1e-6, "couldn't get normal vector"); + return angle(unit_vector3(normal_to_object(c, o)), unit_vector3(nsum)); } -void normals_to_object(geometric_object o, double r, int ntrials) -{ - int i; - double err_mean = 0, err_std = 0, err_max = 0; - display_geometric_object_info(0, o); - for (i = 0; i < ntrials; ++i) { - double dev; - double e = normal_err_to_object(o, r, random_unit_vector3()); - if (e > err_max) err_max = e; - /* stable one-pass formula for mean and std. deviation: */ - dev = (e - err_mean) / (i + 1); - err_mean += dev; - err_std += i*(i+1) * dev*dev; - } - err_std = sqrt(err_std / (ntrials - 1)); - printf("mean error angle for %d-pt formula = " - "%g +/- %g degrees, max error = %g degrees\n\n", - num_sphere_quad[2], err_mean, err_std, err_max); +void normals_to_object(geometric_object o, double r, int ntrials) { + int i; + double err_mean = 0, err_std = 0, err_max = 0; + display_geometric_object_info(0, o); + for (i = 0; i < ntrials; ++i) { + double dev; + double e = normal_err_to_object(o, r, random_unit_vector3()); + if (e > err_max) err_max = e; + /* stable one-pass formula for mean and std. deviation: */ + dev = (e - err_mean) / (i + 1); + err_mean += dev; + err_std += i * (i + 1) * dev * dev; + } + err_std = sqrt(err_std / (ntrials - 1)); + printf("mean error angle for %d-pt formula = " + "%g +/- %g degrees, max error = %g degrees\n\n", + num_sphere_quad[2], err_mean, err_std, err_max); } -static vector3 make_vector3(double x, double y, double z) -{ - vector3 v; - v.x = x; v.y = y; v.z = z; - return v; +static vector3 make_vector3(double x, double y, double z) { + vector3 v; + v.x = x; + v.y = y; + v.z = z; + return v; } /* return a random geometric object, centered at the origin, with diameter roughly 1 */ -geometric_object random_object(void) -{ - void* m = NULL; - vector3 c = { 0, 0, 0 }; - geometric_object o; - switch (rand() % 5) { - case 0: - o = make_sphere(m, c, myurand(0.5,1.5)); - break; - case 1: - o = make_cylinder(m, c, myurand(0.5,1.5), myurand(0.5,1.5), - random_unit_vector3()); - break; - case 2: - o = make_cone(m, c, myurand(0.5,1.5), myurand(0.5,1.5), - random_unit_vector3(), myurand(0.5,1.5)); - break; - case 3: - o = make_block(m, c, - random_unit_vector3(), - random_unit_vector3(), - random_unit_vector3(), - make_vector3(myurand(0.5,1.5), - myurand(0.5,1.5), - myurand(0.5,1.5))); - break; - case 4: - o = make_ellipsoid(m, c, - random_unit_vector3(), - random_unit_vector3(), - random_unit_vector3(), - make_vector3(myurand(0.5,1.5), - myurand(0.5,1.5), - myurand(0.5,1.5))); - break; - } - return o; +geometric_object random_object(void) { + void *m = NULL; + vector3 c = {0, 0, 0}; + geometric_object o; + switch (rand() % 5) { + case 0: o = make_sphere(m, c, myurand(0.5, 1.5)); break; + case 1: + o = make_cylinder(m, c, myurand(0.5, 1.5), myurand(0.5, 1.5), random_unit_vector3()); + break; + case 2: + o = make_cone(m, c, myurand(0.5, 1.5), myurand(0.5, 1.5), random_unit_vector3(), + myurand(0.5, 1.5)); + break; + case 3: + o = make_block(m, c, random_unit_vector3(), random_unit_vector3(), random_unit_vector3(), + make_vector3(myurand(0.5, 1.5), myurand(0.5, 1.5), myurand(0.5, 1.5))); + break; + case 4: + o = make_ellipsoid(m, c, random_unit_vector3(), random_unit_vector3(), random_unit_vector3(), + make_vector3(myurand(0.5, 1.5), myurand(0.5, 1.5), myurand(0.5, 1.5))); + break; + } + return o; } -int main(void) -{ - int i, j; - double err_mean, err_std, err_max; - double min_angle = 360; - int missed; - - srand(time(NULL)); - - printf("Testing spherical quadratures for normals to %d surfaces.\n\n", - NUM_PLANES); - - /* compute the minimum angle between pairs of points: */ - for (i = 0; i < num_sphere_quad[2]; ++i) - for (j = i + 1; j < num_sphere_quad[2]; ++j) { - vector3 v1, v2; - double a; - v1.x = sphere_quad[2][i][0]; - v1.y = sphere_quad[2][i][1]; - v1.z = sphere_quad[2][i][2]; - v2.x = sphere_quad[2][j][0]; - v2.y = sphere_quad[2][j][1]; - v2.z = sphere_quad[2][j][2]; - a = angle(v1,v2); - if (a < min_angle && a > 1e-6) - min_angle = a; - } - printf("%d-point formula: minimum angle is %g degrees.\n", - num_sphere_quad[2], min_angle); - - /* Normals to planes: */ - err_mean = err_std = err_max = 0.0; - missed = 0; - for (i = 0; i < NUM_PLANES; ++i) { - vector3 n, nsum = {0,0,0}; - double d; - - n = random_unit_vector3(); - d = mydrand(); - for (j = 0; j < num_sphere_quad[2]; ++j) { - vector3 v; - real val; - v.x = sphere_quad[2][j][0]; - v.y = sphere_quad[2][j][1]; - v.z = sphere_quad[2][j][2]; - val = vector3_dot(v,n) >= d ? 12.0 : 1.0; - val *= sphere_quad[2][j][3]; - nsum = vector3_plus(nsum, vector3_scale(val, v)); - } - if (vector3_norm(nsum) < 1e-6) { - ++missed; --i; - continue; - } - nsum = unit_vector3(nsum); - { /* stable one-pass formula for mean and std. deviation: */ - double e, dev; - e = angle(n, nsum); - if (e > err_max) err_max = e; - dev = (e - err_mean) / (i + 1); - err_mean += dev; - err_std += i*(i+1) * dev*dev; - } - } - err_std = sqrt(err_std / (NUM_PLANES - 1)); - printf("planes: mean error angle for %d-pt formula = " - "%g +/- %g degrees, max error = %g degrees\n", - num_sphere_quad[2], err_mean, err_std, err_max); - printf("(Fraction missed = %g)\n", - missed * 1.0 / (NUM_PLANES + missed)); - - /* Normals to spheres: */ - err_mean = err_std = 0.0; - missed=0; - for (i = 0; i < NUM_PLANES; ++i) { - vector3 n, nsum = {0,0,0}, c; - double r, d; - int j; - - n = random_unit_vector3(); - d = mydrand() * 0.8 + 0.1; - r = 1.0 + mydrand() * 10; /* radius of the sphere */ - c = vector3_scale(r + d, n); /* center of the sphere */ - for (j = 0; j < num_sphere_quad[2]; ++j) { - vector3 v; - real val; - v.x = sphere_quad[2][j][0]; - v.y = sphere_quad[2][j][1]; - v.z = sphere_quad[2][j][2]; - val = vector3_norm(vector3_minus(c,v)) <= r ? 12.0 : 1.0; - val *= sphere_quad[2][j][3]; - nsum = vector3_plus(nsum, vector3_scale(val, v)); - } - nsum = unit_vector3(nsum); - if (vector3_norm(nsum) < 1e-6) { - --i; - continue; - } - { /* stable one-pass formula for mean and std. deviation: */ - double e, dev; - e = angle(n, nsum); - if (e > err_max) err_max = e; - dev = (e - err_mean) / (i + 1); - err_mean += dev; - err_std += i*(i+1) * dev*dev; - } - } - err_std = sqrt(err_std / (NUM_PLANES - 1)); - printf("spheres: mean error angle for %d-pt formula = " - "%g +/- %g degrees, max error = %g degrees\n", - num_sphere_quad[2], err_mean, err_std, err_max); - - printf("\n"); - - for (i = 0; i < NUM_OBJECTS; ++i) { - geometric_object o = random_object(); - normals_to_object(o, 0.01, NUM_PLANES/100); - geometric_object_destroy(o); - } - - return EXIT_SUCCESS; +int main(void) { + int i, j; + double err_mean, err_std, err_max; + double min_angle = 360; + int missed; + + srand(time(NULL)); + + printf("Testing spherical quadratures for normals to %d surfaces.\n\n", NUM_PLANES); + + /* compute the minimum angle between pairs of points: */ + for (i = 0; i < num_sphere_quad[2]; ++i) + for (j = i + 1; j < num_sphere_quad[2]; ++j) { + vector3 v1, v2; + double a; + v1.x = sphere_quad[2][i][0]; + v1.y = sphere_quad[2][i][1]; + v1.z = sphere_quad[2][i][2]; + v2.x = sphere_quad[2][j][0]; + v2.y = sphere_quad[2][j][1]; + v2.z = sphere_quad[2][j][2]; + a = angle(v1, v2); + if (a < min_angle && a > 1e-6) min_angle = a; + } + printf("%d-point formula: minimum angle is %g degrees.\n", num_sphere_quad[2], min_angle); + + /* Normals to planes: */ + err_mean = err_std = err_max = 0.0; + missed = 0; + for (i = 0; i < NUM_PLANES; ++i) { + vector3 n, nsum = {0, 0, 0}; + double d; + + n = random_unit_vector3(); + d = mydrand(); + for (j = 0; j < num_sphere_quad[2]; ++j) { + vector3 v; + real val; + v.x = sphere_quad[2][j][0]; + v.y = sphere_quad[2][j][1]; + v.z = sphere_quad[2][j][2]; + val = vector3_dot(v, n) >= d ? 12.0 : 1.0; + val *= sphere_quad[2][j][3]; + nsum = vector3_plus(nsum, vector3_scale(val, v)); + } + if (vector3_norm(nsum) < 1e-6) { + ++missed; + --i; + continue; + } + nsum = unit_vector3(nsum); + { /* stable one-pass formula for mean and std. deviation: */ + double e, dev; + e = angle(n, nsum); + if (e > err_max) err_max = e; + dev = (e - err_mean) / (i + 1); + err_mean += dev; + err_std += i * (i + 1) * dev * dev; + } + } + err_std = sqrt(err_std / (NUM_PLANES - 1)); + printf("planes: mean error angle for %d-pt formula = " + "%g +/- %g degrees, max error = %g degrees\n", + num_sphere_quad[2], err_mean, err_std, err_max); + printf("(Fraction missed = %g)\n", missed * 1.0 / (NUM_PLANES + missed)); + + /* Normals to spheres: */ + err_mean = err_std = 0.0; + missed = 0; + for (i = 0; i < NUM_PLANES; ++i) { + vector3 n, nsum = {0, 0, 0}, c; + double r, d; + int j; + + n = random_unit_vector3(); + d = mydrand() * 0.8 + 0.1; + r = 1.0 + mydrand() * 10; /* radius of the sphere */ + c = vector3_scale(r + d, n); /* center of the sphere */ + for (j = 0; j < num_sphere_quad[2]; ++j) { + vector3 v; + real val; + v.x = sphere_quad[2][j][0]; + v.y = sphere_quad[2][j][1]; + v.z = sphere_quad[2][j][2]; + val = vector3_norm(vector3_minus(c, v)) <= r ? 12.0 : 1.0; + val *= sphere_quad[2][j][3]; + nsum = vector3_plus(nsum, vector3_scale(val, v)); + } + nsum = unit_vector3(nsum); + if (vector3_norm(nsum) < 1e-6) { + --i; + continue; + } + { /* stable one-pass formula for mean and std. deviation: */ + double e, dev; + e = angle(n, nsum); + if (e > err_max) err_max = e; + dev = (e - err_mean) / (i + 1); + err_mean += dev; + err_std += i * (i + 1) * dev * dev; + } + } + err_std = sqrt(err_std / (NUM_PLANES - 1)); + printf("spheres: mean error angle for %d-pt formula = " + "%g +/- %g degrees, max error = %g degrees\n", + num_sphere_quad[2], err_mean, err_std, err_max); + + printf("\n"); + + for (i = 0; i < NUM_OBJECTS; ++i) { + geometric_object o = random_object(); + normals_to_object(o, 0.01, NUM_PLANES / 100); + geometric_object_destroy(o); + } + + return EXIT_SUCCESS; } diff --git a/utils/mpb-data.c b/utils/mpb-data.c index 9031de4e..c8e219b8 100644 --- a/utils/mpb-data.c +++ b/utils/mpb-data.c @@ -35,860 +35,730 @@ int verbose = 0; below, because x may start out as -0 or -1e-23 or something so that it is < 0 but x + 1.0 == 1.0, thanks to the wonders of floating point. (This has actually happened, on an Alpha.) */ -#define MODF_POSITIVE(x, xi) { \ - x=modf(x, &xi); \ - if (x < 0) { x += 1.0; if (x >= 1.0) x = 0; else xi -= 1.0; } \ -} - -#define ADJ_POINT(i1, i2, nx, dx, xi, xi2) { \ - if (dx >= 0.0) { \ - i2 = i1 + 1; \ - if (i2 >= nx) { \ - i2 -= nx; \ - xi2 = xi + 1.0; \ - } \ - else \ - xi2 = xi; \ - } \ - else { \ - i2 = i1 - 1; \ - if (i2 < 0) { \ - i2 += nx; \ - xi2 = xi - 1.0; \ - } \ - else \ - xi2 = xi; \ - dx = -dx; \ - } \ -} - -void add_cmplx_times_phase(real *sum_re, real *sum_im, - real d_re, real d_im, - double ix, double iy, double iz, real *s, - real scale_by) -{ - static real phase = 0.0, p_re = 1.0, p_im = 0.0; - real new_phase; - - new_phase = ix * s[0] + iy * s[1] + iz * s[2]; - if (new_phase != phase) { - phase = new_phase; - p_re = cos(phase); - p_im = sin(phase); - } - *sum_re += (d_re * p_re - d_im * p_im) * scale_by; - *sum_im += (d_re * p_im + d_im * p_re) * scale_by; +#define MODF_POSITIVE(x, xi) \ + { \ + x = modf(x, &xi); \ + if (x < 0) { \ + x += 1.0; \ + if (x >= 1.0) \ + x = 0; \ + else \ + xi -= 1.0; \ + } \ + } + +#define ADJ_POINT(i1, i2, nx, dx, xi, xi2) \ + { \ + if (dx >= 0.0) { \ + i2 = i1 + 1; \ + if (i2 >= nx) { \ + i2 -= nx; \ + xi2 = xi + 1.0; \ + } \ + else \ + xi2 = xi; \ + } \ + else { \ + i2 = i1 - 1; \ + if (i2 < 0) { \ + i2 += nx; \ + xi2 = xi - 1.0; \ + } \ + else \ + xi2 = xi; \ + dx = -dx; \ + } \ + } + +void add_cmplx_times_phase(real *sum_re, real *sum_im, real d_re, real d_im, double ix, double iy, + double iz, real *s, real scale_by) { + static real phase = 0.0, p_re = 1.0, p_im = 0.0; + real new_phase; + + new_phase = ix * s[0] + iy * s[1] + iz * s[2]; + if (new_phase != phase) { + phase = new_phase; + p_re = cos(phase); + p_im = sin(phase); + } + *sum_re += (d_re * p_re - d_im * p_im) * scale_by; + *sum_im += (d_re * p_im + d_im * p_re) * scale_by; } #define TWOPI 6.2831853071795864769252867665590057683943388 -#define MAX2(a,b) ((a) >= (b) ? (a) : (b)) -#define MIN2(a,b) ((a) < (b) ? (a) : (b)) - -void map_data(real *d_in_re, real *d_in_im, int n_in[3], - real *d_out_re, real *d_out_im, int n_out[3], - matrix3x3 coord_map, - real *kvector, - short pick_nearest, short transpose) -{ - int i, j, k; - real s[3]; /* phase difference per cell in each lattice direction */ - real min_out_re = 1e20, max_out_re = -1e20, - min_out_im = 1e20, max_out_im = -1e20; - real shiftx, shifty, shiftz; - - CHECK(d_in_re && d_out_re, "invalid arguments"); - CHECK((d_out_im && d_in_im) || (!d_out_im && !d_in_im), - "both input and output must be real or complex"); - - coord_map.c0 = vector3_scale(1.0 / n_out[0], coord_map.c0); - coord_map.c1 = vector3_scale(1.0 / n_out[1], coord_map.c1); - coord_map.c2 = vector3_scale(1.0 / n_out[2], coord_map.c2); - - for (i = 0; i < 3; ++i) { - if (kvector) - s[i] = kvector[i] * TWOPI; - else - s[i] = 0; - } - - /* Compute shift so that the origin of the output cell - is mapped to the origin of the original primitive cell: */ - shiftx = 0.5 - (coord_map.c0.x*0.5*n_out[0] + - coord_map.c1.x*0.5*n_out[1] + - coord_map.c2.x*0.5*n_out[2]); - shifty = 0.5 - (coord_map.c0.y*0.5*n_out[0] + - coord_map.c1.y*0.5*n_out[1] + - coord_map.c2.y*0.5*n_out[2]); - shiftz = 0.5 - (coord_map.c0.z*0.5*n_out[0] + - coord_map.c1.z*0.5*n_out[1] + - coord_map.c2.z*0.5*n_out[2]); - - for (i = 0; i < n_out[0]; ++i) - for (j = 0; j < n_out[1]; ++j) - for (k = 0; k < n_out[2]; ++k) { - real x, y, z; - double xi, yi, zi, xi2, yi2, zi2; - double dx, dy, dz, mdx, mdy, mdz; - int i1, j1, k1, i2, j2, k2; - int ijk; - - if (transpose) - ijk = (j * n_out[0] + i) * n_out[2] + k; - else - ijk = (i * n_out[1] + j) * n_out[2] + k; - - /* find the point corresponding to d_out[i,j,k] in - the input array, and also find the next-nearest - points. */ - x = coord_map.c0.x*i + coord_map.c1.x*j + coord_map.c2.x*k - + shiftx; - y = coord_map.c0.y*i + coord_map.c1.y*j + coord_map.c2.y*k - + shifty; - z = coord_map.c0.z*i + coord_map.c1.z*j + coord_map.c2.z*k - + shiftz; - MODF_POSITIVE(x, xi); - MODF_POSITIVE(y, yi); - MODF_POSITIVE(z, zi); - i1 = x * n_in[0]; j1 = y * n_in[1]; k1 = z * n_in[2]; - dx = x * n_in[0] - i1; - dy = y * n_in[1] - j1; - dz = z * n_in[2] - k1; - ADJ_POINT(i1, i2, n_in[0], dx, xi, xi2); - ADJ_POINT(j1, j2, n_in[1], dy, yi, yi2); - ADJ_POINT(k1, k2, n_in[2], dz, zi, zi2); - - /* dx, mdx, etcetera, are the weights for the various - points in the input data, which we use for linearly - interpolating to get the output point. */ - if (pick_nearest) { - /* don't interpolate */ - dx = dx <= 0.5 ? 0.0 : 1.0; - dy = dy <= 0.5 ? 0.0 : 1.0; - dz = dz <= 0.5 ? 0.0 : 1.0; - } - mdx = 1.0 - dx; - mdy = 1.0 - dy; - mdz = 1.0 - dz; - - /* Now, linearly interpolate the input to get the - output. If the input/output are complex, we - also need to multiply by the appropriate phase - factor, depending upon which unit cell we are in. */ - -#define IN_INDEX(i,j,k) ((i * n_in[1] + j) * n_in[2] + k) - if (d_out_im) { - d_out_re[ijk] = 0.0; - d_out_im[ijk] = 0.0; - add_cmplx_times_phase(d_out_re + ijk, d_out_im + ijk, - d_in_re[IN_INDEX(i1,j1,k1)], - d_in_im[IN_INDEX(i1,j1,k1)], - xi, yi, zi, s, - mdx * mdy * mdz); - add_cmplx_times_phase(d_out_re + ijk, d_out_im + ijk, - d_in_re[IN_INDEX(i1,j1,k2)], - d_in_im[IN_INDEX(i1,j1,k2)], - xi, yi, zi2, s, - mdx * mdy * dz); - add_cmplx_times_phase(d_out_re + ijk, d_out_im + ijk, - d_in_re[IN_INDEX(i1,j2,k1)], - d_in_im[IN_INDEX(i1,j2,k1)], - xi, yi2, zi, s, - mdx * dy * mdz); - add_cmplx_times_phase(d_out_re + ijk, d_out_im + ijk, - d_in_re[IN_INDEX(i1,j2,k2)], - d_in_im[IN_INDEX(i1,j2,k2)], - xi, yi2, zi2, s, - mdx * dy * dz); - add_cmplx_times_phase(d_out_re + ijk, d_out_im + ijk, - d_in_re[IN_INDEX(i2,j1,k1)], - d_in_im[IN_INDEX(i2,j1,k1)], - xi2, yi, zi, s, - dx * mdy * mdz); - add_cmplx_times_phase(d_out_re + ijk, d_out_im + ijk, - d_in_re[IN_INDEX(i2,j1,k2)], - d_in_im[IN_INDEX(i2,j1,k2)], - xi2, yi, zi2, s, - dx * mdy * dz); - add_cmplx_times_phase(d_out_re + ijk, d_out_im + ijk, - d_in_re[IN_INDEX(i2,j2,k1)], - d_in_im[IN_INDEX(i2,j2,k1)], - xi2, yi2, zi, s, - dx * dy * mdz); - add_cmplx_times_phase(d_out_re + ijk, d_out_im + ijk, - d_in_re[IN_INDEX(i2,j2,k2)], - d_in_im[IN_INDEX(i2,j2,k2)], - xi2, yi2, zi2, s, - dx * dy * dz); - min_out_im = MIN2(min_out_im, d_out_im[ijk]); - max_out_im = MAX2(max_out_im, d_out_im[ijk]); - } - else { - d_out_re[ijk] = - d_in_re[IN_INDEX(i1,j1,k1)] * mdx * mdy * mdz + - d_in_re[IN_INDEX(i1,j1,k2)] * mdx * mdy * dz + - d_in_re[IN_INDEX(i1,j2,k1)] * mdx * dy * mdz + - d_in_re[IN_INDEX(i1,j2,k2)] * mdx * dy * dz + - d_in_re[IN_INDEX(i2,j1,k1)] * dx * mdy * mdz + - d_in_re[IN_INDEX(i2,j1,k2)] * dx * mdy * dz + - d_in_re[IN_INDEX(i2,j2,k1)] * dx * dy * mdz + - d_in_re[IN_INDEX(i2,j2,k2)] * dx * dy * dz; - } - min_out_re = MIN2(min_out_re, d_out_re[ijk]); - max_out_re = MAX2(max_out_re, d_out_re[ijk]); +#define MAX2(a, b) ((a) >= (b) ? (a) : (b)) +#define MIN2(a, b) ((a) < (b) ? (a) : (b)) + +void map_data(real *d_in_re, real *d_in_im, int n_in[3], real *d_out_re, real *d_out_im, + int n_out[3], matrix3x3 coord_map, real *kvector, short pick_nearest, + short transpose) { + int i, j, k; + real s[3]; /* phase difference per cell in each lattice direction */ + real min_out_re = 1e20, max_out_re = -1e20, min_out_im = 1e20, max_out_im = -1e20; + real shiftx, shifty, shiftz; + + CHECK(d_in_re && d_out_re, "invalid arguments"); + CHECK((d_out_im && d_in_im) || (!d_out_im && !d_in_im), + "both input and output must be real or complex"); + + coord_map.c0 = vector3_scale(1.0 / n_out[0], coord_map.c0); + coord_map.c1 = vector3_scale(1.0 / n_out[1], coord_map.c1); + coord_map.c2 = vector3_scale(1.0 / n_out[2], coord_map.c2); + + for (i = 0; i < 3; ++i) { + if (kvector) + s[i] = kvector[i] * TWOPI; + else + s[i] = 0; + } + + /* Compute shift so that the origin of the output cell + is mapped to the origin of the original primitive cell: */ + shiftx = 0.5 - (coord_map.c0.x * 0.5 * n_out[0] + coord_map.c1.x * 0.5 * n_out[1] + + coord_map.c2.x * 0.5 * n_out[2]); + shifty = 0.5 - (coord_map.c0.y * 0.5 * n_out[0] + coord_map.c1.y * 0.5 * n_out[1] + + coord_map.c2.y * 0.5 * n_out[2]); + shiftz = 0.5 - (coord_map.c0.z * 0.5 * n_out[0] + coord_map.c1.z * 0.5 * n_out[1] + + coord_map.c2.z * 0.5 * n_out[2]); + + for (i = 0; i < n_out[0]; ++i) + for (j = 0; j < n_out[1]; ++j) + for (k = 0; k < n_out[2]; ++k) { + real x, y, z; + double xi, yi, zi, xi2, yi2, zi2; + double dx, dy, dz, mdx, mdy, mdz; + int i1, j1, k1, i2, j2, k2; + int ijk; + + if (transpose) + ijk = (j * n_out[0] + i) * n_out[2] + k; + else + ijk = (i * n_out[1] + j) * n_out[2] + k; + + /* find the point corresponding to d_out[i,j,k] in + the input array, and also find the next-nearest + points. */ + x = coord_map.c0.x * i + coord_map.c1.x * j + coord_map.c2.x * k + shiftx; + y = coord_map.c0.y * i + coord_map.c1.y * j + coord_map.c2.y * k + shifty; + z = coord_map.c0.z * i + coord_map.c1.z * j + coord_map.c2.z * k + shiftz; + MODF_POSITIVE(x, xi); + MODF_POSITIVE(y, yi); + MODF_POSITIVE(z, zi); + i1 = x * n_in[0]; + j1 = y * n_in[1]; + k1 = z * n_in[2]; + dx = x * n_in[0] - i1; + dy = y * n_in[1] - j1; + dz = z * n_in[2] - k1; + ADJ_POINT(i1, i2, n_in[0], dx, xi, xi2); + ADJ_POINT(j1, j2, n_in[1], dy, yi, yi2); + ADJ_POINT(k1, k2, n_in[2], dz, zi, zi2); + + /* dx, mdx, etcetera, are the weights for the various + points in the input data, which we use for linearly + interpolating to get the output point. */ + if (pick_nearest) { + /* don't interpolate */ + dx = dx <= 0.5 ? 0.0 : 1.0; + dy = dy <= 0.5 ? 0.0 : 1.0; + dz = dz <= 0.5 ? 0.0 : 1.0; + } + mdx = 1.0 - dx; + mdy = 1.0 - dy; + mdz = 1.0 - dz; + + /* Now, linearly interpolate the input to get the + output. If the input/output are complex, we + also need to multiply by the appropriate phase + factor, depending upon which unit cell we are in. */ + +#define IN_INDEX(i, j, k) ((i * n_in[1] + j) * n_in[2] + k) + if (d_out_im) { + d_out_re[ijk] = 0.0; + d_out_im[ijk] = 0.0; + add_cmplx_times_phase(d_out_re + ijk, d_out_im + ijk, d_in_re[IN_INDEX(i1, j1, k1)], + d_in_im[IN_INDEX(i1, j1, k1)], xi, yi, zi, s, mdx * mdy * mdz); + add_cmplx_times_phase(d_out_re + ijk, d_out_im + ijk, d_in_re[IN_INDEX(i1, j1, k2)], + d_in_im[IN_INDEX(i1, j1, k2)], xi, yi, zi2, s, mdx * mdy * dz); + add_cmplx_times_phase(d_out_re + ijk, d_out_im + ijk, d_in_re[IN_INDEX(i1, j2, k1)], + d_in_im[IN_INDEX(i1, j2, k1)], xi, yi2, zi, s, mdx * dy * mdz); + add_cmplx_times_phase(d_out_re + ijk, d_out_im + ijk, d_in_re[IN_INDEX(i1, j2, k2)], + d_in_im[IN_INDEX(i1, j2, k2)], xi, yi2, zi2, s, mdx * dy * dz); + add_cmplx_times_phase(d_out_re + ijk, d_out_im + ijk, d_in_re[IN_INDEX(i2, j1, k1)], + d_in_im[IN_INDEX(i2, j1, k1)], xi2, yi, zi, s, dx * mdy * mdz); + add_cmplx_times_phase(d_out_re + ijk, d_out_im + ijk, d_in_re[IN_INDEX(i2, j1, k2)], + d_in_im[IN_INDEX(i2, j1, k2)], xi2, yi, zi2, s, dx * mdy * dz); + add_cmplx_times_phase(d_out_re + ijk, d_out_im + ijk, d_in_re[IN_INDEX(i2, j2, k1)], + d_in_im[IN_INDEX(i2, j2, k1)], xi2, yi2, zi, s, dx * dy * mdz); + add_cmplx_times_phase(d_out_re + ijk, d_out_im + ijk, d_in_re[IN_INDEX(i2, j2, k2)], + d_in_im[IN_INDEX(i2, j2, k2)], xi2, yi2, zi2, s, dx * dy * dz); + min_out_im = MIN2(min_out_im, d_out_im[ijk]); + max_out_im = MAX2(max_out_im, d_out_im[ijk]); + } + else { + d_out_re[ijk] = d_in_re[IN_INDEX(i1, j1, k1)] * mdx * mdy * mdz + + d_in_re[IN_INDEX(i1, j1, k2)] * mdx * mdy * dz + + d_in_re[IN_INDEX(i1, j2, k1)] * mdx * dy * mdz + + d_in_re[IN_INDEX(i1, j2, k2)] * mdx * dy * dz + + d_in_re[IN_INDEX(i2, j1, k1)] * dx * mdy * mdz + + d_in_re[IN_INDEX(i2, j1, k2)] * dx * mdy * dz + + d_in_re[IN_INDEX(i2, j2, k1)] * dx * dy * mdz + + d_in_re[IN_INDEX(i2, j2, k2)] * dx * dy * dz; + } + min_out_re = MIN2(min_out_re, d_out_re[ijk]); + max_out_re = MAX2(max_out_re, d_out_re[ijk]); #undef IN_INDEX - } + } - if (verbose) { - printf("real part range: %g .. %g\n", min_out_re, max_out_re); - if (d_out_im) - printf("imag part range: %g .. %g\n", min_out_im, max_out_im); - } + if (verbose) { + printf("real part range: %g .. %g\n", min_out_re, max_out_re); + if (d_out_im) printf("imag part range: %g .. %g\n", min_out_im, max_out_im); + } } -void handle_dataset(matrixio_id in_file, matrixio_id out_file, - const char *name_re, const char *name_im, - matrix3x3 Rout, matrix3x3 coord_map, - real *kvector, double resolution, - scalar_complex scaleby, real multiply_size[3], - int pick_nearest, int transpose) -{ - real *d_in_re = NULL, *d_in_im = NULL, *d_out_re = NULL, *d_out_im = NULL; - int in_dims[3] = {1,1,1}, out_dims[3] = {1,1,1}, out_dims2[3], rank = 3; - int i, N; - int start[3] = {0,0,0}; - matrixio_id data_id; - char out_name[1000]; - - d_in_re = matrixio_read_real_data(in_file, name_re, &rank, in_dims, - 0, 0, 0, NULL); - if (!d_in_re) - goto done; - - if (verbose) - printf("Found dataset %s...\n", name_re); - - if (name_im) { - d_in_im = matrixio_read_real_data(in_file, name_im, &rank, out_dims, - 0, 0, 0, NULL); - if (!d_in_im) { - fprintf(stderr, "mpb-data: found %s dataset but not %s\n", - name_re, name_im); - goto done; - } - - for (i = 0; i < 3; ++i) { - CHECK(out_dims[i] == in_dims[i], - "re/im datasets must have same size!"); - } - - if (verbose) - printf(" and imaginary part dataset %s...\n", name_im); - - } - - if (verbose) - printf("Input data is rank %d, size %dx%dx%d.\n", - rank, in_dims[0], in_dims[1], in_dims[2]); - - if (resolution > 0) { - out_dims[0] = vector3_norm(Rout.c0) * resolution + 0.5; - out_dims[1] = vector3_norm(Rout.c1) * resolution + 0.5; - out_dims[2] = vector3_norm(Rout.c2) * resolution + 0.5; - } - else { - for (i = 0; i < 3; ++i) - out_dims[i] = in_dims[i] * multiply_size[i]; - } - for (i = rank; i < 3; ++i) - out_dims[i] = 1; - for (N = 1, i = 0; i < 3; ++i) - N *= (out_dims[i] = MAX2(out_dims[i], 1)); - - if (transpose) { - out_dims2[0] = out_dims[1]; - out_dims2[1] = out_dims[0]; - out_dims2[2] = out_dims[2]; - } - else { - out_dims2[0] = out_dims[0]; - out_dims2[1] = out_dims[1]; - out_dims2[2] = out_dims[2]; - } - - if (verbose) - printf("Output data %dx%dx%d.\n", - out_dims2[0], out_dims2[1], out_dims2[2]); - - CHK_MALLOC(d_out_re, real, N); - if (d_in_im) { - CHK_MALLOC(d_out_im, real, N); - } - - map_data(d_in_re, d_in_im, in_dims, d_out_re, d_out_im, out_dims, - coord_map, kvector, pick_nearest, transpose); - - if (d_out_im) { /* multiply * scaleby for complex data */ - for (i = 0; i < N; ++i) { - scalar_complex d; - CASSIGN_SCALAR(d, d_out_re[i], d_out_im[i]); - CASSIGN_MULT(d, scaleby, d); - d_out_re[i] = CSCALAR_RE(d); - d_out_im[i] = CSCALAR_IM(d); - } - } - - strcpy(out_name, name_re); - if (out_file.id == in_file.id) - strcat(out_name, "-new"); - if (verbose) - printf("Writing dataset to %s...\n", out_name); - data_id = matrixio_create_dataset(out_file, out_name,"", rank, out_dims2); - matrixio_write_real_data(data_id, out_dims2, start, 1, d_out_re); - matrixio_close_dataset(data_id); - - if (d_out_im) { - strcpy(out_name, name_im); - if (out_file.id == in_file.id) - strcat(out_name, "-new"); - if (verbose) - printf("Writing dataset to %s...\n", out_name); - data_id = matrixio_create_dataset(out_file, out_name, "", - rank, out_dims2); - matrixio_write_real_data(data_id, out_dims2, start, 1, d_out_im); - matrixio_close_dataset(data_id); - } - - if (verbose) - printf("Successfully wrote out data.\n"); - - done: - free(d_in_re); - free(d_in_im); - free(d_out_re); - free(d_out_im); +void handle_dataset(matrixio_id in_file, matrixio_id out_file, const char *name_re, + const char *name_im, matrix3x3 Rout, matrix3x3 coord_map, real *kvector, + double resolution, scalar_complex scaleby, real multiply_size[3], + int pick_nearest, int transpose) { + real *d_in_re = NULL, *d_in_im = NULL, *d_out_re = NULL, *d_out_im = NULL; + int in_dims[3] = {1, 1, 1}, out_dims[3] = {1, 1, 1}, out_dims2[3], rank = 3; + int i, N; + int start[3] = {0, 0, 0}; + matrixio_id data_id; + char out_name[1000]; + + d_in_re = matrixio_read_real_data(in_file, name_re, &rank, in_dims, 0, 0, 0, NULL); + if (!d_in_re) goto done; + + if (verbose) printf("Found dataset %s...\n", name_re); + + if (name_im) { + d_in_im = matrixio_read_real_data(in_file, name_im, &rank, out_dims, 0, 0, 0, NULL); + if (!d_in_im) { + fprintf(stderr, "mpb-data: found %s dataset but not %s\n", name_re, name_im); + goto done; + } + + for (i = 0; i < 3; ++i) { + CHECK(out_dims[i] == in_dims[i], "re/im datasets must have same size!"); + } + + if (verbose) printf(" and imaginary part dataset %s...\n", name_im); + } + + if (verbose) + printf("Input data is rank %d, size %dx%dx%d.\n", rank, in_dims[0], in_dims[1], in_dims[2]); + + if (resolution > 0) { + out_dims[0] = vector3_norm(Rout.c0) * resolution + 0.5; + out_dims[1] = vector3_norm(Rout.c1) * resolution + 0.5; + out_dims[2] = vector3_norm(Rout.c2) * resolution + 0.5; + } + else { + for (i = 0; i < 3; ++i) + out_dims[i] = in_dims[i] * multiply_size[i]; + } + for (i = rank; i < 3; ++i) + out_dims[i] = 1; + for (N = 1, i = 0; i < 3; ++i) + N *= (out_dims[i] = MAX2(out_dims[i], 1)); + + if (transpose) { + out_dims2[0] = out_dims[1]; + out_dims2[1] = out_dims[0]; + out_dims2[2] = out_dims[2]; + } + else { + out_dims2[0] = out_dims[0]; + out_dims2[1] = out_dims[1]; + out_dims2[2] = out_dims[2]; + } + + if (verbose) printf("Output data %dx%dx%d.\n", out_dims2[0], out_dims2[1], out_dims2[2]); + + CHK_MALLOC(d_out_re, real, N); + if (d_in_im) { CHK_MALLOC(d_out_im, real, N); } + + map_data(d_in_re, d_in_im, in_dims, d_out_re, d_out_im, out_dims, coord_map, kvector, + pick_nearest, transpose); + + if (d_out_im) { /* multiply * scaleby for complex data */ + for (i = 0; i < N; ++i) { + scalar_complex d; + CASSIGN_SCALAR(d, d_out_re[i], d_out_im[i]); + CASSIGN_MULT(d, scaleby, d); + d_out_re[i] = CSCALAR_RE(d); + d_out_im[i] = CSCALAR_IM(d); + } + } + + strcpy(out_name, name_re); + if (out_file.id == in_file.id) strcat(out_name, "-new"); + if (verbose) printf("Writing dataset to %s...\n", out_name); + data_id = matrixio_create_dataset(out_file, out_name, "", rank, out_dims2); + matrixio_write_real_data(data_id, out_dims2, start, 1, d_out_re); + matrixio_close_dataset(data_id); + + if (d_out_im) { + strcpy(out_name, name_im); + if (out_file.id == in_file.id) strcat(out_name, "-new"); + if (verbose) printf("Writing dataset to %s...\n", out_name); + data_id = matrixio_create_dataset(out_file, out_name, "", rank, out_dims2); + matrixio_write_real_data(data_id, out_dims2, start, 1, d_out_im); + matrixio_close_dataset(data_id); + } + + if (verbose) printf("Successfully wrote out data.\n"); + +done: + free(d_in_re); + free(d_in_im); + free(d_out_re); + free(d_out_im); } -void handle_cvector_dataset(matrixio_id in_file, matrixio_id out_file, - matrix3x3 Rout, - matrix3x3 coord_map, - matrix3x3 cart_map, - real *kvector, - double resolution, - scalar_complex scaleby, - real multiply_size[3], - int pick_nearest, int transpose) -{ - real *d_in[3][2] = { {0,0},{0,0},{0,0} }; - int in_dims[3] = {1,1,1}, out_dims[3] = {1,1,1}, out_dims2[3], rank = 3; - int i, N, dim, ri; - int start[3] = {0,0,0}; - matrixio_id data_id; - - for (dim = 0; dim < 3; ++dim) - for (ri = 0; ri < 2; ++ri) { - char nam[] = "x.r"; - int dims[3] = {1,1,1}, rnk = 3; - - nam[0] = 'x' + dim; - nam[2] = ri ? 'i' : 'r'; - d_in[dim][ri] - = matrixio_read_real_data(in_file, nam, &rnk, dims, - 0, 0, 0, NULL); - if (!d_in[dim][ri]) - goto bad; - if (!dim && !ri) { - rank = rnk; - for (i = 0; i < 3; ++i) in_dims[i] = dims[i]; - } - else { - if (rank != rnk || in_dims[0] != dims[0] || - in_dims[1] != dims[1] || in_dims[2] != dims[2]) - goto bad; - } - } - - if (verbose) - printf("Found complex vector dataset...\n"); - - if (verbose) - printf("Input data is rank %d, size %dx%dx%d.\n", - rank, in_dims[0], in_dims[1], in_dims[2]); - - /* rotate vector field according to cart_map */ - if (verbose) - printf("Rotating vectors by matrix [ %10f%10f%10f\n" - " %10f%10f%10f\n" - " %10f%10f%10f ]\n", - cart_map.c0.x, cart_map.c1.x, cart_map.c2.x, - cart_map.c0.y, cart_map.c1.y, cart_map.c2.y, - cart_map.c0.z, cart_map.c1.z, cart_map.c2.z); - N = in_dims[0] * in_dims[1] * in_dims[2]; - for (ri = 0; ri < 2; ++ri) - for (i = 0; i < N; ++i) { - vector3 v; - v.x = d_in[0][ri][i]; - v.y = d_in[1][ri][i]; - v.z = d_in[2][ri][i]; - v = matrix3x3_vector3_mult(cart_map, v); - d_in[0][ri][i] = v.x; - d_in[1][ri][i] = v.y; - d_in[2][ri][i] = v.z; - } - - if (resolution > 0) { - out_dims[0] = vector3_norm(Rout.c0) * resolution + 0.5; - out_dims[1] = vector3_norm(Rout.c1) * resolution + 0.5; - out_dims[2] = vector3_norm(Rout.c2) * resolution + 0.5; - } - else { - for (i = 0; i < 3; ++i) - out_dims[i] = in_dims[i] * multiply_size[i]; - } - for (i = rank; i < 3; ++i) - out_dims[i] = 1; - for (N = 1, i = 0; i < 3; ++i) - N *= (out_dims[i] = MAX2(out_dims[i], 1)); - - if (transpose) { - out_dims2[0] = out_dims[1]; - out_dims2[1] = out_dims[0]; - out_dims2[2] = out_dims[2]; - } - else { - out_dims2[0] = out_dims[0]; - out_dims2[1] = out_dims[1]; - out_dims2[2] = out_dims[2]; - } - - if (verbose) - printf("Output data %dx%dx%d.\n", - out_dims2[0], out_dims2[1], out_dims2[2]); - - for (dim = 0; dim < 3; ++dim) { - real *d_out_re, *d_out_im; - char nam[] = "x.r-new"; - - CHK_MALLOC(d_out_re, real, N); - CHK_MALLOC(d_out_im, real, N); - - map_data(d_in[dim][0], d_in[dim][1], in_dims, - d_out_re, d_out_im, out_dims, - coord_map, kvector, pick_nearest, transpose); - - for (i = 0; i < N; ++i) { /* multiply * scaleby */ - scalar_complex d; - CASSIGN_SCALAR(d, d_out_re[i], d_out_im[i]); - CASSIGN_MULT(d, scaleby, d); - d_out_re[i] = CSCALAR_RE(d); - d_out_im[i] = CSCALAR_IM(d); - } - - nam[0] = 'x' + dim; - if (out_file.id != in_file.id) - nam[3] = 0; - if (verbose) - printf("Writing dataset to %s...\n", nam); - data_id = matrixio_create_dataset(out_file, nam,"", rank, out_dims2); - matrixio_write_real_data(data_id, out_dims2, start,1, d_out_re); - matrixio_close_dataset(data_id); - - nam[2] = 'i'; - if (verbose) - printf("Writing dataset to %s...\n", nam); - data_id = matrixio_create_dataset(out_file, nam,"", rank, out_dims2); - matrixio_write_real_data(data_id, out_dims2, start,1, d_out_im); - matrixio_close_dataset(data_id); - - if (verbose) - printf("Successfully wrote out data.\n"); - } - - for (dim = 0; dim < 3; ++dim) - for (ri = 0; ri < 2; ++ri) - free(d_in[dim][ri]); - return; - - bad: - for (dim = 0; dim < 3; ++dim) - for (ri = 0; ri < 2; ++ri) - free(d_in[dim][ri]); - /* try individual datasets */ - for (dim = 0; dim < 3; ++dim) { - char namr[] = "x.r"; - char nami[] = "x.i"; - - namr[0] = 'x' + dim; - nami[0] = 'x' + dim; - handle_dataset(in_file, out_file, namr, nami, - Rout, coord_map, kvector, resolution, - scaleby, multiply_size, pick_nearest, transpose); - - namr[1] = 0; - handle_dataset(in_file, out_file, namr, NULL, - Rout, coord_map, kvector, resolution, - scaleby, multiply_size, pick_nearest, transpose); - } +void handle_cvector_dataset(matrixio_id in_file, matrixio_id out_file, matrix3x3 Rout, + matrix3x3 coord_map, matrix3x3 cart_map, real *kvector, + double resolution, scalar_complex scaleby, real multiply_size[3], + int pick_nearest, int transpose) { + real *d_in[3][2] = {{0, 0}, {0, 0}, {0, 0}}; + int in_dims[3] = {1, 1, 1}, out_dims[3] = {1, 1, 1}, out_dims2[3], rank = 3; + int i, N, dim, ri; + int start[3] = {0, 0, 0}; + matrixio_id data_id; + + for (dim = 0; dim < 3; ++dim) + for (ri = 0; ri < 2; ++ri) { + char nam[] = "x.r"; + int dims[3] = {1, 1, 1}, rnk = 3; + + nam[0] = 'x' + dim; + nam[2] = ri ? 'i' : 'r'; + d_in[dim][ri] = matrixio_read_real_data(in_file, nam, &rnk, dims, 0, 0, 0, NULL); + if (!d_in[dim][ri]) goto bad; + if (!dim && !ri) { + rank = rnk; + for (i = 0; i < 3; ++i) + in_dims[i] = dims[i]; + } + else { + if (rank != rnk || in_dims[0] != dims[0] || in_dims[1] != dims[1] || in_dims[2] != dims[2]) + goto bad; + } + } + + if (verbose) printf("Found complex vector dataset...\n"); + + if (verbose) + printf("Input data is rank %d, size %dx%dx%d.\n", rank, in_dims[0], in_dims[1], in_dims[2]); + + /* rotate vector field according to cart_map */ + if (verbose) + printf("Rotating vectors by matrix [ %10f%10f%10f\n" + " %10f%10f%10f\n" + " %10f%10f%10f ]\n", + cart_map.c0.x, cart_map.c1.x, cart_map.c2.x, cart_map.c0.y, cart_map.c1.y, cart_map.c2.y, + cart_map.c0.z, cart_map.c1.z, cart_map.c2.z); + N = in_dims[0] * in_dims[1] * in_dims[2]; + for (ri = 0; ri < 2; ++ri) + for (i = 0; i < N; ++i) { + vector3 v; + v.x = d_in[0][ri][i]; + v.y = d_in[1][ri][i]; + v.z = d_in[2][ri][i]; + v = matrix3x3_vector3_mult(cart_map, v); + d_in[0][ri][i] = v.x; + d_in[1][ri][i] = v.y; + d_in[2][ri][i] = v.z; + } + + if (resolution > 0) { + out_dims[0] = vector3_norm(Rout.c0) * resolution + 0.5; + out_dims[1] = vector3_norm(Rout.c1) * resolution + 0.5; + out_dims[2] = vector3_norm(Rout.c2) * resolution + 0.5; + } + else { + for (i = 0; i < 3; ++i) + out_dims[i] = in_dims[i] * multiply_size[i]; + } + for (i = rank; i < 3; ++i) + out_dims[i] = 1; + for (N = 1, i = 0; i < 3; ++i) + N *= (out_dims[i] = MAX2(out_dims[i], 1)); + + if (transpose) { + out_dims2[0] = out_dims[1]; + out_dims2[1] = out_dims[0]; + out_dims2[2] = out_dims[2]; + } + else { + out_dims2[0] = out_dims[0]; + out_dims2[1] = out_dims[1]; + out_dims2[2] = out_dims[2]; + } + + if (verbose) printf("Output data %dx%dx%d.\n", out_dims2[0], out_dims2[1], out_dims2[2]); + + for (dim = 0; dim < 3; ++dim) { + real *d_out_re, *d_out_im; + char nam[] = "x.r-new"; + + CHK_MALLOC(d_out_re, real, N); + CHK_MALLOC(d_out_im, real, N); + + map_data(d_in[dim][0], d_in[dim][1], in_dims, d_out_re, d_out_im, out_dims, coord_map, kvector, + pick_nearest, transpose); + + for (i = 0; i < N; ++i) { /* multiply * scaleby */ + scalar_complex d; + CASSIGN_SCALAR(d, d_out_re[i], d_out_im[i]); + CASSIGN_MULT(d, scaleby, d); + d_out_re[i] = CSCALAR_RE(d); + d_out_im[i] = CSCALAR_IM(d); + } + + nam[0] = 'x' + dim; + if (out_file.id != in_file.id) nam[3] = 0; + if (verbose) printf("Writing dataset to %s...\n", nam); + data_id = matrixio_create_dataset(out_file, nam, "", rank, out_dims2); + matrixio_write_real_data(data_id, out_dims2, start, 1, d_out_re); + matrixio_close_dataset(data_id); + + nam[2] = 'i'; + if (verbose) printf("Writing dataset to %s...\n", nam); + data_id = matrixio_create_dataset(out_file, nam, "", rank, out_dims2); + matrixio_write_real_data(data_id, out_dims2, start, 1, d_out_im); + matrixio_close_dataset(data_id); + + if (verbose) printf("Successfully wrote out data.\n"); + } + + for (dim = 0; dim < 3; ++dim) + for (ri = 0; ri < 2; ++ri) + free(d_in[dim][ri]); + return; + +bad: + for (dim = 0; dim < 3; ++dim) + for (ri = 0; ri < 2; ++ri) + free(d_in[dim][ri]); + /* try individual datasets */ + for (dim = 0; dim < 3; ++dim) { + char namr[] = "x.r"; + char nami[] = "x.i"; + + namr[0] = 'x' + dim; + nami[0] = 'x' + dim; + handle_dataset(in_file, out_file, namr, nami, Rout, coord_map, kvector, resolution, scaleby, + multiply_size, pick_nearest, transpose); + + namr[1] = 0; + handle_dataset(in_file, out_file, namr, NULL, Rout, coord_map, kvector, resolution, scaleby, + multiply_size, pick_nearest, transpose); + } } -void handle_file(const char *fname, const char *out_fname, - const char *data_name, - int rectify, int have_ve, vector3 ve, double resolution, - scalar_complex scaleby, real multiply_size[3], - int pick_nearest, int transpose) -{ - matrixio_id in_file, out_file; - real *R, *kvector, *copies; - int dims[2], rank; - matrix3x3 Rin = {{1,0,0},{0,1,0},{0,0,1}}, Rout, coord_map; - matrix3x3 cart_map = {{1,0,0},{0,1,0},{0,0,1}}; +void handle_file(const char *fname, const char *out_fname, const char *data_name, int rectify, + int have_ve, vector3 ve, double resolution, scalar_complex scaleby, + real multiply_size[3], int pick_nearest, int transpose) { + matrixio_id in_file, out_file; + real *R, *kvector, *copies; + int dims[2], rank; + matrix3x3 Rin = {{1, 0, 0}, {0, 1, 0}, {0, 0, 1}}, Rout, coord_map; + matrix3x3 cart_map = {{1, 0, 0}, {0, 1, 0}, {0, 0, 1}}; #define NUM_DATANAMES 13 - char datanames[NUM_DATANAMES][30] = { - "data", - "epsilon.xx", - "epsilon.xy", - "epsilon.xz", - "epsilon.yy", - "epsilon.yz", - "epsilon.zz", - "epsilon_inverse.xx", - "epsilon_inverse.xy", - "epsilon_inverse.xz", - "epsilon_inverse.yy", - "epsilon_inverse.yz", - "epsilon_inverse.zz" - }; - int i; - - if (verbose) - printf("Reading file %s...\n", fname); - in_file = matrixio_open(fname, out_fname != NULL); - - if (data_name && !data_name[0]) - data_name = NULL; - - R = matrixio_read_data_attr(in_file, "lattice vectors", - &rank, 2, dims); - if (R && rank == 2 && dims[0] == 3 && dims[1] == 3) { - Rin.c0.x = R[0*3+0]; Rin.c0.y = R[0*3+1]; Rin.c0.z = R[0*3+2]; - Rin.c1.x = R[1*3+0]; Rin.c1.y = R[1*3+1]; Rin.c1.z = R[1*3+2]; - Rin.c2.x = R[2*3+0]; Rin.c2.y = R[2*3+1]; Rin.c2.z = R[2*3+2]; - if (verbose) - printf("Read lattice vectors.\n"); - } - free(R); - - kvector = matrixio_read_data_attr(in_file, "Bloch wavevector", - &rank, 1, dims); - if (rank != 1 || dims[0] != 3) { - free(kvector); - kvector = NULL; - } - else if (verbose) - printf("Read Bloch wavevector (%g, %g, %g)\n", - kvector[0], kvector[1], kvector[2]); - - copies = matrixio_read_data_attr(in_file, "lattice copies", - &rank, 1, dims); - if (copies && rank == 1 && dims[0] == 3) { - Rin.c0 = vector3_scale(copies[0], Rin.c0); - Rin.c1 = vector3_scale(copies[1], Rin.c1); - Rin.c2 = vector3_scale(copies[2], Rin.c2); - if (kvector) { - kvector[0] *= copies[0]; - kvector[1] *= copies[1]; - kvector[2] *= copies[2]; - } - if (verbose) - printf("Read lattice copies (%g, %g, %g)\n", - copies[0], copies[1], copies[2]); - } - free(copies); - - if (verbose) - printf("Input lattice = (%g,%g,%g), (%g,%g,%g), (%g,%g,%g)\n", - Rin.c0.x, Rin.c0.y, Rin.c0.z, - Rin.c1.x, Rin.c1.y, Rin.c1.z, - Rin.c2.x, Rin.c2.y, Rin.c2.z); - - Rout = Rin; - - if (rectify) { - double V; - - /* Orthogonalize the output lattice vectors. If have_ve - is true, then the first new lattice vector should be in - the direction of the ve unit vector; otherwise, the first - new lattice vector is the first original lattice vector. - Note that we do this in such a way as to preserve the - volume of the unit cell, and so that our first vector - (in the direction of ve) smoothly interpolates between - the original lattice vectors. */ - - if (have_ve) - ve = unit_vector3(ve); - else - ve = unit_vector3(Rout.c0); - - /* First, compute c0 in the direction of ve by smoothly - interpolating the old c0/c1/c2 (formula is slightly tricky): */ - V = vector3_dot(vector3_cross(Rout.c0, Rout.c1), Rout.c2); - Rout.c1 = vector3_minus(Rout.c1,Rout.c0); - Rout.c2 = vector3_minus(Rout.c2,Rout.c0); - Rout.c0 = vector3_scale(V / vector3_dot(vector3_cross(Rout.c1, - Rout.c2), - ve), - ve); - - /* Now, orthogonalize c1 and c2: */ - Rout.c1 = vector3_minus(Rout.c1, - vector3_scale(vector3_dot(ve, Rout.c1), ve)); - Rout.c2 = vector3_minus(Rout.c2, - vector3_scale(vector3_dot(ve, Rout.c2), ve)); - Rout.c2 = vector3_minus(Rout.c2, - vector3_scale(vector3_dot(Rout.c1, Rout.c2) / - vector3_dot(Rout.c1, Rout.c1), - Rout.c1)); - - cart_map.c0 = unit_vector3(Rout.c0); - cart_map.c1 = unit_vector3(Rout.c1); - cart_map.c2 = unit_vector3(Rout.c2); - cart_map = matrix3x3_inverse(cart_map); - } - - if (transpose) { /* swap first two rows of cart_map */ - vector3 v; - cart_map = matrix3x3_transpose(cart_map); - v = cart_map.c0; - cart_map.c0 = cart_map.c1; - cart_map.c1 = v; - cart_map = matrix3x3_transpose(cart_map); - } - - Rout.c0 = vector3_scale(multiply_size[0], Rout.c0); - Rout.c1 = vector3_scale(multiply_size[1], Rout.c1); - Rout.c2 = vector3_scale(multiply_size[2], Rout.c2); - - if (verbose) - printf("Output lattice = (%g,%g,%g), (%g,%g,%g), (%g,%g,%g)\n", - Rout.c0.x, Rout.c0.y, Rout.c0.z, - Rout.c1.x, Rout.c1.y, Rout.c1.z, - Rout.c2.x, Rout.c2.y, Rout.c2.z); - - coord_map = matrix3x3_mult(matrix3x3_inverse(Rin), Rout); - - if (out_fname) { - if (verbose) - printf("Creating output file %s...\n", out_fname); - out_file = matrixio_create(out_fname); - } - else { - if (verbose) - printf("Writing output datasets to input file %s...\n", fname); - out_file = in_file; - } - - for (i = 0; i < NUM_DATANAMES; ++i) { - const char *dname = datanames[i]; - char name_re[300], name_im[300]; - - if (data_name) - dname = data_name; - - strcpy(name_re, dname); - handle_dataset(in_file, out_file, name_re, NULL, - Rout, coord_map, kvector, resolution, - scaleby, multiply_size, pick_nearest, transpose); - - sprintf(name_re, "%s.r", dname); - sprintf(name_im, "%s.i", dname); - handle_dataset(in_file, out_file, name_re, name_im, - Rout, coord_map, kvector, resolution, - scaleby, multiply_size, pick_nearest, transpose); - - if (data_name) - break; - } - - /* handle complex vector fields x.{ri}, y.{ri}, z.{ri} */ - handle_cvector_dataset(in_file, out_file, - Rout, coord_map, cart_map, kvector, resolution, - scaleby, multiply_size, pick_nearest, transpose); - - free(kvector); - - matrixio_close(in_file); - if (out_file.id != in_file.id) - matrixio_close(out_file); + char datanames[NUM_DATANAMES][30] = {"data", + "epsilon.xx", + "epsilon.xy", + "epsilon.xz", + "epsilon.yy", + "epsilon.yz", + "epsilon.zz", + "epsilon_inverse.xx", + "epsilon_inverse.xy", + "epsilon_inverse.xz", + "epsilon_inverse.yy", + "epsilon_inverse.yz", + "epsilon_inverse.zz"}; + int i; + + if (verbose) printf("Reading file %s...\n", fname); + in_file = matrixio_open(fname, out_fname != NULL); + + if (data_name && !data_name[0]) data_name = NULL; + + R = matrixio_read_data_attr(in_file, "lattice vectors", &rank, 2, dims); + if (R && rank == 2 && dims[0] == 3 && dims[1] == 3) { + Rin.c0.x = R[0 * 3 + 0]; + Rin.c0.y = R[0 * 3 + 1]; + Rin.c0.z = R[0 * 3 + 2]; + Rin.c1.x = R[1 * 3 + 0]; + Rin.c1.y = R[1 * 3 + 1]; + Rin.c1.z = R[1 * 3 + 2]; + Rin.c2.x = R[2 * 3 + 0]; + Rin.c2.y = R[2 * 3 + 1]; + Rin.c2.z = R[2 * 3 + 2]; + if (verbose) printf("Read lattice vectors.\n"); + } + free(R); + + kvector = matrixio_read_data_attr(in_file, "Bloch wavevector", &rank, 1, dims); + if (rank != 1 || dims[0] != 3) { + free(kvector); + kvector = NULL; + } + else if (verbose) + printf("Read Bloch wavevector (%g, %g, %g)\n", kvector[0], kvector[1], kvector[2]); + + copies = matrixio_read_data_attr(in_file, "lattice copies", &rank, 1, dims); + if (copies && rank == 1 && dims[0] == 3) { + Rin.c0 = vector3_scale(copies[0], Rin.c0); + Rin.c1 = vector3_scale(copies[1], Rin.c1); + Rin.c2 = vector3_scale(copies[2], Rin.c2); + if (kvector) { + kvector[0] *= copies[0]; + kvector[1] *= copies[1]; + kvector[2] *= copies[2]; + } + if (verbose) printf("Read lattice copies (%g, %g, %g)\n", copies[0], copies[1], copies[2]); + } + free(copies); + + if (verbose) + printf("Input lattice = (%g,%g,%g), (%g,%g,%g), (%g,%g,%g)\n", Rin.c0.x, Rin.c0.y, Rin.c0.z, + Rin.c1.x, Rin.c1.y, Rin.c1.z, Rin.c2.x, Rin.c2.y, Rin.c2.z); + + Rout = Rin; + + if (rectify) { + double V; + + /* Orthogonalize the output lattice vectors. If have_ve + is true, then the first new lattice vector should be in + the direction of the ve unit vector; otherwise, the first + new lattice vector is the first original lattice vector. + Note that we do this in such a way as to preserve the + volume of the unit cell, and so that our first vector + (in the direction of ve) smoothly interpolates between + the original lattice vectors. */ + + if (have_ve) + ve = unit_vector3(ve); + else + ve = unit_vector3(Rout.c0); + + /* First, compute c0 in the direction of ve by smoothly + interpolating the old c0/c1/c2 (formula is slightly tricky): */ + V = vector3_dot(vector3_cross(Rout.c0, Rout.c1), Rout.c2); + Rout.c1 = vector3_minus(Rout.c1, Rout.c0); + Rout.c2 = vector3_minus(Rout.c2, Rout.c0); + Rout.c0 = vector3_scale(V / vector3_dot(vector3_cross(Rout.c1, Rout.c2), ve), ve); + + /* Now, orthogonalize c1 and c2: */ + Rout.c1 = vector3_minus(Rout.c1, vector3_scale(vector3_dot(ve, Rout.c1), ve)); + Rout.c2 = vector3_minus(Rout.c2, vector3_scale(vector3_dot(ve, Rout.c2), ve)); + Rout.c2 = vector3_minus( + Rout.c2, + vector3_scale(vector3_dot(Rout.c1, Rout.c2) / vector3_dot(Rout.c1, Rout.c1), Rout.c1)); + + cart_map.c0 = unit_vector3(Rout.c0); + cart_map.c1 = unit_vector3(Rout.c1); + cart_map.c2 = unit_vector3(Rout.c2); + cart_map = matrix3x3_inverse(cart_map); + } + + if (transpose) { /* swap first two rows of cart_map */ + vector3 v; + cart_map = matrix3x3_transpose(cart_map); + v = cart_map.c0; + cart_map.c0 = cart_map.c1; + cart_map.c1 = v; + cart_map = matrix3x3_transpose(cart_map); + } + + Rout.c0 = vector3_scale(multiply_size[0], Rout.c0); + Rout.c1 = vector3_scale(multiply_size[1], Rout.c1); + Rout.c2 = vector3_scale(multiply_size[2], Rout.c2); + + if (verbose) + printf("Output lattice = (%g,%g,%g), (%g,%g,%g), (%g,%g,%g)\n", Rout.c0.x, Rout.c0.y, Rout.c0.z, + Rout.c1.x, Rout.c1.y, Rout.c1.z, Rout.c2.x, Rout.c2.y, Rout.c2.z); + + coord_map = matrix3x3_mult(matrix3x3_inverse(Rin), Rout); + + if (out_fname) { + if (verbose) printf("Creating output file %s...\n", out_fname); + out_file = matrixio_create(out_fname); + } + else { + if (verbose) printf("Writing output datasets to input file %s...\n", fname); + out_file = in_file; + } + + for (i = 0; i < NUM_DATANAMES; ++i) { + const char *dname = datanames[i]; + char name_re[300], name_im[300]; + + if (data_name) dname = data_name; + + strcpy(name_re, dname); + handle_dataset(in_file, out_file, name_re, NULL, Rout, coord_map, kvector, resolution, scaleby, + multiply_size, pick_nearest, transpose); + + sprintf(name_re, "%s.r", dname); + sprintf(name_im, "%s.i", dname); + handle_dataset(in_file, out_file, name_re, name_im, Rout, coord_map, kvector, resolution, + scaleby, multiply_size, pick_nearest, transpose); + + if (data_name) break; + } + + /* handle complex vector fields x.{ri}, y.{ri}, z.{ri} */ + handle_cvector_dataset(in_file, out_file, Rout, coord_map, cart_map, kvector, resolution, scaleby, + multiply_size, pick_nearest, transpose); + + free(kvector); + + matrixio_close(in_file); + if (out_file.id != in_file.id) matrixio_close(out_file); } -void usage(FILE *f) -{ - fprintf(f, "Usage: mpb-data [options] []\n" - "Options:\n" +void usage(FILE *f) { + fprintf(f, "Usage: mpb-data [options] []\n" + "Options:\n" " -h : this help message\n" " -V : print version number and copyright\n" " -v : verbose output\n" - " -o : output to (first input file only)\n" - " -r : output rectangular cell\n" - " -e : as -r, but first axis of cell is along \n" - " -n : output resolution of n grid points per a\n" - " -x \n" - " -y \n" - " -z : output mx/my/mz periods in the x/y/z directions\n" - " -P : multiply phase shift of degrees\n" - " -m : same as -x -y -z \n" - " -T : transpose first two dimensions (x & y) of data\n" - " -p : pixellized output (no grid interpolation)\n" - " -d : use dataset in the input files (default: all mpb datasets)\n" - " -- you can also specify a dataset via :\n" - ); + " -o : output to (first input file only)\n" + " -r : output rectangular cell\n" + " -e : as -r, but first axis of cell is along \n" + " -n : output resolution of n grid points per a\n" + " -x \n" + " -y \n" + " -z : output mx/my/mz periods in the x/y/z directions\n" + " -P : multiply phase shift of degrees\n" + " -m : same as -x -y -z \n" + " -T : transpose first two dimensions (x & y) of data\n" + " -p : pixellized output (no grid interpolation)\n" + " -d : use dataset in the input files (default: all mpb datasets)\n" + " -- you can also specify a dataset via :\n"); } /* given an fname of the form :, return a pointer to a newly-allocated string containing , and point data_name to the position of in fname. The user must free() the string. */ -static char *split_fname(char *fname, char **data_name) -{ - int fname_len; - char *colon, *filename; - - fname_len = strlen(fname); - colon = strchr(fname, ':'); - if (colon) { - int colon_len = strlen(colon); - filename = (char*) malloc(sizeof(char) * (fname_len-colon_len+1)); - CHECK(filename, "out of memory"); - strncpy(filename, fname, fname_len-colon_len+1); - filename[fname_len-colon_len] = 0; - *data_name = colon + 1; - } -else { /* treat as if ":" were at the end of fname */ - filename = (char*) malloc(sizeof(char) * (fname_len + 1)); - CHECK(filename, "out of memory"); - strcpy(filename, fname); - *data_name = fname + fname_len; - } - return filename; +static char *split_fname(char *fname, char **data_name) { + int fname_len; + char *colon, *filename; + + fname_len = strlen(fname); + colon = strchr(fname, ':'); + if (colon) { + int colon_len = strlen(colon); + filename = (char *)malloc(sizeof(char) * (fname_len - colon_len + 1)); + CHECK(filename, "out of memory"); + strncpy(filename, fname, fname_len - colon_len + 1); + filename[fname_len - colon_len] = 0; + *data_name = colon + 1; + } + else { /* treat as if ":" were at the end of fname */ + filename = (char *)malloc(sizeof(char) * (fname_len + 1)); + CHECK(filename, "out of memory"); + strcpy(filename, fname); + *data_name = fname + fname_len; + } + return filename; } -int main(int argc, char **argv) -{ - char *out_fname = NULL, *data_name = NULL; - int rectify = 0, have_ve = 0; - double phaseangle = 0; - double resolution = 0; - vector3 ve = {1,0,0}; - real multiply_size[3] = {1,1,1}; - int pick_nearest = 0, transpose = 0; - int ifile, c; - extern char *optarg; - extern int optind; - scalar_complex scaleby = {1,0}, phase; - - while ((c = getopt(argc, argv, "hVvo:x:y:z:m:d:n:prTe:P:")) != -1) - switch (c) { - case 'h': - usage(stdout); - return EXIT_SUCCESS; - case 'V': - printf("mpb-data " PACKAGE_VERSION " by Steven G. Johnson.\n" -"Copyright (C) 1999-2014 Massachusetts Institute of Technology.\n" -"This is free software, and you are welcome to redistribute it under the\n" -"terms of the GNU General Public License (GPL). mpb-data comes with\n" -"ABSOLUTELY NO WARRANTY; see the GPL for more details.\n"); - return EXIT_SUCCESS; - case 'v': - verbose = 1; - break; - case 'o': - free(out_fname); - out_fname = (char*) malloc(sizeof(char) * - (strlen(optarg) + 1)); - CHECK(out_fname, "out of memory"); - strcpy(out_fname, optarg); - break; - case 'd': - free(data_name); - data_name = (char*) malloc(sizeof(char) * - (strlen(optarg) + 1)); - CHECK(data_name, "out of memory"); - strcpy(data_name, optarg); - break; - case 'x': - multiply_size[0] = atof(optarg); - break; - case 'y': - multiply_size[1] = atof(optarg); - break; - case 'z': - multiply_size[2] = atof(optarg); - break; - case 'm': - multiply_size[0] = atof(optarg); - multiply_size[1] = atof(optarg); - multiply_size[2] = atof(optarg); - break; - case 'n': - resolution = atof(optarg); - CHECK(resolution > 0, - "invalid resolution for -n (must be positive)"); - break; - case 'P': - phaseangle = atof(optarg); - break; - case 'p': - pick_nearest = 1; - break; - case 'T': - transpose = 1; - break; - case 'e': - have_ve = 1; - if (3 != sscanf(optarg, "%lf,%lf,%lf", - &ve.x, &ve.y, &ve.z)) { - fprintf(stderr, - "Invalid -e argument \"%s\"\n", optarg); - usage(stderr); - return EXIT_FAILURE; - } - rectify = 1; - break; - case 'r': - rectify = 1; - break; - default: - fprintf(stderr, "Invalid argument -%c\n", c); - usage(stderr); - return EXIT_FAILURE; - } - if (optind == argc) { /* no parameters left */ +int main(int argc, char **argv) { + char *out_fname = NULL, *data_name = NULL; + int rectify = 0, have_ve = 0; + double phaseangle = 0; + double resolution = 0; + vector3 ve = {1, 0, 0}; + real multiply_size[3] = {1, 1, 1}; + int pick_nearest = 0, transpose = 0; + int ifile, c; + extern char *optarg; + extern int optind; + scalar_complex scaleby = {1, 0}, phase; + + while ((c = getopt(argc, argv, "hVvo:x:y:z:m:d:n:prTe:P:")) != -1) + switch (c) { + case 'h': usage(stdout); return EXIT_SUCCESS; + case 'V': + printf("mpb-data " PACKAGE_VERSION " by Steven G. Johnson.\n" + "Copyright (C) 1999-2014 Massachusetts Institute of Technology.\n" + "This is free software, and you are welcome to redistribute it under the\n" + "terms of the GNU General Public License (GPL). mpb-data comes with\n" + "ABSOLUTELY NO WARRANTY; see the GPL for more details.\n"); + return EXIT_SUCCESS; + case 'v': verbose = 1; break; + case 'o': + free(out_fname); + out_fname = (char *)malloc(sizeof(char) * (strlen(optarg) + 1)); + CHECK(out_fname, "out of memory"); + strcpy(out_fname, optarg); + break; + case 'd': + free(data_name); + data_name = (char *)malloc(sizeof(char) * (strlen(optarg) + 1)); + CHECK(data_name, "out of memory"); + strcpy(data_name, optarg); + break; + case 'x': multiply_size[0] = atof(optarg); break; + case 'y': multiply_size[1] = atof(optarg); break; + case 'z': multiply_size[2] = atof(optarg); break; + case 'm': + multiply_size[0] = atof(optarg); + multiply_size[1] = atof(optarg); + multiply_size[2] = atof(optarg); + break; + case 'n': + resolution = atof(optarg); + CHECK(resolution > 0, "invalid resolution for -n (must be positive)"); + break; + case 'P': phaseangle = atof(optarg); break; + case 'p': pick_nearest = 1; break; + case 'T': transpose = 1; break; + case 'e': + have_ve = 1; + if (3 != sscanf(optarg, "%lf,%lf,%lf", &ve.x, &ve.y, &ve.z)) { + fprintf(stderr, "Invalid -e argument \"%s\"\n", optarg); usage(stderr); return EXIT_FAILURE; - } - - CASSIGN_SCALAR(phase, - cos(TWOPI * phaseangle / 360.0), - sin(TWOPI * phaseangle / 360.0)); - CASSIGN_MULT(scaleby, scaleby, phase); - - for (ifile = optind; ifile < argc; ++ifile) { - char *dname, *h5_fname; - h5_fname = split_fname(argv[ifile], &dname); - if (!dname[0]) - dname = data_name; - - handle_file(h5_fname, out_fname, dname, - rectify, have_ve, ve, resolution, - scaleby, multiply_size, pick_nearest, transpose); - - if (out_fname) - free(out_fname); - out_fname = NULL; - free(h5_fname); - } - free(data_name); - - return EXIT_SUCCESS; + } + rectify = 1; + break; + case 'r': rectify = 1; break; + default: + fprintf(stderr, "Invalid argument -%c\n", c); + usage(stderr); + return EXIT_FAILURE; + } + if (optind == argc) { /* no parameters left */ + usage(stderr); + return EXIT_FAILURE; + } + + CASSIGN_SCALAR(phase, cos(TWOPI * phaseangle / 360.0), sin(TWOPI * phaseangle / 360.0)); + CASSIGN_MULT(scaleby, scaleby, phase); + + for (ifile = optind; ifile < argc; ++ifile) { + char *dname, *h5_fname; + h5_fname = split_fname(argv[ifile], &dname); + if (!dname[0]) dname = data_name; + + handle_file(h5_fname, out_fname, dname, rectify, have_ve, ve, resolution, scaleby, + multiply_size, pick_nearest, transpose); + + if (out_fname) free(out_fname); + out_fname = NULL; + free(h5_fname); + } + free(data_name); + + return EXIT_SUCCESS; }