Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support photon sharing on the GPU, fix #86 #99

Merged
merged 4 commits into from
Oct 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions mmclab/example/demo_photon_sharing.m
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
cfg.tstart = 0;
cfg.tend = 5e-9;
cfg.tstep = 1e-10;
cfg.gpuid = -1;
cfg.gpuid = 1;
cfg.method = 'elem';
cfg.debuglevel = 'TP';

Expand All @@ -30,7 +30,7 @@
pat(16:25, :, 5) = 1;
pat(:, 16:25, 5) = 1; % pattern with a bright cross

cfg.srcpattern = pat;
cfg.srcpattern = permute(pat, [3 1 2]);

%% run the simulation

Expand Down
14 changes: 1 addition & 13 deletions mmclab/mmc2json.m
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,6 @@
% mcxpreview supports the cfg input for both mcxlab and mmclab.
% filestub: the filestub is the name stub for all output files,including
% filestub.json: the JSON input file
% filestub_vol.bin: the volume file if cfg.vol is defined
% filestub_shapes.json: the domain shape file if cfg.shapes is defined
% filestub_pattern.bin: the domain shape file if cfg.pattern is defined
%
% if filestub ends with '.json', then mmc2json saves the mesh data
% in the single-file mode, and the mesh information is stored
Expand Down Expand Up @@ -58,16 +55,7 @@
end
end
if (isfield(cfg, 'srcpattern') && ~isempty(cfg.srcpattern))
Optode.Source.Pattern.Nx = size(cfg.srcpattern, 1);
Optode.Source.Pattern.Ny = size(cfg.srcpattern, 2);
Optode.Source.Pattern.Nz = size(cfg.srcpattern, 3);
Optode.Source.Pattern.Data = single(cfg.srcpattern);
if (~singlefile)
Optode.Source.Pattern.Data = [filestub '_pattern.bin'];
fid = fopen(Optode.Source.Pattern.Data, 'wb');
fwrite(fid, cfg.srcpattern, 'float32');
fclose(fid);
end
Optode.Source.Pattern = single(cfg.srcpattern);
end

%% define the domain and optical properties
Expand Down
4 changes: 2 additions & 2 deletions mmclab/mmclab.m
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,8 @@
% by srcpos, srcpos+srcparam1(1:3) and srcpos+srcparam2(1:3)
% 'pattern' - a 3D quadrilateral pattern illumination, same as above, except
% srcparam1(4) and srcparam2(4) specify the pattern array x/y dimensions,
% and srcpattern is a floating-point pattern array, with values between [0-1].
% if cfg.srcnum>1, srcpattern must be a floating-point array with
% and srcpattern is a double-precision pattern array, with values between [0-1].
% if cfg.srcnum>1, srcpattern must be a 3-D double-precision array with
% a dimension of [srcnum srcparam1(4) srcparam2(4)]
% Example: <demo_photon_sharing.m>
% 'fourier' - spatial frequency domain source, similar to 'planar', except
Expand Down
98 changes: 69 additions & 29 deletions src/mmc_cl_host.c
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,8 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
cl_mem* gweight, *gdref, *gdetphoton, *gseed, *genergy, *greporter, *gdebugdata; /*read-write buffers*/
cl_mem* gprogress = NULL, *gdetected = NULL, *gphotonseed = NULL; /*write-only buffers*/

cl_uint meshlen = ((cfg->method == rtBLBadouelGrid) ? cfg->crop0.z : mesh->ne) << cfg->nbuffer; // use 4 copies to reduce racing
cfg->crop0.w = meshlen * cfg->maxgate; // offset for the second buffer
cl_uint meshlen = ((cfg->method == rtBLBadouelGrid) ? cfg->crop0.z : mesh->ne) * cfg->srcnum; /**< total output data length in float count per time-frame */
cfg->crop0.w = meshlen * cfg->maxgate; /**< total output data length, before double-buffer expansion */

cl_float* field, *dref = NULL;

Expand All @@ -103,8 +103,11 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
char opt[MAX_PATH_LENGTH + 1] = {'\0'};
cl_uint detreclen = (2 + ((cfg->ismomentum) > 0)) * mesh->prop + (cfg->issaveexit > 0) * 6 + 1;
cl_uint hostdetreclen = detreclen + 1;
int sharedmemsize = 0;

GPUInfo* gpu = NULL;
float4* propdet;
double energytot = 0.0, energyesc = 0.0;

MCXParam param = {{{cfg->srcpos.x, cfg->srcpos.y, cfg->srcpos.z}}, {{cfg->srcdir.x, cfg->srcdir.y, cfg->srcdir.z}},
cfg->tstart, cfg->tend, (uint)cfg->isreflect, (uint)cfg->issavedet, (uint)cfg->issaveexit,
Expand All @@ -122,7 +125,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
mesh->nn, mesh->ne, mesh->nf, {{mesh->nmin.x, mesh->nmin.y, mesh->nmin.z}}, cfg->nout,
cfg->roulettesize, cfg->srcnum, {{cfg->crop0.x, cfg->crop0.y, cfg->crop0.z, cfg->crop0.w}},
mesh->srcelemlen, {{cfg->bary0.x, cfg->bary0.y, cfg->bary0.z, cfg->bary0.w}},
cfg->e0, cfg->isextdet, meshlen, cfg->nbuffer, (mesh->prop + 1 + cfg->isextdet) + cfg->detnum,
cfg->e0, cfg->isextdet, (meshlen / cfg->srcnum), (mesh->prop + 1 + cfg->isextdet) + cfg->detnum,
(MIN((MAX_PROP - param.maxpropdet), ((mesh->ne) << 2)) >> 2), /*max count of elem normal data in const mem*/
cfg->issaveseed, cfg->seed, cfg->maxjumpdebug
};
Expand All @@ -138,6 +141,18 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
mcx_error(-99, (char*)("Unable to find devices!"), __FILE__, __LINE__);
}

if (cfg->issavedet) {
sharedmemsize = sizeof(cl_float) * detreclen;
}

param.reclen = sharedmemsize / sizeof(float);

sharedmemsize += sizeof(cl_float) * (cfg->srcnum << 1); /**< store energyesc/energytot */

if (cfg->srctype == stPattern && cfg->srcnum > 1) {
sharedmemsize += sizeof(cl_float) * cfg->srcnum;
}

cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};

/* Use NULL for backward compatibility */
Expand Down Expand Up @@ -244,7 +259,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
Pphotonseed = (RandType*)calloc(cfg->maxdetphoton, (sizeof(RandType) * RAND_BUF_LEN));
}

fieldlen = meshlen * cfg->maxgate;
fieldlen = cfg->crop0.w; /**< total float counts of the output buffer, before double-buffer expansion (x2) for improving saving accuracy */

if (cfg->seed > 0) {
srand(cfg->seed);
Expand Down Expand Up @@ -298,7 +313,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
OCL_ASSERT(((gparam[i] = clCreateBuffer(mcxcontext, RO_MEM, sizeof(MCXParam), &param, &status), status)));

Pseed = (cl_uint*)malloc(sizeof(cl_uint) * gpu[i].autothread * RAND_SEED_WORD_LEN);
energy = (cl_float*)calloc(sizeof(cl_float), gpu[i].autothread << 1);
energy = (cl_float*)calloc(sizeof(cl_float) * cfg->srcnum, gpu[i].autothread << 1);

for (j = 0; j < gpu[i].autothread * RAND_SEED_WORD_LEN; j++) {
Pseed[j] = rand();
Expand All @@ -319,14 +334,14 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
OCL_ASSERT(((gdebugdata[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(float) * (debuglen * cfg->maxjumpdebug), cfg->exportdebugdata, &status), status)));
}

OCL_ASSERT(((genergy[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(float) * (gpu[i].autothread << 1), energy, &status), status)));
OCL_ASSERT(((genergy[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(float) * (gpu[i].autothread << 1) * cfg->srcnum, energy, &status), status)));
OCL_ASSERT(((gdetected[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(cl_uint), &detected, &status), status)));
OCL_ASSERT(((greporter[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(MCXReporter), &reporter, &status), status)));

if (cfg->srctype == MCX_SRC_PATTERN) {
OCL_ASSERT(((gsrcpattern[i] = clCreateBuffer(mcxcontext, RO_MEM, sizeof(float) * (int)(cfg->srcparam1.w * cfg->srcparam2.w), cfg->srcpattern, &status), status)));
OCL_ASSERT(((gsrcpattern[i] = clCreateBuffer(mcxcontext, RO_MEM, sizeof(float) * (int)(cfg->srcparam1.w * cfg->srcparam2.w * cfg->srcnum), cfg->srcpattern, &status), status)));
} else if (cfg->srctype == MCX_SRC_PATTERN3D) {
OCL_ASSERT(((gsrcpattern[i] = clCreateBuffer(mcxcontext, RO_MEM, sizeof(float) * (int)(cfg->srcparam1.x * cfg->srcparam1.y * cfg->srcparam1.z), cfg->srcpattern, &status), status)));
OCL_ASSERT(((gsrcpattern[i] = clCreateBuffer(mcxcontext, RO_MEM, sizeof(float) * (int)(cfg->srcparam1.x * cfg->srcparam1.y * cfg->srcparam1.z * cfg->srcnum), cfg->srcpattern, &status), status)));
} else {
gsrcpattern[i] = NULL;
}
Expand Down Expand Up @@ -417,6 +432,10 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
sprintf(opt + strlen(opt), " -DUSE_BLBADOUEL");
}

if (cfg->srctype == stPattern && cfg->srcnum > 1) {
sprintf(opt + strlen(opt), " -DUSE_PHOTON_SHARING");
}

if (gpu[0].vendor == dvNVIDIA) {
sprintf(opt + strlen(opt), " -DUSE_NVIDIA_GPU");
}
Expand Down Expand Up @@ -493,11 +512,13 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
MMC_FPRINTF(cfg->flog, "- [device %d(%d): %s] threadph=%d oddphotons=%d np=%.1f nthread=%d nblock=%d repetition=%d\n", i, gpu[i].id, gpu[i].name, threadphoton, oddphotons,
cfg->nphoton * cfg->workload[i] / fullload, (int)gpu[i].autothread, (int)gpu[i].autoblock, cfg->respin);

MMC_FPRINTF(cfg->flog, "requesting %d bytes of shared memory\n", sharedmemsize * (int)gpu[i].autoblock);

OCL_ASSERT(((mcxkernel[i] = clCreateKernel(mcxprogram, "mmc_main_loop", &status), status)));
OCL_ASSERT((clSetKernelArg(mcxkernel[i], 0, sizeof(cl_uint), (void*)&threadphoton)));
OCL_ASSERT((clSetKernelArg(mcxkernel[i], 1, sizeof(cl_uint), (void*)&oddphotons)));
//OCL_ASSERT((clSetKernelArg(mcxkernel[i], 2, sizeof(cl_mem), (void*)(gparam+i))));
OCL_ASSERT((clSetKernelArg(mcxkernel[i], 3, cfg->issavedet ? sizeof(cl_float) * ((int)gpu[i].autoblock)*detreclen : sizeof(int), NULL)));
OCL_ASSERT((clSetKernelArg(mcxkernel[i], 3, sharedmemsize * (int)gpu[i].autoblock, NULL)));
OCL_ASSERT((clSetKernelArg(mcxkernel[i], 4, sizeof(cl_mem), (void*)(gproperty + i))));
OCL_ASSERT((clSetKernelArg(mcxkernel[i], 5, sizeof(cl_mem), (void*)(gnode + i))));
OCL_ASSERT((clSetKernelArg(mcxkernel[i], 6, sizeof(cl_mem), (void*)(gelem + i))));
Expand Down Expand Up @@ -540,8 +561,10 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
cfg->exportdebugdata = (float*)calloc(sizeof(float), (debuglen * cfg->maxjumpdebug));
}

cfg->energytot = 0.f;
cfg->energyesc = 0.f;
cfg->energytot = (double*)calloc(cfg->srcnum, sizeof(double));
cfg->energyesc = (double*)calloc(cfg->srcnum, sizeof(double));
energytot = 0.0;
energyesc = 0.0;
cfg->runtime = 0;

//simulate for all time-gates in maxgate groups per run
Expand Down Expand Up @@ -620,13 +643,17 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
reporter.raytet += rep.raytet;
reporter.jumpdebug += rep.jumpdebug;

energy = (cl_float*)calloc(sizeof(cl_float), gpu[devid].autothread << 1);
OCL_ASSERT((clEnqueueReadBuffer(mcxqueue[devid], genergy[devid], CL_TRUE, 0, sizeof(cl_float) * (gpu[devid].autothread << 1),
energy = (cl_float*)calloc(sizeof(cl_float) * cfg->srcnum, gpu[devid].autothread << 1);
OCL_ASSERT((clEnqueueReadBuffer(mcxqueue[devid], genergy[devid], CL_TRUE, 0, sizeof(cl_float) * (gpu[devid].autothread << 1) * cfg->srcnum,
energy, 0, NULL, NULL)));

for (i = 0; i < gpu[devid].autothread; i++) {
cfg->energyesc += energy[(i << 1)];
cfg->energytot += energy[(i << 1) + 1];
for (j = 0; j < cfg->srcnum; j++) {
cfg->energyesc[j] += energy[(i << 1) * cfg->srcnum + j];
cfg->energytot[j] += energy[((i << 1) + 1) * cfg->srcnum + j];
energyesc += energy[(i << 1) * cfg->srcnum + j];
energytot += energy[((i << 1) + 1) * cfg->srcnum + j];
}
}

free(energy);
Expand Down Expand Up @@ -691,6 +718,7 @@ is more than what your have specified (%d), please use the -H option to specify
OCL_ASSERT((clEnqueueReadBuffer(mcxqueue[devid], gdref[devid], CL_TRUE, 0, sizeof(float)*nflen,
rawdref, 0, NULL, NULL)));

//TODO: saving dref has not yet adopting double-buffer
for (i = 0; i < nflen; i++) { //accumulate field, can be done in the GPU
dref[i] += rawdref[i]; //+rawfield[i+fieldlen];
}
Expand All @@ -708,7 +736,7 @@ is more than what your have specified (%d), please use the -H option to specify
mcx_fflush(cfg->flog);

for (i = 0; i < fieldlen; i++) { //accumulate field, can be done in the GPU
field[(i >> cfg->nbuffer)] += rawfield[i] + rawfield[i + fieldlen]; //+rawfield[i+fieldlen];
field[i] += rawfield[i] + rawfield[i + fieldlen];
}

free(rawfield);
Expand All @@ -732,24 +760,26 @@ is more than what your have specified (%d), please use the -H option to specify
}// iteration
}// time gates

fieldlen = (fieldlen >> cfg->nbuffer);
field = realloc(field, sizeof(field[0]) * fieldlen);

if (cfg->exportfield) {
if (cfg->basisorder == 0 || cfg->method == rtBLBadouelGrid) {
for (i = 0; i < fieldlen; i++) {
cfg->exportfield[i] += field[i];
}
} else {
for (i = 0; i < cfg->maxgate; i++)
int srcid;

for (i = 0; i < cfg->maxgate; i++) {
for (j = 0; j < mesh->ne; j++) {
float ww = field[i * mesh->ne + j] * 0.25f;
int k;
for (srcid = 0; srcid < cfg->srcnum; srcid++) {
float ww = field[(i * mesh->ne + j) * cfg->srcnum + srcid] * 0.25f;
int k;

for (k = 0; k < mesh->elemlen; k++) {
cfg->exportfield[i * mesh->nn + mesh->elem[j * mesh->elemlen + k] - 1] += ww;
for (k = 0; k < mesh->elemlen; k++) {
cfg->exportfield[(i * mesh->nn + mesh->elem[j * mesh->elemlen + k] - 1) * cfg->srcnum + srcid] += ww;
}
}
}
}
}
}

Expand All @@ -760,11 +790,17 @@ is more than what your have specified (%d), please use the -H option to specify
}

if (cfg->isnormalized) {
MMC_FPRINTF(cfg->flog, "normalizing raw data ...\t");
mcx_fflush(cfg->flog);
double cur_normalizer, sum_normalizer = 0.0, energyabs = 0.0;

for (j = 0; j < cfg->srcnum; j++) {
energyabs = cfg->energytot[j] - cfg->energyesc[j];
cur_normalizer = mesh_normalize(mesh, cfg, energyabs, cfg->energytot[j], j);
sum_normalizer += cur_normalizer;
MMCDEBUG(cfg, dlTime, (cfg->flog, "source %d\ttotal simulated energy: %f\tabsorbed: "S_BOLD""S_BLUE"%5.5f%%"S_RESET"\tnormalizor=%g\n",
j + 1, cfg->energytot[j], 100.f * energyabs / cfg->energytot[j], cur_normalizer));
}

cfg->energyabs = cfg->energytot - cfg->energyesc;
mesh_normalize(mesh, cfg, cfg->energyabs, cfg->energytot, 0);
cfg->his.normalizer = sum_normalizer / cfg->srcnum; // average normalizer value for all simulated sources
}

#ifndef MCX_CONTAINER
Expand Down Expand Up @@ -804,7 +840,7 @@ is more than what your have specified (%d), please use the -H option to specify
MMC_FPRINTF(cfg->flog, "simulated %zu photons (%zu) with %d devices (ray-tet %.0f)\nMCX simulation speed: %.2f photon/ms\n",
cfg->nphoton, cfg->nphoton, workdev, reporter.raytet, (double)cfg->nphoton / toc);
MMC_FPRINTF(cfg->flog, "total simulated energy: %.2f\tabsorbed: %5.5f%%\n(loss due to initial specular reflection is excluded in the total)\n",
cfg->energytot, (cfg->energytot - cfg->energyesc) / cfg->energytot * 100.f);
energytot, (energytot - energyesc) / energytot * 100.f);
mcx_fflush(cfg->flog);

OCL_ASSERT(clReleaseMemObject(gprogress[0]));
Expand Down Expand Up @@ -886,6 +922,10 @@ is more than what your have specified (%d), please use the -H option to specify
free(mcxkernel);

free(waittoread);
free(cfg->energytot);
free(cfg->energyesc);
cfg->energytot = NULL;
cfg->energyesc = NULL;

if (gpu) {
free(gpu);
Expand Down
1 change: 0 additions & 1 deletion src/mmc_cl_host.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,6 @@ typedef struct PRE_ALIGN(32) GPU_mcconfig {
cl_int e0;
cl_int isextdet;
cl_int framelen;
cl_uint nbuffer;
cl_uint maxpropdet;
cl_uint normbuf;
cl_int issaveseed;
Expand Down
Loading
Loading