From 749f0eb180e0923a2a8480ef11a949801565c00d Mon Sep 17 00:00:00 2001 From: Qianqian Fang Date: Mon, 23 Sep 2024 22:46:28 -0400 Subject: [PATCH 1/4] [feat] implement photon sharing on GPU, not working, stalls, #72 --- mmclab/example/demo_photon_sharing.m | 4 +- mmclab/mmclab.m | 4 +- src/mmc_cl_host.c | 31 ++- src/mmc_cl_host.h | 1 - src/mmc_core.cl | 278 ++++++++++++++++++--------- src/mmc_cu_host.cu | 37 ++-- src/mmc_raytrace.c | 31 ++- src/mmc_utils.c | 3 - src/mmc_utils.h | 1 - src/mmclab.cpp | 22 +-- 10 files changed, 252 insertions(+), 160 deletions(-) diff --git a/mmclab/example/demo_photon_sharing.m b/mmclab/example/demo_photon_sharing.m index c8d5dc85..15c5960d 100644 --- a/mmclab/example/demo_photon_sharing.m +++ b/mmclab/example/demo_photon_sharing.m @@ -16,7 +16,7 @@ cfg.tstart = 0; cfg.tend = 5e-9; cfg.tstep = 1e-10; -cfg.gpuid = -1; +cfg.gpuid = 1; cfg.method = 'elem'; cfg.debuglevel = 'TP'; @@ -30,7 +30,7 @@ pat(16:25, :, 5) = 1; pat(:, 16:25, 5) = 1; % pattern with a bright cross -cfg.srcpattern = pat; +cfg.srcpattern = permute(pat, [3 1 2]); %% run the simulation diff --git a/mmclab/mmclab.m b/mmclab/mmclab.m index f6781e55..22c6538d 100644 --- a/mmclab/mmclab.m +++ b/mmclab/mmclab.m @@ -105,8 +105,8 @@ % by srcpos, srcpos+srcparam1(1:3) and srcpos+srcparam2(1:3) % 'pattern' - a 3D quadrilateral pattern illumination, same as above, except % srcparam1(4) and srcparam2(4) specify the pattern array x/y dimensions, -% and srcpattern is a floating-point pattern array, with values between [0-1]. -% if cfg.srcnum>1, srcpattern must be a floating-point array with +% and srcpattern is a double-precision pattern array, with values between [0-1]. +% if cfg.srcnum>1, srcpattern must be a 3-D double-precision array with % a dimension of [srcnum srcparam1(4) srcparam2(4)] % Example: % 'fourier' - spatial frequency domain source, similar to 'planar', except diff --git a/src/mmc_cl_host.c b/src/mmc_cl_host.c index cc23c7fb..4879f263 100644 --- a/src/mmc_cl_host.c +++ b/src/mmc_cl_host.c @@ -92,7 +92,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) { cl_mem* gweight, *gdref, *gdetphoton, *gseed, *genergy, *greporter, *gdebugdata; /*read-write buffers*/ cl_mem* gprogress = NULL, *gdetected = NULL, *gphotonseed = NULL; /*write-only buffers*/ - cl_uint meshlen = ((cfg->method == rtBLBadouelGrid) ? cfg->crop0.z : mesh->ne) << cfg->nbuffer; // use 4 copies to reduce racing + cl_uint meshlen = ((cfg->method == rtBLBadouelGrid) ? cfg->crop0.z : mesh->ne) * cfg->srcnum; cfg->crop0.w = meshlen * cfg->maxgate; // offset for the second buffer cl_float* field, *dref = NULL; @@ -103,6 +103,8 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) { char opt[MAX_PATH_LENGTH + 1] = {'\0'}; cl_uint detreclen = (2 + ((cfg->ismomentum) > 0)) * mesh->prop + (cfg->issaveexit > 0) * 6 + 1; cl_uint hostdetreclen = detreclen + 1; + int sharedmemsize = 0; + GPUInfo* gpu = NULL; float4* propdet; @@ -122,7 +124,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) { mesh->nn, mesh->ne, mesh->nf, {{mesh->nmin.x, mesh->nmin.y, mesh->nmin.z}}, cfg->nout, cfg->roulettesize, cfg->srcnum, {{cfg->crop0.x, cfg->crop0.y, cfg->crop0.z, cfg->crop0.w}}, mesh->srcelemlen, {{cfg->bary0.x, cfg->bary0.y, cfg->bary0.z, cfg->bary0.w}}, - cfg->e0, cfg->isextdet, meshlen, cfg->nbuffer, (mesh->prop + 1 + cfg->isextdet) + cfg->detnum, + cfg->e0, cfg->isextdet, meshlen, (mesh->prop + 1 + cfg->isextdet) + cfg->detnum, (MIN((MAX_PROP - param.maxpropdet), ((mesh->ne) << 2)) >> 2), /*max count of elem normal data in const mem*/ cfg->issaveseed, cfg->seed, cfg->maxjumpdebug }; @@ -138,6 +140,16 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) { mcx_error(-99, (char*)("Unable to find devices!"), __FILE__, __LINE__); } + if (cfg->issavedet) { + sharedmemsize = sizeof(cl_float) * detreclen; + } + + param.reclen = sharedmemsize / sizeof(float); + + if (cfg->srctype == stPattern && cfg->srcnum > 1) { + sharedmemsize += sizeof(cl_float) * cfg->srcnum; + } + cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0}; /* Use NULL for backward compatibility */ @@ -324,9 +336,9 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) { OCL_ASSERT(((greporter[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(MCXReporter), &reporter, &status), status))); if (cfg->srctype == MCX_SRC_PATTERN) { - OCL_ASSERT(((gsrcpattern[i] = clCreateBuffer(mcxcontext, RO_MEM, sizeof(float) * (int)(cfg->srcparam1.w * cfg->srcparam2.w), cfg->srcpattern, &status), status))); + OCL_ASSERT(((gsrcpattern[i] = clCreateBuffer(mcxcontext, RO_MEM, sizeof(float) * (int)(cfg->srcparam1.w * cfg->srcparam2.w * cfg->srcnum), cfg->srcpattern, &status), status))); } else if (cfg->srctype == MCX_SRC_PATTERN3D) { - OCL_ASSERT(((gsrcpattern[i] = clCreateBuffer(mcxcontext, RO_MEM, sizeof(float) * (int)(cfg->srcparam1.x * cfg->srcparam1.y * cfg->srcparam1.z), cfg->srcpattern, &status), status))); + OCL_ASSERT(((gsrcpattern[i] = clCreateBuffer(mcxcontext, RO_MEM, sizeof(float) * (int)(cfg->srcparam1.x * cfg->srcparam1.y * cfg->srcparam1.z * cfg->srcnum), cfg->srcpattern, &status), status))); } else { gsrcpattern[i] = NULL; } @@ -417,6 +429,10 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) { sprintf(opt + strlen(opt), " -DUSE_BLBADOUEL"); } + if (cfg->srctype == stPattern && cfg->srcnum > 1) { + sprintf(opt + strlen(opt), " -DUSE_PHOTON_SHARING"); + } + if (gpu[0].vendor == dvNVIDIA) { sprintf(opt + strlen(opt), " -DUSE_NVIDIA_GPU"); } @@ -497,7 +513,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) { OCL_ASSERT((clSetKernelArg(mcxkernel[i], 0, sizeof(cl_uint), (void*)&threadphoton))); OCL_ASSERT((clSetKernelArg(mcxkernel[i], 1, sizeof(cl_uint), (void*)&oddphotons))); //OCL_ASSERT((clSetKernelArg(mcxkernel[i], 2, sizeof(cl_mem), (void*)(gparam+i)))); - OCL_ASSERT((clSetKernelArg(mcxkernel[i], 3, cfg->issavedet ? sizeof(cl_float) * ((int)gpu[i].autoblock)*detreclen : sizeof(int), NULL))); + OCL_ASSERT((clSetKernelArg(mcxkernel[i], 3, sharedmemsize * (int)gpu[i].autothread, NULL))); OCL_ASSERT((clSetKernelArg(mcxkernel[i], 4, sizeof(cl_mem), (void*)(gproperty + i)))); OCL_ASSERT((clSetKernelArg(mcxkernel[i], 5, sizeof(cl_mem), (void*)(gnode + i)))); OCL_ASSERT((clSetKernelArg(mcxkernel[i], 6, sizeof(cl_mem), (void*)(gelem + i)))); @@ -708,7 +724,7 @@ is more than what your have specified (%d), please use the -H option to specify mcx_fflush(cfg->flog); for (i = 0; i < fieldlen; i++) { //accumulate field, can be done in the GPU - field[(i >> cfg->nbuffer)] += rawfield[i] + rawfield[i + fieldlen]; //+rawfield[i+fieldlen]; + field[i] += rawfield[i] + rawfield[i + fieldlen]; //+rawfield[i+fieldlen]; } free(rawfield); @@ -732,9 +748,6 @@ is more than what your have specified (%d), please use the -H option to specify }// iteration }// time gates - fieldlen = (fieldlen >> cfg->nbuffer); - field = realloc(field, sizeof(field[0]) * fieldlen); - if (cfg->exportfield) { if (cfg->basisorder == 0 || cfg->method == rtBLBadouelGrid) { for (i = 0; i < fieldlen; i++) { diff --git a/src/mmc_cl_host.h b/src/mmc_cl_host.h index 2736392f..e1367efb 100644 --- a/src/mmc_cl_host.h +++ b/src/mmc_cl_host.h @@ -96,7 +96,6 @@ typedef struct PRE_ALIGN(32) GPU_mcconfig { cl_int e0; cl_int isextdet; cl_int framelen; - cl_uint nbuffer; cl_uint maxpropdet; cl_uint normbuf; cl_int issaveseed; diff --git a/src/mmc_core.cl b/src/mmc_core.cl index ef8d6417..975c09ae 100644 --- a/src/mmc_core.cl +++ b/src/mmc_core.cl @@ -239,6 +239,7 @@ inline __device__ __host__ float4 convert_float4_rte(float4 v) { #define R_C0 3.335640951981520e-12f //1/C0 in s/mm #define JUST_ABOVE_ONE 1.0001f //test for boundary +#define JUST_BELOW_ONE 0.9998f /**< test for boundary */ #define SAME_VOXEL -9999.f //scatter within a voxel #define NO_LAUNCH 9999 //when fail to launch, for debug @@ -283,8 +284,8 @@ typedef struct MMC_Ray { uint oldidx; float oldweight; unsigned int posidx; /**< launch position index of the photon for pattern source type */ - //int nexteid; /**< the index to the neighboring tet to be moved into */ - //float4 bary0; /**< the Barycentric coordinate of the intersection with the tet */ + //int nexteid; /**< the index to the neighboring tet to be moved into */ + //float4 bary0; /**< the Barycentric coordinate of the intersection with the tet */ float slen0; /**< initial unitless scattering length = length*mus */ unsigned int photonid; /**< index of the current photon */ } ray __attribute__ ((aligned (16))); @@ -301,13 +302,13 @@ typedef struct MMC_Parameter { uint maxmedia; uint detnum; int voidtime; - int srctype; /**< type of the source */ - float4 srcparam1; /**< source parameters set 1 */ - float4 srcparam2; /**< source parameters set 2 */ - uint issaveref; /**<1 save diffuse reflectance at the boundary voxels, 0 do not save*/ + int srctype; /**< type of the source */ + float4 srcparam1; /**< source parameters set 1 */ + float4 srcparam2; /**< source parameters set 2 */ + uint issaveref; /**<1 save diffuse reflectance at the boundary voxels, 0 do not save*/ uint maxgate; - uint debuglevel; /**< debug flags */ - int reclen; /**< record (4-byte per record) number per detected photon */ + uint debuglevel; /**< debug flags */ + int reclen; /**< record (4-byte per record) number per detected photon, does not include srcnum buffer for photon-sharing */ int outputtype; int elemlen; int mcmethod; @@ -325,12 +326,11 @@ typedef struct MMC_Parameter { int e0; int isextdet; int framelen; - uint nbuffer; uint maxpropdet; uint normbuf; int issaveseed; int seed; - uint maxjumpdebug; /**< max number of positions to be saved to save photon trajectory when -D M is used */ + uint maxjumpdebug; /**< max number of positions to be saved to save photon trajectory when -D M is used */ } MCXParam __attribute__ ((aligned (16))); typedef struct MMC_Reporter { @@ -339,10 +339,10 @@ typedef struct MMC_Reporter { } MCXReporter __attribute__ ((aligned (16))); typedef struct MCX_medium { - float mua; /**x; - n_det[baseaddr++] = p0->y; - n_det[baseaddr++] = p0->z; - n_det[baseaddr++] = v->x; - n_det[baseaddr++] = v->y; - n_det[baseaddr++] = v->z; + n_det[baseaddr++] = r->p0.x; + n_det[baseaddr++] = r->p0.y; + n_det[baseaddr++] = r->p0.z; + n_det[baseaddr++] = r->v.x; + n_det[baseaddr++] = r->v.y; + n_det[baseaddr++] = r->v.z; } n_det[baseaddr++] = ppath[GPU_PARAM(gcfg, reclen) - 1]; // save partial pathlength to the memory @@ -600,7 +582,7 @@ __device__ void savedebugdata(ray* r, uint id, __global MCXReporter* reporter, _ * \param[out] visit: statistics counters of this thread */ -__device__ float branchless_badouel_raytet(ray* r, __constant MCXParam* gcfg, __global int* elem, __global float* weight, +__device__ float branchless_badouel_raytet(ray* r, __constant MCXParam* gcfg, __local float* ppath, __global int* elem, __global float* weight, int type, __global int* facenb, __global float4* normal, __constant Medium* gmed, __global float* replayweight, __global float* replaytime) { float Lmin; @@ -719,20 +701,41 @@ __device__ float branchless_badouel_raytet(ray* r, __constant MCXParam* gcfg, __ #ifndef DO_NOT_SAVE if (r->oldweight > 0.f) { + if (GPU_PARAM(gcfg, srctype) != stPattern || GPU_PARAM(gcfg, srcnum) == 1) { #ifdef USE_ATOMIC - float oldval = atomicadd(weight + r->oldidx, r->oldweight); - - if (oldval > MAX_ACCUM) { - if (atomicadd(weight + r->oldidx, -oldval) < 0.0f) { - atomicadd(weight + r->oldidx, oldval); - } else { - atomicadd(weight + r->oldidx + gcfg->crop0.w, oldval); + float oldval = atomicadd(weight + r->oldidx, r->oldweight); + + if (oldval > MAX_ACCUM) { + if (atomicadd(weight + r->oldidx, -oldval) < 0.0f) { + atomicadd(weight + r->oldidx, oldval); + } else { + atomicadd(weight + r->oldidx + gcfg->crop0.w, oldval); + } } - } #else - weight[r->oldidx] += r->oldweight; + weight[r->oldidx] += r->oldweight; +#endif + } else if (GPU_PARAM(gcfg, srctype) == stPattern) { + + for (int pidx = 0; pidx < GPU_PARAM(gcfg, srcnum); pidx++) { +#ifdef USE_ATOMIC + float oldval = atomicadd(weight + r->oldidx * GPU_PARAM(gcfg, srcnum) + pidx, r->oldweight * ppath[GPU_PARAM(gcfg, reclen) + pidx]); + + if (oldval > MAX_ACCUM) { + if (atomicadd(weight + r->oldidx * GPU_PARAM(gcfg, srcnum) + pidx, -oldval) < 0.0f) { + atomicadd(weight + r->oldidx * GPU_PARAM(gcfg, srcnum) + pidx, oldval); + } else { + atomicadd(weight + r->oldidx * GPU_PARAM(gcfg, srcnum) + pidx + gcfg->crop0.w, oldval); + } + } + +#else + weight[r->oldidx * GPU_PARAM(gcfg, srcnum) + pidx] += r->oldweight * ppath[GPU_PARAM(gcfg, reclen) + pidx]; #endif + } + + } } #endif @@ -745,29 +748,54 @@ __device__ float branchless_badouel_raytet(ray* r, __constant MCXParam* gcfg, __ #ifndef DO_NOT_SAVE if (r->faceid == -2 || !r->isend) { + + if (GPU_PARAM(gcfg, srctype) != stPattern || GPU_PARAM(gcfg, srcnum) == 1) { + #ifdef USE_ATOMIC - float oldval = atomicadd(weight + newidx, r->oldweight); + float oldval = atomicadd(weight + newidx, r->oldweight); - if (oldval > MAX_ACCUM) { - if (atomicadd(weight + r->oldidx, -oldval) < 0.0f) { - atomicadd(weight + r->oldidx, oldval); - } else { - atomicadd(weight + r->oldidx + gcfg->crop0.w, oldval); + if (oldval > MAX_ACCUM) { + if (atomicadd(weight + newidx, -oldval) < 0.0f) { + atomicadd(weight + newidx, oldval); + } else { + atomicadd(weight + newidx + gcfg->crop0.w, oldval); + } } - } #else - weight[newidx] += r->oldweight; + weight[newidx] += r->oldweight; +#endif + } else if (GPU_PARAM(gcfg, srctype) == stPattern) { + + for (int pidx = 0; pidx < GPU_PARAM(gcfg, srcnum); pidx++) { +#ifdef USE_ATOMIC + float oldval = atomicadd(weight + newidx * GPU_PARAM(gcfg, srcnum) + pidx, r->oldweight * ppath[GPU_PARAM(gcfg, reclen) + pidx]); + + if (oldval > MAX_ACCUM) { + if (atomicadd(weight + newidx * GPU_PARAM(gcfg, srcnum) + pidx, -oldval) < 0.0f) { + atomicadd(weight + newidx * GPU_PARAM(gcfg, srcnum) + pidx, oldval); + } else { + atomicadd(weight + newidx * GPU_PARAM(gcfg, srcnum) + pidx + gcfg->crop0.w, oldval); + } + } + +#else + weight[newidx * GPU_PARAM(gcfg, srcnum) + pidx] += r->oldweight * ppath[GPU_PARAM(gcfg, reclen) + pidx]; #endif + } + + } + r->oldweight = 0.f; } -#endif +#endif // for ifdef DO_NOT_SAVE #ifdef __NVCC__ } -#endif -#endif +#endif // for if (GPU_PARAM(gcfg, method) == rtBLBadouel +#endif // for ifdef USE_BLBADOUEL + #ifdef USE_DMMC #ifdef __NVCC__ @@ -801,21 +829,45 @@ __device__ float branchless_badouel_raytet(ray* r, __constant MCXParam* gcfg, __ if (newidx != r->oldidx) { #ifndef DO_NOT_SAVE -#ifdef USE_ATOMIC - float oldval = atomicadd(weight + r->oldidx, r->oldweight); - if (oldval > MAX_ACCUM) { - if (atomicadd(weight + r->oldidx, -oldval) < 0.0f) { - atomicadd(weight + r->oldidx, oldval); - } else { - atomicadd(weight + r->oldidx + gcfg->crop0.w, oldval); + if (GPU_PARAM(gcfg, srctype) != stPattern || GPU_PARAM(gcfg, srcnum) == 1) { + +#ifdef USE_ATOMIC + float oldval = atomicadd(weight + r->oldidx, r->oldweight); + + if (oldval > MAX_ACCUM) { + if (atomicadd(weight + r->oldidx, -oldval) < 0.0f) { + atomicadd(weight + r->oldidx, oldval); + } else { + atomicadd(weight + r->oldidx + gcfg->crop0.w, oldval); + } } - } #else - weight[r->oldidx] += r->oldweight; + weight[r->oldidx] += r->oldweight; #endif + } else if (GPU_PARAM(gcfg, srctype) == stPattern) { + + for (int pidx = 0; pidx < GPU_PARAM(gcfg, srcnum); pidx++) { +#ifdef USE_ATOMIC + float oldval = atomicadd(weight + r->oldidx * GPU_PARAM(gcfg, srcnum) + pidx, r->oldweight * ppath[GPU_PARAM(gcfg, reclen) + pidx]); + + if (oldval > MAX_ACCUM) { + if (atomicadd(weight + r->oldidx * GPU_PARAM(gcfg, srcnum) + pidx, -oldval) < 0.0f) { + atomicadd(weight + r->oldidx * GPU_PARAM(gcfg, srcnum) + pidx, oldval); + } else { + atomicadd(weight + r->oldidx * GPU_PARAM(gcfg, srcnum) + pidx + gcfg->crop0.w, oldval); + } + } + +#else + weight[r->oldidx * GPU_PARAM(gcfg, srcnum) + pidx] += r->oldweight * ppath[GPU_PARAM(gcfg, reclen) + pidx]; #endif + } + + } + +#endif // for ifdef DO_NOT_SAVE r->oldidx = newidx; r->oldweight = S.w * totalloss; } else { @@ -825,24 +877,49 @@ __device__ float branchless_badouel_raytet(ray* r, __constant MCXParam* gcfg, __ #ifndef DO_NOT_SAVE if (r->faceid == -2 || !r->isend) { -#ifdef USE_ATOMIC - float oldval = atomicadd(weight + r->oldidx, r->oldweight); - if (oldval > MAX_ACCUM) { - if (atomicadd(weight + r->oldidx, -oldval) < 0.0f) { - atomicadd(weight + r->oldidx, oldval); - } else { - atomicadd(weight + r->oldidx + gcfg->crop0.w, oldval); + if (GPU_PARAM(gcfg, srctype) != stPattern || GPU_PARAM(gcfg, srcnum) == 1) { + +#ifdef USE_ATOMIC + float oldval = atomicadd(weight + newidx, r->oldweight); + + if (oldval > MAX_ACCUM) { + if (atomicadd(weight + newidx, -oldval) < 0.0f) { + atomicadd(weight + newidx, oldval); + } else { + atomicadd(weight + newidx + gcfg->crop0.w, oldval); + } } - } #else - weight[newidx] += r->oldweight; + weight[newidx] += r->oldweight; #endif + } else if (GPU_PARAM(gcfg, srctype) == stPattern) { + + for (int pidx = 0; pidx < GPU_PARAM(gcfg, srcnum); pidx++) { +#ifdef USE_ATOMIC + float oldval = atomicadd(weight + newidx * GPU_PARAM(gcfg, srcnum) + pidx, r->oldweight * ppath[GPU_PARAM(gcfg, reclen) + pidx]); + + if (oldval > MAX_ACCUM) { + if (atomicadd(weight + newidx * GPU_PARAM(gcfg, srcnum) + pidx, -oldval) < 0.0f) { + atomicadd(weight + newidx * GPU_PARAM(gcfg, srcnum) + pidx, oldval); + } else { + atomicadd(weight + newidx * GPU_PARAM(gcfg, srcnum) + pidx + gcfg->crop0.w, oldval); + } + } + +#else + weight[newidx * GPU_PARAM(gcfg, srcnum) + pidx] += r->oldweight * ppath[GPU_PARAM(gcfg, reclen) + pidx]; +#endif + } + + } + r->oldweight = 0.f; } -#endif +#endif // for ifdef DO_NOT_SAVE + #ifndef __NVCC__ S.w *= T.w; S.xyz += T.xyz; @@ -855,8 +932,8 @@ __device__ float branchless_badouel_raytet(ray* r, __constant MCXParam* gcfg, __ } #endif -#endif -#endif +#endif // for USE_DMMC +#endif // for MCX_SKIP_VOLUME } r->p0 = r->p0 + FL3(r->Lmove) * r->vec; @@ -1045,7 +1122,7 @@ __device__ void fixphoton(float3* p, __global float3* nodes, __global int* ee) { * \param[in,out] ran: the random number generator states */ -__device__ void launchphoton(__constant MCXParam* gcfg, ray* r, __global float3* node, __global int* elem, __global int* srcelem, __private RandType* ran, __global float* srcpattern) { +__device__ void launchnewphoton(__constant MCXParam* gcfg, ray* r, __global float3* node, __global int* elem, __global int* srcelem, __private RandType* ran, __global float* srcpattern) { int canfocus = 1; float3 origin = r->p0; @@ -1078,8 +1155,9 @@ __device__ void launchphoton(__constant MCXParam* gcfg, ray* r, __global float3* #endif int xsize = (int)gcfg->srcparam1.w; int ysize = (int)gcfg->srcparam2.w; - r->posidx = MIN((int)(ry * ysize), ysize - 1) * xsize + MIN((int)(rx * xsize), xsize - 1); - r->weight = srcpattern[MIN( (int)(ry * gcfg->srcparam2.w), (int)gcfg->srcparam2.w - 1 ) * (int)(gcfg->srcparam1.w) + MIN( (int)(rx * gcfg->srcparam1.w), (int)gcfg->srcparam1.w - 1 )]; + r->posidx = MIN((int)(ry * JUST_BELOW_ONE * ysize), ysize - 1) * xsize + MIN((int)(rx * JUST_BELOW_ONE * xsize), xsize - 1); + r->weight = (GPU_PARAM(gcfg, srcnum) > 1) ? 1.f : srcpattern[r->posidx]; + #endif #if defined(__NVCC__) || defined(MCX_SRC_FOURIER) // need to prevent rx/ry=1 here #ifdef __NVCC__ @@ -1385,17 +1463,28 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP #endif /*initialize the photon parameters*/ - launchphoton(gcfg, &r, node, elem, srcelem, ran, srcpattern); + launchnewphoton(gcfg, &r, node, elem, srcelem, ran, srcpattern); *energytot += r.weight; #ifdef MCX_SAVE_DETECTORS if (GPU_PARAM(gcfg, issavedet)) { clearpath(ppath, GPU_PARAM(gcfg, reclen)); /*clear shared memory for saving history of a new photon*/ - ppath[GPU_PARAM(gcfg, reclen) - 1] = r.weight; /*last record in partialpath is the initial photon weight*/ + + if (GPU_PARAM(gcfg, srctype) != stPattern || GPU_PARAM(gcfg, srcnum) == 1) { + ppath[GPU_PARAM(gcfg, reclen) - 1] = r.weight; /*last record in partialpath is the initial photon weight*/ + } else if (GPU_PARAM(gcfg, srctype) == stPattern) { + *((uint*)(ppath + GPU_PARAM(gcfg, reclen) - 1)) = r.posidx; + } } #endif + if (GPU_PARAM(gcfg, srctype) == stPattern && GPU_PARAM(gcfg, srcnum) > 1) { + for (oldeid = 0; oldeid > GPU_PARAM(gcfg, srcnum); oldeid++) { + ppath[GPU_PARAM(gcfg, reclen) + oldeid] = srcpattern[r.posidx * GPU_PARAM(gcfg, srcnum) + oldeid]; + } + } + if (GPU_PARAM(gcfg, debuglevel) & dlTraj) { savedebugdata(&r, id, reporter, gdebugdata, gcfg); } @@ -1404,7 +1493,7 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP /*http://stackoverflow.com/questions/2148149/how-to-sum-a-large-number-of-float-number*/ while (1) { /*propagate a photon until exit*/ - r.slen = branchless_badouel_raytet(&r, gcfg, elem, weight, type[r.eid - 1], facenb, normal, gmed, replayweight, replaytime); + r.slen = branchless_badouel_raytet(&r, gcfg, ppath, elem, weight, type[r.eid - 1], facenb, normal, gmed, replayweight, replaytime); (*raytet)++; if (r.pout.x == MMC_UNDEFINED) { @@ -1477,7 +1566,7 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP GPUDEBUG(("P %f %f %f %d %u %e\n", r.pout.x, r.pout.y, r.pout.z, r.eid, id, r.slen)); } - r.slen = branchless_badouel_raytet(&r, gcfg, elem, weight, type[r.eid - 1], facenb, normal, gmed, replayweight, replaytime); + r.slen = branchless_badouel_raytet(&r, gcfg, ppath, elem, weight, type[r.eid - 1], facenb, normal, gmed, replayweight, replaytime); (*raytet)++; #ifdef MCX_SAVE_DETECTORS @@ -1495,7 +1584,7 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP while (r.pout.x == MMC_UNDEFINED && fixcount++ < MAX_TRIAL) { fixphoton(&r.p0, node, (__global int*)(elem + (r.eid - 1)*GPU_PARAM(gcfg, elemlen))); - r.slen = branchless_badouel_raytet(&r, gcfg, elem, weight, type[r.eid - 1], facenb, normal, gmed, replayweight, replaytime); + r.slen = branchless_badouel_raytet(&r, gcfg, ppath, elem, weight, type[r.eid - 1], facenb, normal, gmed, replayweight, replaytime); (*raytet)++; #ifdef MCX_SAVE_DETECTORS @@ -1550,17 +1639,17 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP #ifdef MCX_SAVE_SEED if (GPU_PARAM(gcfg, isextdet) && type[oldeid - 1] == GPU_PARAM(gcfg, maxmedia) + 1) { - savedetphoton(n_det, detectedphoton, ppath, &(r.p0), &(r.vec), gmed, oldeid, gcfg, photonseed, initseed); + savedetphoton(n_det, detectedphoton, ppath, &r, gmed, oldeid, gcfg, photonseed, initseed); } else { - savedetphoton(n_det, detectedphoton, ppath, &(r.p0), &(r.vec), gmed, -1, gcfg, photonseed, initseed); + savedetphoton(n_det, detectedphoton, ppath, &r, gmed, -1, gcfg, photonseed, initseed); } #else if (GPU_PARAM(gcfg, isextdet) && type[oldeid - 1] == GPU_PARAM(gcfg, maxmedia) + 1) { - savedetphoton(n_det, detectedphoton, ppath, &(r.p0), &(r.vec), gmed, oldeid, gcfg, photonseed, NULL); + savedetphoton(n_det, detectedphoton, ppath, &r, gmed, oldeid, gcfg, photonseed, NULL); } else { - savedetphoton(n_det, detectedphoton, ppath, &(r.p0), &(r.vec), gmed, -1, gcfg, photonseed, NULL); + savedetphoton(n_det, detectedphoton, ppath, &r, gmed, -1, gcfg, photonseed, NULL); } #endif @@ -1643,7 +1732,8 @@ __kernel void mmc_main_loop(const int nphoton, const int ophoton, t[j] = replayseed[(idx * nphoton + MIN(idx, ophoton) + i) * RAND_BUF_LEN + j]; } - onephoton(idx * nphoton + MIN(idx, ophoton) + i, sharedmem + get_local_id(0)*GPU_PARAM(gcfg, reclen), gcfg, node, elem, + onephoton(idx * nphoton + MIN(idx, ophoton) + i, sharedmem + get_local_id(0) * + (GPU_PARAM(gcfg, reclen) + (GPU_PARAM(gcfg, srcnum) > 1) * GPU_PARAM(gcfg, srcnum)), gcfg, node, elem, weight, dref, type, facenb, srcelem, normal, gmed, n_det, detectedphoton, &energytot, &energyesc, t, &raytet, srcpattern, replayweight, replaytime, photonseed, reporter, gdebugdata); } diff --git a/src/mmc_cu_host.cu b/src/mmc_cu_host.cu index c4b7b7d1..e50a1e61 100644 --- a/src/mmc_cu_host.cu +++ b/src/mmc_cu_host.cu @@ -224,8 +224,7 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo float* greplayweight = NULL, *greplaytime = NULL; MCXReporter* greporter; - uint meshlen = ((cfg->method == rtBLBadouelGrid) ? cfg->crop0.z : mesh->ne) - << cfg->nbuffer; // use 4 copies to reduce racing + uint meshlen = ((cfg->method == rtBLBadouelGrid) ? cfg->crop0.z : mesh->ne) * cfg->srcnum; cfg->crop0.w = meshlen * cfg->maxgate; // offset for the second buffer float* field, *dref = NULL; @@ -237,6 +236,8 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo uint detreclen = (2 + ((cfg->ismomentum) > 0)) * mesh->prop + (cfg->issaveexit > 0) * 6 + 1; uint hostdetreclen = detreclen + 1; + // launch mcxkernel + size_t sharedmemsize = 0; MCXParam param = {make_float3(cfg->srcpos.x, cfg->srcpos.y, cfg->srcpos.z), make_float3(cfg->srcdir.x, cfg->srcdir.y, cfg->srcdir.z), @@ -280,7 +281,6 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo cfg->e0, cfg->isextdet, (int)meshlen, - cfg->nbuffer, (uint)(mesh->prop + 1 + cfg->isextdet) + cfg->detnum, (uint)(MIN((MAX_PROP - (mesh->prop + 1 + cfg->isextdet) - cfg->detnum), ((mesh->ne) << 2)) >> 2), /*max count of elem normal data in const mem*/ cfg->issaveseed, @@ -290,6 +290,17 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo MCXReporter reporter = {0.f, 0}; + if (cfg->issavedet) { + sharedmemsize = sizeof(float) * detreclen; + } + + param.reclen = sharedmemsize / sizeof(float); //< the shared memory buffer length associated with detected photon + + if (cfg->srctype == stPattern && cfg->srcnum > 1) { + sharedmemsize += sizeof(float) * cfg->srcnum; + } + + sharedmemsize *= ((int)gpu[gpuid].autoblock); #ifdef _OPENMP threadid = omp_get_thread_num(); @@ -502,17 +513,17 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo if (cfg->srctype == MCX_SRC_PATTERN) { CUDA_ASSERT(cudaMalloc((void**)&gsrcpattern, - sizeof(float) * (int)(cfg->srcparam1.w * cfg->srcparam2.w))); + sizeof(float) * (int)(cfg->srcparam1.w * cfg->srcparam2.w * cfg->srcnum))); CUDA_ASSERT(cudaMemcpy(gsrcpattern, cfg->srcpattern, - sizeof(float) * (int)(cfg->srcparam1.w * cfg->srcparam2.w), + sizeof(float) * (int)(cfg->srcparam1.w * cfg->srcparam2.w * cfg->srcnum), cudaMemcpyHostToDevice)); } else if (cfg->srctype == MCX_SRC_PATTERN3D) { CUDA_ASSERT(cudaMalloc((void**)&gsrcpattern, sizeof(float) * (int)(cfg->srcparam1.x * cfg->srcparam1.y* - cfg->srcparam1.z))); + cfg->srcparam1.z * cfg->srcnum))); CUDA_ASSERT(cudaMemcpy(gsrcpattern, cfg->srcpattern, sizeof(float) * (int)(cfg->srcparam1.x * cfg->srcparam1.y* - cfg->srcparam1.z), + cfg->srcparam1.z * cfg->srcnum), cudaMemcpyHostToDevice)); } else { gsrcpattern = NULL; @@ -591,14 +602,8 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo param.tstart = twindow0; param.tend = twindow1; - // launch mcxkernel - size_t sharedMemSize = sizeof(int); - - if (cfg->issavedet) { - sharedMemSize = sizeof(float) * ((int)gpu[gpuid].autoblock) * detreclen; - } - mmc_main_loop <<< mcgrid, mcblock, sharedMemSize>>>( + mmc_main_loop <<< mcgrid, mcblock, sharedmemsize>>>( threadphoton, oddphotons, gnode, (int*)gelem, gweight, gdref, gtype, (int*)gfacenb, gsrcelem, gnormal, gdetphoton, gdetected, gseed, (int*)gprogress, genergy, greporter, @@ -752,7 +757,7 @@ are more than what your have specified (%d), please use the --maxjumpdebug optio mcx_fflush(cfg->flog); for (i = 0; i < fieldlen; i++) { // accumulate field, can be done in the GPU - field[(i >> cfg->nbuffer)] += rawfield[i] + rawfield[i + fieldlen]; //+rawfield[i+fieldlen]; + field[i] += rawfield[i] + rawfield[i + fieldlen]; //+rawfield[i+fieldlen]; } free(rawfield); @@ -780,8 +785,6 @@ are more than what your have specified (%d), please use the --maxjumpdebug optio #pragma omp master { int i, j; - fieldlen = (fieldlen >> cfg->nbuffer); - field = (float*)realloc(field, sizeof(field[0]) * fieldlen); if (cfg->exportfield) { if (cfg->basisorder == 0 || cfg->method == rtBLBadouelGrid) { diff --git a/src/mmc_raytrace.c b/src/mmc_raytrace.c index b0bea517..1a0e5d49 100644 --- a/src/mmc_raytrace.c +++ b/src/mmc_raytrace.c @@ -386,12 +386,11 @@ float plucker_raytet(ray* r, raytracer* tracer, mcconfig* cfg, visitor* visit) { #pragma omp atomic tracer->mesh->weight[eid + tshift] += ww; } else if (cfg->srctype == stPattern) { // must be pattern and srcnum more than 1 - int psize = (int)cfg->srcparam1.w * (int)cfg->srcparam2.w; // total number of pixels in each pattern int pidx; // pattern index for (pidx = 0; pidx < cfg->srcnum; pidx++) { #pragma omp atomic - tracer->mesh->weight[(eid + tshift)*cfg->srcnum + pidx] += ww * cfg->srcpattern[pidx * psize + r->posidx]; + tracer->mesh->weight[(eid + tshift)*cfg->srcnum + pidx] += ww * cfg->srcpattern[r->posidx * cfg->srcnum + pidx]; } } } @@ -437,13 +436,12 @@ float plucker_raytet(ray* r, raytracer* tracer, mcconfig* cfg, visitor* visit) { tracer->mesh->weight[ee[i] - 1 + tshift] += ww * (baryp0[i] + baryout[i]); } } else if (cfg->srctype == stPattern) { // must be pattern and srcnum more than 1 - int psize = (int)cfg->srcparam1.w * (int)cfg->srcparam2.w; // total number of pixels in each pattern int pidx; // pattern index for (pidx = 0; pidx < cfg->srcnum; pidx++) { for (i = 0; i < 4; i++) { #pragma omp atomic - tracer->mesh->weight[(ee[i] - 1 + tshift)*cfg->srcnum + pidx] += ww * cfg->srcpattern[pidx * psize + r->posidx] * (baryp0[i] + baryout[i]); + tracer->mesh->weight[(ee[i] - 1 + tshift)*cfg->srcnum + pidx] += ww * cfg->srcpattern[r->posidx * cfg->srcnum + pidx] * (baryp0[i] + baryout[i]); } } } @@ -716,12 +714,11 @@ float havel_raytet(ray* r, raytracer* tracer, mcconfig* cfg, visitor* visit) { #pragma omp atomic tracer->mesh->weight[eid + tshift] += ww; } else if (cfg->srctype == stPattern) { // must be pattern and srcnum more than 1 - int psize = (int)cfg->srcparam1.w * (int)cfg->srcparam2.w; // total number of pixels in each pattern int pidx; // pattern index for (pidx = 0; pidx < cfg->srcnum; pidx++) { #pragma omp atomic - tracer->mesh->weight[(eid + tshift)*cfg->srcnum + pidx] += ww * cfg->srcpattern[pidx * psize + r->posidx]; + tracer->mesh->weight[(eid + tshift)*cfg->srcnum + pidx] += ww * cfg->srcpattern[r->posidx * cfg->srcnum + pidx]; } } } else { @@ -734,13 +731,12 @@ float havel_raytet(ray* r, raytracer* tracer, mcconfig* cfg, visitor* visit) { tracer->mesh->weight[ee[j] - 1 + tshift] += barypout[j]; } } else if (cfg->srctype == stPattern) { - int psize = (int)cfg->srcparam1.w * (int)cfg->srcparam2.w; // total number of pixels in each pattern int pidx; // pattern index for (pidx = 0; pidx < cfg->srcnum; pidx++) { for (j = 0; j < 4; j++) { #pragma omp atomic - tracer->mesh->weight[(ee[j] - 1 + tshift)*cfg->srcnum + pidx] += barypout[j] * cfg->srcpattern[pidx * psize + r->posidx]; + tracer->mesh->weight[(ee[j] - 1 + tshift)*cfg->srcnum + pidx] += barypout[j] * cfg->srcpattern[r->posidx * cfg->srcnum + pidx]; } } } @@ -1405,7 +1401,6 @@ float branchless_badouel_raytet(ray* r, raytracer* tracer, mcconfig* cfg, visito medium* prop; int* enb, *ee = (int*)(tracer->mesh->elem + eid * tracer->mesh->elemlen); float mus; - int psize = (int)cfg->srcparam1.w * (int)cfg->srcparam2.w; // total number of pixels in each pattern int pidx; // pattern index if (cfg->implicit == 1 && r->inroi && tracer->mesh->edgeroi && fabs(tracer->mesh->edgeroi[eid * 6]) < EPS) { @@ -1526,7 +1521,7 @@ float branchless_badouel_raytet(ray* r, raytracer* tracer, mcconfig* cfg, visito } else if (cfg->srctype == stPattern) { for (pidx = 0; pidx < cfg->srcnum; pidx++) { #pragma omp atomic - tracer->mesh->weight[r->oldidx * cfg->srcnum + pidx] += r->oldweight * cfg->srcpattern[pidx * psize + r->posidx]; + tracer->mesh->weight[r->oldidx * cfg->srcnum + pidx] += r->oldweight * cfg->srcpattern[r->posidx * cfg->srcnum + pidx]; } } @@ -1543,7 +1538,7 @@ float branchless_badouel_raytet(ray* r, raytracer* tracer, mcconfig* cfg, visito } else if (cfg->srctype == stPattern) { for (pidx = 0; pidx < cfg->srcnum; pidx++) { #pragma omp atomic - tracer->mesh->weight[newidx * cfg->srcnum + pidx] += r->oldweight * cfg->srcpattern[pidx * psize + r->posidx]; + tracer->mesh->weight[newidx * cfg->srcnum + pidx] += r->oldweight * cfg->srcpattern[r->posidx * cfg->srcnum + pidx]; } } @@ -1577,7 +1572,7 @@ float branchless_badouel_raytet(ray* r, raytracer* tracer, mcconfig* cfg, visito for (pidx = 0; pidx < cfg->srcnum; pidx++) { for (pidx = 0; pidx < cfg->srcnum; pidx++) { #pragma omp atomic - tracer->mesh->weight[r->oldidx * cfg->srcnum + pidx] += r->oldweight * cfg->srcpattern[pidx * psize + r->posidx]; + tracer->mesh->weight[r->oldidx * cfg->srcnum + pidx] += r->oldweight * cfg->srcpattern[r->posidx * cfg->srcnum + pidx]; } } } @@ -1596,7 +1591,7 @@ float branchless_badouel_raytet(ray* r, raytracer* tracer, mcconfig* cfg, visito for (pidx = 0; pidx < cfg->srcnum; pidx++) { for (pidx = 0; pidx < cfg->srcnum; pidx++) { #pragma omp atomic - tracer->mesh->weight[newidx * cfg->srcnum + pidx] += r->oldweight * cfg->srcpattern[pidx * psize + r->posidx]; + tracer->mesh->weight[newidx * cfg->srcnum + pidx] += r->oldweight * cfg->srcpattern[r->posidx * cfg->srcnum + pidx]; } } } @@ -1621,7 +1616,7 @@ float branchless_badouel_raytet(ray* r, raytracer* tracer, mcconfig* cfg, visito for (pidx = 0; pidx < cfg->srcnum; pidx++) { for (i = 0; i < 3; i++) { #pragma omp atomic - tracer->mesh->weight[(ee[out[faceidx][i]] - 1 + tshift)*cfg->srcnum + pidx] += ww * cfg->srcpattern[pidx * psize + r->posidx]; + tracer->mesh->weight[(ee[out[faceidx][i]] - 1 + tshift)*cfg->srcnum + pidx] += ww * cfg->srcpattern[r->posidx * cfg->srcnum + pidx]; } } } @@ -1761,13 +1756,11 @@ void onephoton(size_t id, raytracer* tracer, tetmesh* mesh, mcconfig* cfg, } else if (cfg->srctype == stPattern) { *((int*)(r.partialpath + visit->reclen - 2)) = r.posidx; - int psize = (int)cfg->srcparam1.w * (int)cfg->srcparam2.w; - for (pidx = 0; pidx < cfg->srcnum; pidx++) { if (cfg->seed == SEED_FROM_FILE && (cfg->outputtype == otWL || cfg->outputtype == otWP)) { kahany = cfg->replayweight[r.photonid] - visit->kahanc0[pidx]; /* when replay mode, accumulate detected photon weight */ } else { - kahany = r.weight * cfg->srcpattern[pidx * psize + r.posidx] - visit->kahanc0[pidx]; + kahany = r.weight * cfg->srcpattern[r.posidx * cfg->srcnum + pidx] - visit->kahanc0[pidx]; } kahant = visit->launchweight[pidx] + kahany; @@ -2033,10 +2026,8 @@ void onephoton(size_t id, raytracer* tracer, tetmesh* mesh, mcconfig* cfg, visit->kahanc1[0] = (kahant - visit->absorbweight[0]) - kahany; visit->absorbweight[0] = kahant; } else if (cfg->srctype == stPattern) { - int psize = (int)cfg->srcparam1.w * (int)cfg->srcparam2.w; - for (pidx = 0; pidx < cfg->srcnum; pidx++) { - kahany = r.Eabsorb * cfg->srcpattern[pidx * psize + r.posidx] - visit->kahanc1[pidx]; + kahany = r.Eabsorb * cfg->srcpattern[r.posidx * cfg->srcnum + pidx] - visit->kahanc1[pidx]; kahant = visit->absorbweight[pidx] + kahany; visit->kahanc1[pidx] = (kahant - visit->absorbweight[pidx]) - kahany; visit->absorbweight[pidx] = kahant; diff --git a/src/mmc_utils.c b/src/mmc_utils.c index 9a8192d7..aab0d51a 100644 --- a/src/mmc_utils.c +++ b/src/mmc_utils.c @@ -326,7 +326,6 @@ void mcx_initcfg(mcconfig* cfg) { cfg->energyesc = 0.f; cfg->runtime = 0; cfg->autopilot = 1; - cfg->nbuffer = 0; cfg->gpuid = 0; cfg->maxjumpdebug = 10000000; cfg->exportdebugdata = NULL; @@ -3216,8 +3215,6 @@ void mcx_parsecmd(int argc, char* argv[], mcconfig* cfg) { i = mcx_readarg(argc, argv, i, &(cfg->debugphoton), "int"); } else if (strcmp(argv[i] + 2, "maxjumpdebug") == 0) { i = mcx_readarg(argc, argv, i, &(cfg->maxjumpdebug), "int"); - } else if (strcmp(argv[i] + 2, "buffer") == 0) { - i = mcx_readarg(argc, argv, i, &(cfg->nbuffer), "int"); } else if (strcmp(argv[i] + 2, "gridsize") == 0) { i = mcx_readarg(argc, argv, i, &(cfg->steps.x), "float"); cfg->steps.y = cfg->steps.x; diff --git a/src/mmc_utils.h b/src/mmc_utils.h index 6086cd41..0e0487ac 100644 --- a/src/mmc_utils.h +++ b/src/mmc_utils.h @@ -281,7 +281,6 @@ typedef struct MMC_config { unsigned int runtime; /**photonseed != NULL) { return; @@ -778,25 +777,26 @@ void mmc_set_field(const mxArray* root, const mxArray* item, int idx, mcconfig* printf("mmc.session='%s';\n", cfg->session); } else if (strcmp(name, "srcpattern") == 0) { arraydim = mxGetDimensions(item); + dimtype dimz = 1; + + if (mxGetNumberOfDimensions(item) == 3) { + dimz = arraydim[2]; + cfg->srcnum = arraydim[0]; + } + double* val = mxGetPr(item); if (cfg->srcpattern) { free(cfg->srcpattern); } - if (mxGetNumberOfDimensions(item) == 3) { - cfg->srcnum = arraydim[2]; - } else { - cfg->srcnum = 1; - } - - cfg->srcpattern = (float*)malloc(arraydim[0] * arraydim[1] * cfg->srcnum * sizeof(float)); + cfg->srcpattern = (float*)malloc(arraydim[0] * arraydim[1] * dimz * sizeof(float)); - for (k = 0; k < arraydim[0]*arraydim[1]*cfg->srcnum; k++) { - cfg->srcpattern[k] = val[k]; + for (i = 0; i < arraydim[0]*arraydim[1]*dimz; i++) { + cfg->srcpattern[i] = val[i]; } - printf("mmc.srcpattern=[%d %d %d];\n", (int)arraydim[0], (int)arraydim[1], cfg->srcnum); + printf("mmc.srcpattern=[%ld %ld %ld];\n", arraydim[0], arraydim[1], dimz); } else if (strcmp(name, "method") == 0) { int len = mxGetNumberOfElements(item); const char* methods[] = {"plucker", "havel", "badouel", "elem", "grid", ""}; From 4e6bfad3d04a0afa004f16eed3f4a0a584251a58 Mon Sep 17 00:00:00 2001 From: Qianqian Fang Date: Fri, 11 Oct 2024 22:14:48 -0400 Subject: [PATCH 2/4] [bug] fix sharedmem buffer length error, thanks to ShijieYan --- src/mmc_cl_host.c | 2 +- src/mmc_core.cl | 2 +- src/mmclab.cpp | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/mmc_cl_host.c b/src/mmc_cl_host.c index 4879f263..8dc0a1b1 100644 --- a/src/mmc_cl_host.c +++ b/src/mmc_cl_host.c @@ -513,7 +513,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) { OCL_ASSERT((clSetKernelArg(mcxkernel[i], 0, sizeof(cl_uint), (void*)&threadphoton))); OCL_ASSERT((clSetKernelArg(mcxkernel[i], 1, sizeof(cl_uint), (void*)&oddphotons))); //OCL_ASSERT((clSetKernelArg(mcxkernel[i], 2, sizeof(cl_mem), (void*)(gparam+i)))); - OCL_ASSERT((clSetKernelArg(mcxkernel[i], 3, sharedmemsize * (int)gpu[i].autothread, NULL))); + OCL_ASSERT((clSetKernelArg(mcxkernel[i], 3, sharedmemsize * (int)gpu[i].autoblock, NULL))); OCL_ASSERT((clSetKernelArg(mcxkernel[i], 4, sizeof(cl_mem), (void*)(gproperty + i)))); OCL_ASSERT((clSetKernelArg(mcxkernel[i], 5, sizeof(cl_mem), (void*)(gnode + i)))); OCL_ASSERT((clSetKernelArg(mcxkernel[i], 6, sizeof(cl_mem), (void*)(gelem + i)))); diff --git a/src/mmc_core.cl b/src/mmc_core.cl index 975c09ae..72a97375 100644 --- a/src/mmc_core.cl +++ b/src/mmc_core.cl @@ -901,7 +901,7 @@ __device__ float branchless_badouel_raytet(ray* r, __constant MCXParam* gcfg, __ float oldval = atomicadd(weight + newidx * GPU_PARAM(gcfg, srcnum) + pidx, r->oldweight * ppath[GPU_PARAM(gcfg, reclen) + pidx]); if (oldval > MAX_ACCUM) { - if (atomicadd(weight + newidx * GPU_PARAM(gcfg, srcnum) + pidx, -oldval) < 0.0f) { + if (atomicadd(weight + newidx * GPU_PARAM(gcfg, srcnum) + pidx, -oldval) < 0.0f) { atomicadd(weight + newidx * GPU_PARAM(gcfg, srcnum) + pidx, oldval); } else { atomicadd(weight + newidx * GPU_PARAM(gcfg, srcnum) + pidx + gcfg->crop0.w, oldval); diff --git a/src/mmclab.cpp b/src/mmclab.cpp index 662b34fd..3d75fe88 100644 --- a/src/mmclab.cpp +++ b/src/mmclab.cpp @@ -777,7 +777,7 @@ void mmc_set_field(const mxArray* root, const mxArray* item, int idx, mcconfig* printf("mmc.session='%s';\n", cfg->session); } else if (strcmp(name, "srcpattern") == 0) { arraydim = mxGetDimensions(item); - dimtype dimz = 1; + dimtype dimz = 1, k; if (mxGetNumberOfDimensions(item) == 3) { dimz = arraydim[2]; @@ -792,8 +792,8 @@ void mmc_set_field(const mxArray* root, const mxArray* item, int idx, mcconfig* cfg->srcpattern = (float*)malloc(arraydim[0] * arraydim[1] * dimz * sizeof(float)); - for (i = 0; i < arraydim[0]*arraydim[1]*dimz; i++) { - cfg->srcpattern[i] = val[i]; + for (k = 0; k < arraydim[0]*arraydim[1]*dimz; k++) { + cfg->srcpattern[k] = val[k]; } printf("mmc.srcpattern=[%ld %ld %ld];\n", arraydim[0], arraydim[1], dimz); From b3f97397a92dea9237f0b4769973228ae3976753 Mon Sep 17 00:00:00 2001 From: Qianqian Fang Date: Sun, 13 Oct 2024 11:57:08 -0400 Subject: [PATCH 3/4] [bug] fix cuda compilation errors --- src/mmc_core.cl | 8 ++++---- src/mmc_cu_host.cu | 4 ++-- src/mmc_utils.c | 4 ---- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/mmc_core.cl b/src/mmc_core.cl index 72a97375..2f952017 100644 --- a/src/mmc_core.cl +++ b/src/mmc_core.cl @@ -506,7 +506,7 @@ __device__ uint finddetector(float3* p0, __constant float4* gmed, __constant MCX __device__ void savedetphoton(__global float* n_det, __global uint* detectedphoton, __local float* ppath, ray* r, __constant Medium* gmed, int extdetid, __constant MCXParam* gcfg, __global RandType* photonseed, RandType* initseed) { - uint detid = (extdetid < 0) ? finddetector(p0, (__constant float4*)gmed, gcfg) : extdetid; + uint detid = (extdetid < 0) ? finddetector(&(r->p0), (__constant float4*)gmed, gcfg) : extdetid; if (detid) { uint baseaddr = atomic_inc(detectedphoton); @@ -535,9 +535,9 @@ __device__ void savedetphoton(__global float* n_det, __global uint* detectedphot n_det[baseaddr++] = r->p0.x; n_det[baseaddr++] = r->p0.y; n_det[baseaddr++] = r->p0.z; - n_det[baseaddr++] = r->v.x; - n_det[baseaddr++] = r->v.y; - n_det[baseaddr++] = r->v.z; + n_det[baseaddr++] = r->vec.x; + n_det[baseaddr++] = r->vec.y; + n_det[baseaddr++] = r->vec.z; } n_det[baseaddr++] = ppath[GPU_PARAM(gcfg, reclen) - 1]; // save partial pathlength to the memory diff --git a/src/mmc_cu_host.cu b/src/mmc_cu_host.cu index e50a1e61..f14102d4 100644 --- a/src/mmc_cu_host.cu +++ b/src/mmc_cu_host.cu @@ -300,6 +300,8 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo sharedmemsize += sizeof(float) * cfg->srcnum; } + gpuid = cfg->deviceid[threadid] - 1; + sharedmemsize *= ((int)gpu[gpuid].autoblock); #ifdef _OPENMP @@ -310,8 +312,6 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo return; } - gpuid = cfg->deviceid[threadid] - 1; - if (gpuid < 0) { mcx_error(-1, "GPU ID must be non-zero", __FILE__, __LINE__); } diff --git a/src/mmc_utils.c b/src/mmc_utils.c index aab0d51a..404a149a 100644 --- a/src/mmc_utils.c +++ b/src/mmc_utils.c @@ -2847,10 +2847,6 @@ void mcx_validatecfg(mcconfig* cfg) { MMC_ERROR(-2, "Implicit MMC is currently only supported in the CPU, please set -G -1 or cfg.gpuid=-1"); } - if (cfg->srcnum > 1 && (int)(cfg->gpuid) >= 0) { - MMC_ERROR(-2, "Photon-sharing MMC is currently only supported in the CPU, please set -G -1 or cfg.gpuid=-1"); - } - for (i = 0; i < MAX_DEVICE; i++) if (cfg->deviceid[i] == '0') { cfg->deviceid[i] = '\0'; From 072fafe4305cd3c2da78094b0622dc4c67267983 Mon Sep 17 00:00:00 2001 From: Qianqian Fang Date: Mon, 14 Oct 2024 22:06:17 -0400 Subject: [PATCH 4/4] [bug] fix gpu photon-sharing hanging, support mmc binary mode, close #86 --- mmclab/mmc2json.m | 14 +-------- src/mmc_cl_host.c | 73 ++++++++++++++++++++++++++++++-------------- src/mmc_core.cl | 39 +++++++++++++++--------- src/mmc_cu_host.cu | 75 +++++++++++++++++++++++++++++++--------------- src/mmc_utils.c | 44 +++++++++++++++++++++++++-- src/mmc_utils.h | 4 ++- 6 files changed, 171 insertions(+), 78 deletions(-) diff --git a/mmclab/mmc2json.m b/mmclab/mmc2json.m index 7f5d8211..90f1839b 100644 --- a/mmclab/mmc2json.m +++ b/mmclab/mmc2json.m @@ -13,9 +13,6 @@ % mcxpreview supports the cfg input for both mcxlab and mmclab. % filestub: the filestub is the name stub for all output files,including % filestub.json: the JSON input file -% filestub_vol.bin: the volume file if cfg.vol is defined -% filestub_shapes.json: the domain shape file if cfg.shapes is defined -% filestub_pattern.bin: the domain shape file if cfg.pattern is defined % % if filestub ends with '.json', then mmc2json saves the mesh data % in the single-file mode, and the mesh information is stored @@ -58,16 +55,7 @@ end end if (isfield(cfg, 'srcpattern') && ~isempty(cfg.srcpattern)) - Optode.Source.Pattern.Nx = size(cfg.srcpattern, 1); - Optode.Source.Pattern.Ny = size(cfg.srcpattern, 2); - Optode.Source.Pattern.Nz = size(cfg.srcpattern, 3); - Optode.Source.Pattern.Data = single(cfg.srcpattern); - if (~singlefile) - Optode.Source.Pattern.Data = [filestub '_pattern.bin']; - fid = fopen(Optode.Source.Pattern.Data, 'wb'); - fwrite(fid, cfg.srcpattern, 'float32'); - fclose(fid); - end + Optode.Source.Pattern = single(cfg.srcpattern); end %% define the domain and optical properties diff --git a/src/mmc_cl_host.c b/src/mmc_cl_host.c index 8dc0a1b1..b98f30a7 100644 --- a/src/mmc_cl_host.c +++ b/src/mmc_cl_host.c @@ -92,8 +92,8 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) { cl_mem* gweight, *gdref, *gdetphoton, *gseed, *genergy, *greporter, *gdebugdata; /*read-write buffers*/ cl_mem* gprogress = NULL, *gdetected = NULL, *gphotonseed = NULL; /*write-only buffers*/ - cl_uint meshlen = ((cfg->method == rtBLBadouelGrid) ? cfg->crop0.z : mesh->ne) * cfg->srcnum; - cfg->crop0.w = meshlen * cfg->maxgate; // offset for the second buffer + cl_uint meshlen = ((cfg->method == rtBLBadouelGrid) ? cfg->crop0.z : mesh->ne) * cfg->srcnum; /**< total output data length in float count per time-frame */ + cfg->crop0.w = meshlen * cfg->maxgate; /**< total output data length, before double-buffer expansion */ cl_float* field, *dref = NULL; @@ -107,6 +107,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) { GPUInfo* gpu = NULL; float4* propdet; + double energytot = 0.0, energyesc = 0.0; MCXParam param = {{{cfg->srcpos.x, cfg->srcpos.y, cfg->srcpos.z}}, {{cfg->srcdir.x, cfg->srcdir.y, cfg->srcdir.z}}, cfg->tstart, cfg->tend, (uint)cfg->isreflect, (uint)cfg->issavedet, (uint)cfg->issaveexit, @@ -124,7 +125,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) { mesh->nn, mesh->ne, mesh->nf, {{mesh->nmin.x, mesh->nmin.y, mesh->nmin.z}}, cfg->nout, cfg->roulettesize, cfg->srcnum, {{cfg->crop0.x, cfg->crop0.y, cfg->crop0.z, cfg->crop0.w}}, mesh->srcelemlen, {{cfg->bary0.x, cfg->bary0.y, cfg->bary0.z, cfg->bary0.w}}, - cfg->e0, cfg->isextdet, meshlen, (mesh->prop + 1 + cfg->isextdet) + cfg->detnum, + cfg->e0, cfg->isextdet, (meshlen / cfg->srcnum), (mesh->prop + 1 + cfg->isextdet) + cfg->detnum, (MIN((MAX_PROP - param.maxpropdet), ((mesh->ne) << 2)) >> 2), /*max count of elem normal data in const mem*/ cfg->issaveseed, cfg->seed, cfg->maxjumpdebug }; @@ -146,6 +147,8 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) { param.reclen = sharedmemsize / sizeof(float); + sharedmemsize += sizeof(cl_float) * (cfg->srcnum << 1); /**< store energyesc/energytot */ + if (cfg->srctype == stPattern && cfg->srcnum > 1) { sharedmemsize += sizeof(cl_float) * cfg->srcnum; } @@ -256,7 +259,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) { Pphotonseed = (RandType*)calloc(cfg->maxdetphoton, (sizeof(RandType) * RAND_BUF_LEN)); } - fieldlen = meshlen * cfg->maxgate; + fieldlen = cfg->crop0.w; /**< total float counts of the output buffer, before double-buffer expansion (x2) for improving saving accuracy */ if (cfg->seed > 0) { srand(cfg->seed); @@ -310,7 +313,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) { OCL_ASSERT(((gparam[i] = clCreateBuffer(mcxcontext, RO_MEM, sizeof(MCXParam), ¶m, &status), status))); Pseed = (cl_uint*)malloc(sizeof(cl_uint) * gpu[i].autothread * RAND_SEED_WORD_LEN); - energy = (cl_float*)calloc(sizeof(cl_float), gpu[i].autothread << 1); + energy = (cl_float*)calloc(sizeof(cl_float) * cfg->srcnum, gpu[i].autothread << 1); for (j = 0; j < gpu[i].autothread * RAND_SEED_WORD_LEN; j++) { Pseed[j] = rand(); @@ -331,7 +334,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) { OCL_ASSERT(((gdebugdata[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(float) * (debuglen * cfg->maxjumpdebug), cfg->exportdebugdata, &status), status))); } - OCL_ASSERT(((genergy[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(float) * (gpu[i].autothread << 1), energy, &status), status))); + OCL_ASSERT(((genergy[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(float) * (gpu[i].autothread << 1) * cfg->srcnum, energy, &status), status))); OCL_ASSERT(((gdetected[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(cl_uint), &detected, &status), status))); OCL_ASSERT(((greporter[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(MCXReporter), &reporter, &status), status))); @@ -509,6 +512,8 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) { MMC_FPRINTF(cfg->flog, "- [device %d(%d): %s] threadph=%d oddphotons=%d np=%.1f nthread=%d nblock=%d repetition=%d\n", i, gpu[i].id, gpu[i].name, threadphoton, oddphotons, cfg->nphoton * cfg->workload[i] / fullload, (int)gpu[i].autothread, (int)gpu[i].autoblock, cfg->respin); + MMC_FPRINTF(cfg->flog, "requesting %d bytes of shared memory\n", sharedmemsize * (int)gpu[i].autoblock); + OCL_ASSERT(((mcxkernel[i] = clCreateKernel(mcxprogram, "mmc_main_loop", &status), status))); OCL_ASSERT((clSetKernelArg(mcxkernel[i], 0, sizeof(cl_uint), (void*)&threadphoton))); OCL_ASSERT((clSetKernelArg(mcxkernel[i], 1, sizeof(cl_uint), (void*)&oddphotons))); @@ -556,8 +561,10 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) { cfg->exportdebugdata = (float*)calloc(sizeof(float), (debuglen * cfg->maxjumpdebug)); } - cfg->energytot = 0.f; - cfg->energyesc = 0.f; + cfg->energytot = (double*)calloc(cfg->srcnum, sizeof(double)); + cfg->energyesc = (double*)calloc(cfg->srcnum, sizeof(double)); + energytot = 0.0; + energyesc = 0.0; cfg->runtime = 0; //simulate for all time-gates in maxgate groups per run @@ -636,13 +643,17 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) { reporter.raytet += rep.raytet; reporter.jumpdebug += rep.jumpdebug; - energy = (cl_float*)calloc(sizeof(cl_float), gpu[devid].autothread << 1); - OCL_ASSERT((clEnqueueReadBuffer(mcxqueue[devid], genergy[devid], CL_TRUE, 0, sizeof(cl_float) * (gpu[devid].autothread << 1), + energy = (cl_float*)calloc(sizeof(cl_float) * cfg->srcnum, gpu[devid].autothread << 1); + OCL_ASSERT((clEnqueueReadBuffer(mcxqueue[devid], genergy[devid], CL_TRUE, 0, sizeof(cl_float) * (gpu[devid].autothread << 1) * cfg->srcnum, energy, 0, NULL, NULL))); for (i = 0; i < gpu[devid].autothread; i++) { - cfg->energyesc += energy[(i << 1)]; - cfg->energytot += energy[(i << 1) + 1]; + for (j = 0; j < cfg->srcnum; j++) { + cfg->energyesc[j] += energy[(i << 1) * cfg->srcnum + j]; + cfg->energytot[j] += energy[((i << 1) + 1) * cfg->srcnum + j]; + energyesc += energy[(i << 1) * cfg->srcnum + j]; + energytot += energy[((i << 1) + 1) * cfg->srcnum + j]; + } } free(energy); @@ -707,6 +718,7 @@ is more than what your have specified (%d), please use the -H option to specify OCL_ASSERT((clEnqueueReadBuffer(mcxqueue[devid], gdref[devid], CL_TRUE, 0, sizeof(float)*nflen, rawdref, 0, NULL, NULL))); + //TODO: saving dref has not yet adopting double-buffer for (i = 0; i < nflen; i++) { //accumulate field, can be done in the GPU dref[i] += rawdref[i]; //+rawfield[i+fieldlen]; } @@ -724,7 +736,7 @@ is more than what your have specified (%d), please use the -H option to specify mcx_fflush(cfg->flog); for (i = 0; i < fieldlen; i++) { //accumulate field, can be done in the GPU - field[i] += rawfield[i] + rawfield[i + fieldlen]; //+rawfield[i+fieldlen]; + field[i] += rawfield[i] + rawfield[i + fieldlen]; } free(rawfield); @@ -754,15 +766,20 @@ is more than what your have specified (%d), please use the -H option to specify cfg->exportfield[i] += field[i]; } } else { - for (i = 0; i < cfg->maxgate; i++) + int srcid; + + for (i = 0; i < cfg->maxgate; i++) { for (j = 0; j < mesh->ne; j++) { - float ww = field[i * mesh->ne + j] * 0.25f; - int k; + for (srcid = 0; srcid < cfg->srcnum; srcid++) { + float ww = field[(i * mesh->ne + j) * cfg->srcnum + srcid] * 0.25f; + int k; - for (k = 0; k < mesh->elemlen; k++) { - cfg->exportfield[i * mesh->nn + mesh->elem[j * mesh->elemlen + k] - 1] += ww; + for (k = 0; k < mesh->elemlen; k++) { + cfg->exportfield[(i * mesh->nn + mesh->elem[j * mesh->elemlen + k] - 1) * cfg->srcnum + srcid] += ww; + } } } + } } } @@ -773,11 +790,17 @@ is more than what your have specified (%d), please use the -H option to specify } if (cfg->isnormalized) { - MMC_FPRINTF(cfg->flog, "normalizing raw data ...\t"); - mcx_fflush(cfg->flog); + double cur_normalizer, sum_normalizer = 0.0, energyabs = 0.0; + + for (j = 0; j < cfg->srcnum; j++) { + energyabs = cfg->energytot[j] - cfg->energyesc[j]; + cur_normalizer = mesh_normalize(mesh, cfg, energyabs, cfg->energytot[j], j); + sum_normalizer += cur_normalizer; + MMCDEBUG(cfg, dlTime, (cfg->flog, "source %d\ttotal simulated energy: %f\tabsorbed: "S_BOLD""S_BLUE"%5.5f%%"S_RESET"\tnormalizor=%g\n", + j + 1, cfg->energytot[j], 100.f * energyabs / cfg->energytot[j], cur_normalizer)); + } - cfg->energyabs = cfg->energytot - cfg->energyesc; - mesh_normalize(mesh, cfg, cfg->energyabs, cfg->energytot, 0); + cfg->his.normalizer = sum_normalizer / cfg->srcnum; // average normalizer value for all simulated sources } #ifndef MCX_CONTAINER @@ -817,7 +840,7 @@ is more than what your have specified (%d), please use the -H option to specify MMC_FPRINTF(cfg->flog, "simulated %zu photons (%zu) with %d devices (ray-tet %.0f)\nMCX simulation speed: %.2f photon/ms\n", cfg->nphoton, cfg->nphoton, workdev, reporter.raytet, (double)cfg->nphoton / toc); MMC_FPRINTF(cfg->flog, "total simulated energy: %.2f\tabsorbed: %5.5f%%\n(loss due to initial specular reflection is excluded in the total)\n", - cfg->energytot, (cfg->energytot - cfg->energyesc) / cfg->energytot * 100.f); + energytot, (energytot - energyesc) / energytot * 100.f); mcx_fflush(cfg->flog); OCL_ASSERT(clReleaseMemObject(gprogress[0])); @@ -899,6 +922,10 @@ is more than what your have specified (%d), please use the -H option to specify free(mcxkernel); free(waittoread); + free(cfg->energytot); + free(cfg->energyesc); + cfg->energytot = NULL; + cfg->energyesc = NULL; if (gpu) { free(gpu); diff --git a/src/mmc_core.cl b/src/mmc_core.cl index 2f952017..0bc30bab 100644 --- a/src/mmc_core.cl +++ b/src/mmc_core.cl @@ -47,6 +47,9 @@ inline __device__ __host__ int get_global_id(int idx) { inline __device__ __host__ int get_local_id(int idx) { return (idx == 0) ? threadIdx.x : ( (idx == 1) ? threadIdx.y : threadIdx.z ); } +inline __device__ __host__ int get_local_size(int idx) { + return (idx == 0) ? blockDim.x : ( (idx == 1) ? blockDim.y : blockDim.z ); +} inline __device__ __host__ float3 operator *(float3 a, float3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); } @@ -480,7 +483,6 @@ __device__ inline float atomicadd(volatile __global float* address, const float #endif -#ifdef MCX_SAVE_DETECTORS __device__ void clearpath(__local float* p, int len) { uint i; @@ -489,6 +491,7 @@ __device__ void clearpath(__local float* p, int len) { } } +#ifdef MCX_SAVE_DETECTORS __device__ uint finddetector(float3* p0, __constant float4* gmed, __constant MCXParam* gcfg) { uint i; @@ -1443,7 +1446,7 @@ __device__ void launchnewphoton(__constant MCXParam* gcfg, ray* r, __global floa __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXParam* gcfg, __global float3* node, __global int* elem, __global float* weight, __global float* dref, __global int* type, __global int* facenb, __global int* srcelem, __global float4* normal, __constant Medium* gmed, - __global float* n_det, __global uint* detectedphoton, float* energytot, float* energyesc, __private RandType* ran, int* raytet, __global float* srcpattern, + __global float* n_det, __global uint* detectedphoton, __local float* energytot, __local float* energyesc, __private RandType* ran, int* raytet, __global float* srcpattern, __global float* replayweight, __global float* replaytime, __global RandType* photonseed, __global MCXReporter* reporter, __global float* gdebugdata) { int oldeid, fixcount = 0; @@ -1464,12 +1467,10 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP /*initialize the photon parameters*/ launchnewphoton(gcfg, &r, node, elem, srcelem, ran, srcpattern); - *energytot += r.weight; + #ifdef MCX_SAVE_DETECTORS if (GPU_PARAM(gcfg, issavedet)) { - clearpath(ppath, GPU_PARAM(gcfg, reclen)); /*clear shared memory for saving history of a new photon*/ - if (GPU_PARAM(gcfg, srctype) != stPattern || GPU_PARAM(gcfg, srcnum) == 1) { ppath[GPU_PARAM(gcfg, reclen) - 1] = r.weight; /*last record in partialpath is the initial photon weight*/ } else if (GPU_PARAM(gcfg, srctype) == stPattern) { @@ -1479,9 +1480,12 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP #endif - if (GPU_PARAM(gcfg, srctype) == stPattern && GPU_PARAM(gcfg, srcnum) > 1) { - for (oldeid = 0; oldeid > GPU_PARAM(gcfg, srcnum); oldeid++) { + if (GPU_PARAM(gcfg, srcnum) == 1) { + *energytot += r.weight; + } else { + for (oldeid = 0; oldeid < GPU_PARAM(gcfg, srcnum); oldeid++) { ppath[GPU_PARAM(gcfg, reclen) + oldeid] = srcpattern[r.posidx * GPU_PARAM(gcfg, srcnum) + oldeid]; + energytot[oldeid] += r.weight * ppath[GPU_PARAM(gcfg, reclen) + oldeid]; } } @@ -1700,7 +1704,9 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP savedebugdata(&r, id, reporter, gdebugdata, gcfg); } - *energyesc += r.weight; + for (oldeid = 0; oldeid < GPU_PARAM(gcfg, srcnum); oldeid++) { + energyesc[oldeid] += r.weight * ppath[GPU_PARAM(gcfg, reclen) + oldeid]; + } } __kernel void mmc_main_loop(const int nphoton, const int ophoton, @@ -1714,7 +1720,6 @@ __kernel void mmc_main_loop(const int nphoton, const int ophoton, RandType t[RAND_BUF_LEN]; int idx = get_global_id(0); - float energyesc = 0.f, energytot = 0.f; int raytet = 0; #ifdef __NVCC__ @@ -1725,6 +1730,9 @@ __kernel void mmc_main_loop(const int nphoton, const int ophoton, gpu_rng_init(t, n_seed, idx); } + clearpath(sharedmem, get_local_size(0) * ((GPU_PARAM(gcfg, srcnum) << 1) + + (GPU_PARAM(gcfg, reclen) + (GPU_PARAM(gcfg, srcnum) > 1) * GPU_PARAM(gcfg, srcnum)))); + /*launch photons*/ for (int i = 0; i < nphoton + (idx < ophoton); i++) { if (GPU_PARAM(gcfg, seed) == SEED_FROM_FILE) @@ -1732,14 +1740,17 @@ __kernel void mmc_main_loop(const int nphoton, const int ophoton, t[j] = replayseed[(idx * nphoton + MIN(idx, ophoton) + i) * RAND_BUF_LEN + j]; } - onephoton(idx * nphoton + MIN(idx, ophoton) + i, sharedmem + get_local_id(0) * - (GPU_PARAM(gcfg, reclen) + (GPU_PARAM(gcfg, srcnum) > 1) * GPU_PARAM(gcfg, srcnum)), gcfg, node, elem, - weight, dref, type, facenb, srcelem, normal, gmed, n_det, detectedphoton, &energytot, &energyesc, t, &raytet, + onephoton(idx * nphoton + MIN(idx, ophoton) + i, sharedmem + get_local_size(0) * (GPU_PARAM(gcfg, srcnum) << 1) + + get_local_id(0) * (GPU_PARAM(gcfg, reclen) + (GPU_PARAM(gcfg, srcnum) > 1) * GPU_PARAM(gcfg, srcnum)), gcfg, node, elem, + weight, dref, type, facenb, srcelem, normal, gmed, n_det, detectedphoton, sharedmem + get_local_id(0) * GPU_PARAM(gcfg, srcnum), + sharedmem + (get_local_size(0) + get_local_id(0)) * GPU_PARAM(gcfg, srcnum), t, &raytet, srcpattern, replayweight, replaytime, photonseed, reporter, gdebugdata); } - energy[idx << 1] = energyesc; - energy[1 + (idx << 1)] = energytot; + for (int i = 0; i < GPU_PARAM(gcfg, srcnum); i++) { + energy[(idx << 1) * GPU_PARAM(gcfg, srcnum) + i] += sharedmem[(get_local_size(0) + get_local_id(0)) * GPU_PARAM(gcfg, srcnum) + i]; + energy[((idx << 1) + 1) * GPU_PARAM(gcfg, srcnum) + i] += sharedmem[get_local_id(0) * GPU_PARAM(gcfg, srcnum) + i]; + } if (GPU_PARAM(gcfg, debuglevel) & MCX_DEBUG_PROGRESS && progress) { atomic_inc(progress); diff --git a/src/mmc_cu_host.cu b/src/mmc_cu_host.cu index f14102d4..d6290a95 100644 --- a/src/mmc_cu_host.cu +++ b/src/mmc_cu_host.cu @@ -238,6 +238,7 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo uint hostdetreclen = detreclen + 1; // launch mcxkernel size_t sharedmemsize = 0; + double energytot = 0.0, energyesc = 0.0; MCXParam param = {make_float3(cfg->srcpos.x, cfg->srcpos.y, cfg->srcpos.z), make_float3(cfg->srcdir.x, cfg->srcdir.y, cfg->srcdir.z), @@ -280,7 +281,7 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo cfg->bary0, cfg->e0, cfg->isextdet, - (int)meshlen, + (int)(meshlen / cfg->srcnum), (uint)(mesh->prop + 1 + cfg->isextdet) + cfg->detnum, (uint)(MIN((MAX_PROP - (mesh->prop + 1 + cfg->isextdet) - cfg->detnum), ((mesh->ne) << 2)) >> 2), /*max count of elem normal data in const mem*/ cfg->issaveseed, @@ -296,6 +297,8 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo param.reclen = sharedmemsize / sizeof(float); //< the shared memory buffer length associated with detected photon + sharedmemsize += sizeof(float) * (cfg->srcnum << 1); /**< store energyesc/energytot */ + if (cfg->srctype == stPattern && cfg->srcnum > 1) { sharedmemsize += sizeof(float) * cfg->srcnum; } @@ -332,8 +335,8 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo cfg->exportseed = (unsigned char*)malloc(cfg->maxdetphoton * (sizeof(RandType) * RAND_BUF_LEN)); } - cfg->energytot = 0.f; - cfg->energyesc = 0.f; + cfg->energytot = (double*)calloc(cfg->srcnum, sizeof(double)); + cfg->energyesc = (double*)calloc(cfg->srcnum, sizeof(double)); cfg->runtime = 0; } #pragma omp barrier @@ -472,7 +475,7 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo *progress = 0; Pseed = (uint*)malloc(sizeof(uint) * gpu[gpuid].autothread * RAND_SEED_WORD_LEN); - energy = (float*)calloc(sizeof(float), gpu[gpuid].autothread << 1); + energy = (float*)calloc(sizeof(float) * cfg->srcnum, gpu[gpuid].autothread << 1); for (j = 0; j < gpu[gpuid].autothread * RAND_SEED_WORD_LEN; j++) { Pseed[j] = rand(); @@ -499,9 +502,9 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo cudaMemcpyHostToDevice)); CUDA_ASSERT(cudaMalloc((void**)&genergy, - sizeof(float) * (gpu[gpuid].autothread << 1))); + sizeof(float) * (gpu[gpuid].autothread << 1) * cfg->srcnum)); CUDA_ASSERT(cudaMemcpy(genergy, energy, - sizeof(float) * (gpu[gpuid].autothread << 1), + sizeof(float) * (gpu[gpuid].autothread << 1) * cfg->srcnum, cudaMemcpyHostToDevice)); CUDA_ASSERT(cudaMalloc((void**)&gdetected, sizeof(uint))); @@ -591,6 +594,9 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo MMC_FPRINTF(cfg->flog, "lauching mcx_main_loop for time window [%.1fns %.1fns] ...\n", twindow0 * 1e9, twindow1 * 1e9); + + MMC_FPRINTF(cfg->flog, "requesting %ld bytes of shared memory\n", sharedmemsize); + mcx_fflush(cfg->flog); // total number of repetition for the simulations, results will be @@ -650,16 +656,21 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo reporter.raytet += rep.raytet; reporter.jumpdebug += rep.jumpdebug; - energy = (float*)calloc(sizeof(float), gpu[gpuid].autothread << 1); + energy = (float*)calloc(sizeof(float) * cfg->srcnum, gpu[gpuid].autothread << 1); CUDA_ASSERT(cudaMemcpy(energy, genergy, - sizeof(float) * (gpu[gpuid].autothread << 1), + sizeof(float) * (gpu[gpuid].autothread << 1) * cfg->srcnum, cudaMemcpyDeviceToHost)); #pragma omp critical { + for (i = 0; i < gpu[gpuid].autothread; i++) { - cfg->energyesc += energy[(i << 1)]; - cfg->energytot += energy[(i << 1) + 1]; + for (j = 0; j < (uint) cfg->srcnum; j++) { + cfg->energyesc[j] += energy[(i << 1) * cfg->srcnum + j]; + cfg->energytot[j] += energy[((i << 1) + 1) * cfg->srcnum + j]; + energyesc += energy[(i << 1) * cfg->srcnum + j]; + energytot += energy[((i << 1) + 1) * cfg->srcnum + j]; + } } } @@ -784,7 +795,7 @@ are more than what your have specified (%d), please use the --maxjumpdebug optio #pragma omp master { - int i, j; + int i, j, srcid; if (cfg->exportfield) { if (cfg->basisorder == 0 || cfg->method == rtBLBadouelGrid) { @@ -792,14 +803,18 @@ are more than what your have specified (%d), please use the --maxjumpdebug optio #pragma omp atomic cfg->exportfield[i] += field[i]; } else { - for (i = 0; i < cfg->maxgate; i++) + for (i = 0; i < cfg->maxgate; i++) { for (j = 0; j < mesh->ne; j++) { - float ww = field[i * mesh->ne + j] * 0.25f; + for (srcid = 0; srcid < cfg->srcnum; srcid++) { + float ww = field[(i * mesh->ne + j) * cfg->srcnum + srcid] * 0.25f; + int k; - for (int k = 0; k < mesh->elemlen; k++) - cfg->exportfield[i * mesh->nn + - mesh->elem[j * mesh->elemlen + k] - 1] += ww; + for (k = 0; k < mesh->elemlen; k++) { + cfg->exportfield[(i * mesh->nn + mesh->elem[j * mesh->elemlen + k] - 1) * cfg->srcnum + srcid] += ww; + } + } } + } } } @@ -810,11 +825,17 @@ are more than what your have specified (%d), please use the --maxjumpdebug optio } if (cfg->isnormalized) { - MMC_FPRINTF(cfg->flog, "normalizing raw data ...\t"); - mcx_fflush(cfg->flog); + double cur_normalizer, sum_normalizer = 0.0, energyabs = 0.0; + + for (j = 0; j < cfg->srcnum; j++) { + energyabs = cfg->energytot[j] - cfg->energyesc[j]; + cur_normalizer = mesh_normalize(mesh, cfg, energyabs, cfg->energytot[j], j); + sum_normalizer += cur_normalizer; + MMCDEBUG(cfg, dlTime, (cfg->flog, "source %d\ttotal simulated energy: %f\tabsorbed: " S_BOLD "" S_BLUE "%5.5f%%" S_RESET "\tnormalizor=%g\n", + j + 1, cfg->energytot[j], 100.f * energyabs / cfg->energytot[j], cur_normalizer)); + } - cfg->energyabs = cfg->energytot - cfg->energyesc; - mesh_normalize(mesh, cfg, cfg->energyabs, cfg->energytot, 0); + cfg->his.normalizer = sum_normalizer / cfg->srcnum; // average normalizer value for all simulated sources } #ifndef MCX_CONTAINER @@ -868,8 +889,8 @@ are more than what your have specified (%d), please use the --maxjumpdebug optio MMC_FPRINTF(cfg->flog, "total simulated energy: %.2f\tabsorbed: %5.5f%%\n(loss due to " "initial specular reflection is excluded in the total)\n", - cfg->energytot, - (cfg->energytot - cfg->energyesc) / cfg->energytot * 100.f); + energytot, + (energytot - energyesc) / energytot * 100.f); mcx_fflush(cfg->flog); } #pragma omp barrier @@ -913,9 +934,15 @@ are more than what your have specified (%d), please use the --maxjumpdebug optio CUDA_ASSERT(cudaFree(greporter)); #pragma omp master + { + if (gpu) { + free(gpu); + } - if (gpu) { - free(gpu); + free(cfg->energytot); + free(cfg->energyesc); + cfg->energytot = NULL; + cfg->energyesc = NULL; } free(field); diff --git a/src/mmc_utils.c b/src/mmc_utils.c index 404a149a..e87fb095 100644 --- a/src/mmc_utils.c +++ b/src/mmc_utils.c @@ -321,9 +321,8 @@ void mcx_initcfg(mcconfig* cfg) { cfg->exportdetected = NULL; cfg->exportseed = NULL; cfg->detectedcount = 0; - cfg->energytot = 0.f; - cfg->energyabs = 0.f; - cfg->energyesc = 0.f; + cfg->energytot = NULL; + cfg->energyesc = NULL; cfg->runtime = 0; cfg->autopilot = 1; cfg->gpuid = 0; @@ -421,6 +420,14 @@ void mcx_clearcfg(mcconfig* cfg) { free(cfg->roidata); } + if (cfg->energytot) { + free(cfg->energytot); + } + + if (cfg->energyesc) { + free(cfg->energyesc); + } + #ifndef MCX_EMBED_CL if (cfg->clsource && cfg->clsource != (char*)mmc_core_cl) { @@ -1582,6 +1589,8 @@ int mcx_loadjson(cJSON* root, mcconfig* cfg) { if (ndim == 3 && dims[2] > 1 && dims[0] > 1 && cfg->srctype == MCX_SRC_PATTERN) { cfg->srcnum = dims[0]; } + + mcx_convertrow2col(cfg->srcpattern, (uint3*)dims); } else { int nx = FIND_JSON_KEY("Nx", "Optode.Source.Pattern.Nx", subitem, 0, valueint); int ny = FIND_JSON_KEY("Ny", "Optode.Source.Pattern.Ny", subitem, 0, valueint); @@ -2776,6 +2785,35 @@ int mcx_isbinstr(const char* str) { return 1; } +/** + * @brief Convert a row-major (C/C++) array to a column-major (MATLAB/FORTRAN) array + * + * @param[in,out] vol: a 3D array (wrapped in 1D) to be converted + * @param[in] dim: the dimensions of the 3D array + */ + +void mcx_convertrow2col(float* vol, uint3* dim) { + uint x, y, z; + unsigned int dimxy, dimyz; + float* newvol = NULL; + + if (vol == NULL || dim->x == 0 || dim->y == 0 || dim->z == 0) { + return; + } + + newvol = (float*)malloc(sizeof(float) * dim->x * dim->y * dim->z); + dimxy = dim->x * dim->y; + dimyz = dim->y * dim->z; + + for (x = 0; x < dim->x; x++) + for (y = 0; y < dim->y; y++) + for (z = 0; z < dim->z; z++) { + newvol[z * dimxy + y * dim->x + x] = vol[x * dimyz + y * dim->z + z]; + } + + memcpy(vol, newvol, sizeof(float) * dim->x * dim->y * dim->z); + free(newvol); +} /** * @brief Validate all input fields, and warn incompatible inputs diff --git a/src/mmc_utils.h b/src/mmc_utils.h index 0e0487ac..ffd7aa23 100644 --- a/src/mmc_utils.h +++ b/src/mmc_utils.h @@ -276,7 +276,8 @@ typedef struct MMC_config { unsigned char* exportseed; /*memory buffer when returning the RNG seed to matlab*/ float* exportdetected; /*memory buffer when returning the partial length info to external programs such as matlab*/ float* exportdebugdata; /**