diff --git a/mmclab/mmc2json.m b/mmclab/mmc2json.m index 7f5d821..90f1839 100644 --- a/mmclab/mmc2json.m +++ b/mmclab/mmc2json.m @@ -13,9 +13,6 @@ % mcxpreview supports the cfg input for both mcxlab and mmclab. % filestub: the filestub is the name stub for all output files,including % filestub.json: the JSON input file -% filestub_vol.bin: the volume file if cfg.vol is defined -% filestub_shapes.json: the domain shape file if cfg.shapes is defined -% filestub_pattern.bin: the domain shape file if cfg.pattern is defined % % if filestub ends with '.json', then mmc2json saves the mesh data % in the single-file mode, and the mesh information is stored @@ -58,16 +55,7 @@ end end if (isfield(cfg, 'srcpattern') && ~isempty(cfg.srcpattern)) - Optode.Source.Pattern.Nx = size(cfg.srcpattern, 1); - Optode.Source.Pattern.Ny = size(cfg.srcpattern, 2); - Optode.Source.Pattern.Nz = size(cfg.srcpattern, 3); - Optode.Source.Pattern.Data = single(cfg.srcpattern); - if (~singlefile) - Optode.Source.Pattern.Data = [filestub '_pattern.bin']; - fid = fopen(Optode.Source.Pattern.Data, 'wb'); - fwrite(fid, cfg.srcpattern, 'float32'); - fclose(fid); - end + Optode.Source.Pattern = single(cfg.srcpattern); end %% define the domain and optical properties diff --git a/src/mmc_cl_host.c b/src/mmc_cl_host.c index 8dc0a1b..b98f30a 100644 --- a/src/mmc_cl_host.c +++ b/src/mmc_cl_host.c @@ -92,8 +92,8 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) { cl_mem* gweight, *gdref, *gdetphoton, *gseed, *genergy, *greporter, *gdebugdata; /*read-write buffers*/ cl_mem* gprogress = NULL, *gdetected = NULL, *gphotonseed = NULL; /*write-only buffers*/ - cl_uint meshlen = ((cfg->method == rtBLBadouelGrid) ? cfg->crop0.z : mesh->ne) * cfg->srcnum; - cfg->crop0.w = meshlen * cfg->maxgate; // offset for the second buffer + cl_uint meshlen = ((cfg->method == rtBLBadouelGrid) ? cfg->crop0.z : mesh->ne) * cfg->srcnum; /**< total output data length in float count per time-frame */ + cfg->crop0.w = meshlen * cfg->maxgate; /**< total output data length, before double-buffer expansion */ cl_float* field, *dref = NULL; @@ -107,6 +107,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) { GPUInfo* gpu = NULL; float4* propdet; + double energytot = 0.0, energyesc = 0.0; MCXParam param = {{{cfg->srcpos.x, cfg->srcpos.y, cfg->srcpos.z}}, {{cfg->srcdir.x, cfg->srcdir.y, cfg->srcdir.z}}, cfg->tstart, cfg->tend, (uint)cfg->isreflect, (uint)cfg->issavedet, (uint)cfg->issaveexit, @@ -124,7 +125,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) { mesh->nn, mesh->ne, mesh->nf, {{mesh->nmin.x, mesh->nmin.y, mesh->nmin.z}}, cfg->nout, cfg->roulettesize, cfg->srcnum, {{cfg->crop0.x, cfg->crop0.y, cfg->crop0.z, cfg->crop0.w}}, mesh->srcelemlen, {{cfg->bary0.x, cfg->bary0.y, cfg->bary0.z, cfg->bary0.w}}, - cfg->e0, cfg->isextdet, meshlen, (mesh->prop + 1 + cfg->isextdet) + cfg->detnum, + cfg->e0, cfg->isextdet, (meshlen / cfg->srcnum), (mesh->prop + 1 + cfg->isextdet) + cfg->detnum, (MIN((MAX_PROP - param.maxpropdet), ((mesh->ne) << 2)) >> 2), /*max count of elem normal data in const mem*/ cfg->issaveseed, cfg->seed, cfg->maxjumpdebug }; @@ -146,6 +147,8 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) { param.reclen = sharedmemsize / sizeof(float); + sharedmemsize += sizeof(cl_float) * (cfg->srcnum << 1); /**< store energyesc/energytot */ + if (cfg->srctype == stPattern && cfg->srcnum > 1) { sharedmemsize += sizeof(cl_float) * cfg->srcnum; } @@ -256,7 +259,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) { Pphotonseed = (RandType*)calloc(cfg->maxdetphoton, (sizeof(RandType) * RAND_BUF_LEN)); } - fieldlen = meshlen * cfg->maxgate; + fieldlen = cfg->crop0.w; /**< total float counts of the output buffer, before double-buffer expansion (x2) for improving saving accuracy */ if (cfg->seed > 0) { srand(cfg->seed); @@ -310,7 +313,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) { OCL_ASSERT(((gparam[i] = clCreateBuffer(mcxcontext, RO_MEM, sizeof(MCXParam), ¶m, &status), status))); Pseed = (cl_uint*)malloc(sizeof(cl_uint) * gpu[i].autothread * RAND_SEED_WORD_LEN); - energy = (cl_float*)calloc(sizeof(cl_float), gpu[i].autothread << 1); + energy = (cl_float*)calloc(sizeof(cl_float) * cfg->srcnum, gpu[i].autothread << 1); for (j = 0; j < gpu[i].autothread * RAND_SEED_WORD_LEN; j++) { Pseed[j] = rand(); @@ -331,7 +334,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) { OCL_ASSERT(((gdebugdata[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(float) * (debuglen * cfg->maxjumpdebug), cfg->exportdebugdata, &status), status))); } - OCL_ASSERT(((genergy[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(float) * (gpu[i].autothread << 1), energy, &status), status))); + OCL_ASSERT(((genergy[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(float) * (gpu[i].autothread << 1) * cfg->srcnum, energy, &status), status))); OCL_ASSERT(((gdetected[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(cl_uint), &detected, &status), status))); OCL_ASSERT(((greporter[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(MCXReporter), &reporter, &status), status))); @@ -509,6 +512,8 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) { MMC_FPRINTF(cfg->flog, "- [device %d(%d): %s] threadph=%d oddphotons=%d np=%.1f nthread=%d nblock=%d repetition=%d\n", i, gpu[i].id, gpu[i].name, threadphoton, oddphotons, cfg->nphoton * cfg->workload[i] / fullload, (int)gpu[i].autothread, (int)gpu[i].autoblock, cfg->respin); + MMC_FPRINTF(cfg->flog, "requesting %d bytes of shared memory\n", sharedmemsize * (int)gpu[i].autoblock); + OCL_ASSERT(((mcxkernel[i] = clCreateKernel(mcxprogram, "mmc_main_loop", &status), status))); OCL_ASSERT((clSetKernelArg(mcxkernel[i], 0, sizeof(cl_uint), (void*)&threadphoton))); OCL_ASSERT((clSetKernelArg(mcxkernel[i], 1, sizeof(cl_uint), (void*)&oddphotons))); @@ -556,8 +561,10 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) { cfg->exportdebugdata = (float*)calloc(sizeof(float), (debuglen * cfg->maxjumpdebug)); } - cfg->energytot = 0.f; - cfg->energyesc = 0.f; + cfg->energytot = (double*)calloc(cfg->srcnum, sizeof(double)); + cfg->energyesc = (double*)calloc(cfg->srcnum, sizeof(double)); + energytot = 0.0; + energyesc = 0.0; cfg->runtime = 0; //simulate for all time-gates in maxgate groups per run @@ -636,13 +643,17 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) { reporter.raytet += rep.raytet; reporter.jumpdebug += rep.jumpdebug; - energy = (cl_float*)calloc(sizeof(cl_float), gpu[devid].autothread << 1); - OCL_ASSERT((clEnqueueReadBuffer(mcxqueue[devid], genergy[devid], CL_TRUE, 0, sizeof(cl_float) * (gpu[devid].autothread << 1), + energy = (cl_float*)calloc(sizeof(cl_float) * cfg->srcnum, gpu[devid].autothread << 1); + OCL_ASSERT((clEnqueueReadBuffer(mcxqueue[devid], genergy[devid], CL_TRUE, 0, sizeof(cl_float) * (gpu[devid].autothread << 1) * cfg->srcnum, energy, 0, NULL, NULL))); for (i = 0; i < gpu[devid].autothread; i++) { - cfg->energyesc += energy[(i << 1)]; - cfg->energytot += energy[(i << 1) + 1]; + for (j = 0; j < cfg->srcnum; j++) { + cfg->energyesc[j] += energy[(i << 1) * cfg->srcnum + j]; + cfg->energytot[j] += energy[((i << 1) + 1) * cfg->srcnum + j]; + energyesc += energy[(i << 1) * cfg->srcnum + j]; + energytot += energy[((i << 1) + 1) * cfg->srcnum + j]; + } } free(energy); @@ -707,6 +718,7 @@ is more than what your have specified (%d), please use the -H option to specify OCL_ASSERT((clEnqueueReadBuffer(mcxqueue[devid], gdref[devid], CL_TRUE, 0, sizeof(float)*nflen, rawdref, 0, NULL, NULL))); + //TODO: saving dref has not yet adopting double-buffer for (i = 0; i < nflen; i++) { //accumulate field, can be done in the GPU dref[i] += rawdref[i]; //+rawfield[i+fieldlen]; } @@ -724,7 +736,7 @@ is more than what your have specified (%d), please use the -H option to specify mcx_fflush(cfg->flog); for (i = 0; i < fieldlen; i++) { //accumulate field, can be done in the GPU - field[i] += rawfield[i] + rawfield[i + fieldlen]; //+rawfield[i+fieldlen]; + field[i] += rawfield[i] + rawfield[i + fieldlen]; } free(rawfield); @@ -754,15 +766,20 @@ is more than what your have specified (%d), please use the -H option to specify cfg->exportfield[i] += field[i]; } } else { - for (i = 0; i < cfg->maxgate; i++) + int srcid; + + for (i = 0; i < cfg->maxgate; i++) { for (j = 0; j < mesh->ne; j++) { - float ww = field[i * mesh->ne + j] * 0.25f; - int k; + for (srcid = 0; srcid < cfg->srcnum; srcid++) { + float ww = field[(i * mesh->ne + j) * cfg->srcnum + srcid] * 0.25f; + int k; - for (k = 0; k < mesh->elemlen; k++) { - cfg->exportfield[i * mesh->nn + mesh->elem[j * mesh->elemlen + k] - 1] += ww; + for (k = 0; k < mesh->elemlen; k++) { + cfg->exportfield[(i * mesh->nn + mesh->elem[j * mesh->elemlen + k] - 1) * cfg->srcnum + srcid] += ww; + } } } + } } } @@ -773,11 +790,17 @@ is more than what your have specified (%d), please use the -H option to specify } if (cfg->isnormalized) { - MMC_FPRINTF(cfg->flog, "normalizing raw data ...\t"); - mcx_fflush(cfg->flog); + double cur_normalizer, sum_normalizer = 0.0, energyabs = 0.0; + + for (j = 0; j < cfg->srcnum; j++) { + energyabs = cfg->energytot[j] - cfg->energyesc[j]; + cur_normalizer = mesh_normalize(mesh, cfg, energyabs, cfg->energytot[j], j); + sum_normalizer += cur_normalizer; + MMCDEBUG(cfg, dlTime, (cfg->flog, "source %d\ttotal simulated energy: %f\tabsorbed: "S_BOLD""S_BLUE"%5.5f%%"S_RESET"\tnormalizor=%g\n", + j + 1, cfg->energytot[j], 100.f * energyabs / cfg->energytot[j], cur_normalizer)); + } - cfg->energyabs = cfg->energytot - cfg->energyesc; - mesh_normalize(mesh, cfg, cfg->energyabs, cfg->energytot, 0); + cfg->his.normalizer = sum_normalizer / cfg->srcnum; // average normalizer value for all simulated sources } #ifndef MCX_CONTAINER @@ -817,7 +840,7 @@ is more than what your have specified (%d), please use the -H option to specify MMC_FPRINTF(cfg->flog, "simulated %zu photons (%zu) with %d devices (ray-tet %.0f)\nMCX simulation speed: %.2f photon/ms\n", cfg->nphoton, cfg->nphoton, workdev, reporter.raytet, (double)cfg->nphoton / toc); MMC_FPRINTF(cfg->flog, "total simulated energy: %.2f\tabsorbed: %5.5f%%\n(loss due to initial specular reflection is excluded in the total)\n", - cfg->energytot, (cfg->energytot - cfg->energyesc) / cfg->energytot * 100.f); + energytot, (energytot - energyesc) / energytot * 100.f); mcx_fflush(cfg->flog); OCL_ASSERT(clReleaseMemObject(gprogress[0])); @@ -899,6 +922,10 @@ is more than what your have specified (%d), please use the -H option to specify free(mcxkernel); free(waittoread); + free(cfg->energytot); + free(cfg->energyesc); + cfg->energytot = NULL; + cfg->energyesc = NULL; if (gpu) { free(gpu); diff --git a/src/mmc_core.cl b/src/mmc_core.cl index 2f95201..0bc30ba 100644 --- a/src/mmc_core.cl +++ b/src/mmc_core.cl @@ -47,6 +47,9 @@ inline __device__ __host__ int get_global_id(int idx) { inline __device__ __host__ int get_local_id(int idx) { return (idx == 0) ? threadIdx.x : ( (idx == 1) ? threadIdx.y : threadIdx.z ); } +inline __device__ __host__ int get_local_size(int idx) { + return (idx == 0) ? blockDim.x : ( (idx == 1) ? blockDim.y : blockDim.z ); +} inline __device__ __host__ float3 operator *(float3 a, float3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); } @@ -480,7 +483,6 @@ __device__ inline float atomicadd(volatile __global float* address, const float #endif -#ifdef MCX_SAVE_DETECTORS __device__ void clearpath(__local float* p, int len) { uint i; @@ -489,6 +491,7 @@ __device__ void clearpath(__local float* p, int len) { } } +#ifdef MCX_SAVE_DETECTORS __device__ uint finddetector(float3* p0, __constant float4* gmed, __constant MCXParam* gcfg) { uint i; @@ -1443,7 +1446,7 @@ __device__ void launchnewphoton(__constant MCXParam* gcfg, ray* r, __global floa __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXParam* gcfg, __global float3* node, __global int* elem, __global float* weight, __global float* dref, __global int* type, __global int* facenb, __global int* srcelem, __global float4* normal, __constant Medium* gmed, - __global float* n_det, __global uint* detectedphoton, float* energytot, float* energyesc, __private RandType* ran, int* raytet, __global float* srcpattern, + __global float* n_det, __global uint* detectedphoton, __local float* energytot, __local float* energyesc, __private RandType* ran, int* raytet, __global float* srcpattern, __global float* replayweight, __global float* replaytime, __global RandType* photonseed, __global MCXReporter* reporter, __global float* gdebugdata) { int oldeid, fixcount = 0; @@ -1464,12 +1467,10 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP /*initialize the photon parameters*/ launchnewphoton(gcfg, &r, node, elem, srcelem, ran, srcpattern); - *energytot += r.weight; + #ifdef MCX_SAVE_DETECTORS if (GPU_PARAM(gcfg, issavedet)) { - clearpath(ppath, GPU_PARAM(gcfg, reclen)); /*clear shared memory for saving history of a new photon*/ - if (GPU_PARAM(gcfg, srctype) != stPattern || GPU_PARAM(gcfg, srcnum) == 1) { ppath[GPU_PARAM(gcfg, reclen) - 1] = r.weight; /*last record in partialpath is the initial photon weight*/ } else if (GPU_PARAM(gcfg, srctype) == stPattern) { @@ -1479,9 +1480,12 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP #endif - if (GPU_PARAM(gcfg, srctype) == stPattern && GPU_PARAM(gcfg, srcnum) > 1) { - for (oldeid = 0; oldeid > GPU_PARAM(gcfg, srcnum); oldeid++) { + if (GPU_PARAM(gcfg, srcnum) == 1) { + *energytot += r.weight; + } else { + for (oldeid = 0; oldeid < GPU_PARAM(gcfg, srcnum); oldeid++) { ppath[GPU_PARAM(gcfg, reclen) + oldeid] = srcpattern[r.posidx * GPU_PARAM(gcfg, srcnum) + oldeid]; + energytot[oldeid] += r.weight * ppath[GPU_PARAM(gcfg, reclen) + oldeid]; } } @@ -1700,7 +1704,9 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP savedebugdata(&r, id, reporter, gdebugdata, gcfg); } - *energyesc += r.weight; + for (oldeid = 0; oldeid < GPU_PARAM(gcfg, srcnum); oldeid++) { + energyesc[oldeid] += r.weight * ppath[GPU_PARAM(gcfg, reclen) + oldeid]; + } } __kernel void mmc_main_loop(const int nphoton, const int ophoton, @@ -1714,7 +1720,6 @@ __kernel void mmc_main_loop(const int nphoton, const int ophoton, RandType t[RAND_BUF_LEN]; int idx = get_global_id(0); - float energyesc = 0.f, energytot = 0.f; int raytet = 0; #ifdef __NVCC__ @@ -1725,6 +1730,9 @@ __kernel void mmc_main_loop(const int nphoton, const int ophoton, gpu_rng_init(t, n_seed, idx); } + clearpath(sharedmem, get_local_size(0) * ((GPU_PARAM(gcfg, srcnum) << 1) + + (GPU_PARAM(gcfg, reclen) + (GPU_PARAM(gcfg, srcnum) > 1) * GPU_PARAM(gcfg, srcnum)))); + /*launch photons*/ for (int i = 0; i < nphoton + (idx < ophoton); i++) { if (GPU_PARAM(gcfg, seed) == SEED_FROM_FILE) @@ -1732,14 +1740,17 @@ __kernel void mmc_main_loop(const int nphoton, const int ophoton, t[j] = replayseed[(idx * nphoton + MIN(idx, ophoton) + i) * RAND_BUF_LEN + j]; } - onephoton(idx * nphoton + MIN(idx, ophoton) + i, sharedmem + get_local_id(0) * - (GPU_PARAM(gcfg, reclen) + (GPU_PARAM(gcfg, srcnum) > 1) * GPU_PARAM(gcfg, srcnum)), gcfg, node, elem, - weight, dref, type, facenb, srcelem, normal, gmed, n_det, detectedphoton, &energytot, &energyesc, t, &raytet, + onephoton(idx * nphoton + MIN(idx, ophoton) + i, sharedmem + get_local_size(0) * (GPU_PARAM(gcfg, srcnum) << 1) + + get_local_id(0) * (GPU_PARAM(gcfg, reclen) + (GPU_PARAM(gcfg, srcnum) > 1) * GPU_PARAM(gcfg, srcnum)), gcfg, node, elem, + weight, dref, type, facenb, srcelem, normal, gmed, n_det, detectedphoton, sharedmem + get_local_id(0) * GPU_PARAM(gcfg, srcnum), + sharedmem + (get_local_size(0) + get_local_id(0)) * GPU_PARAM(gcfg, srcnum), t, &raytet, srcpattern, replayweight, replaytime, photonseed, reporter, gdebugdata); } - energy[idx << 1] = energyesc; - energy[1 + (idx << 1)] = energytot; + for (int i = 0; i < GPU_PARAM(gcfg, srcnum); i++) { + energy[(idx << 1) * GPU_PARAM(gcfg, srcnum) + i] += sharedmem[(get_local_size(0) + get_local_id(0)) * GPU_PARAM(gcfg, srcnum) + i]; + energy[((idx << 1) + 1) * GPU_PARAM(gcfg, srcnum) + i] += sharedmem[get_local_id(0) * GPU_PARAM(gcfg, srcnum) + i]; + } if (GPU_PARAM(gcfg, debuglevel) & MCX_DEBUG_PROGRESS && progress) { atomic_inc(progress); diff --git a/src/mmc_cu_host.cu b/src/mmc_cu_host.cu index f14102d..d6290a9 100644 --- a/src/mmc_cu_host.cu +++ b/src/mmc_cu_host.cu @@ -238,6 +238,7 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo uint hostdetreclen = detreclen + 1; // launch mcxkernel size_t sharedmemsize = 0; + double energytot = 0.0, energyesc = 0.0; MCXParam param = {make_float3(cfg->srcpos.x, cfg->srcpos.y, cfg->srcpos.z), make_float3(cfg->srcdir.x, cfg->srcdir.y, cfg->srcdir.z), @@ -280,7 +281,7 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo cfg->bary0, cfg->e0, cfg->isextdet, - (int)meshlen, + (int)(meshlen / cfg->srcnum), (uint)(mesh->prop + 1 + cfg->isextdet) + cfg->detnum, (uint)(MIN((MAX_PROP - (mesh->prop + 1 + cfg->isextdet) - cfg->detnum), ((mesh->ne) << 2)) >> 2), /*max count of elem normal data in const mem*/ cfg->issaveseed, @@ -296,6 +297,8 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo param.reclen = sharedmemsize / sizeof(float); //< the shared memory buffer length associated with detected photon + sharedmemsize += sizeof(float) * (cfg->srcnum << 1); /**< store energyesc/energytot */ + if (cfg->srctype == stPattern && cfg->srcnum > 1) { sharedmemsize += sizeof(float) * cfg->srcnum; } @@ -332,8 +335,8 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo cfg->exportseed = (unsigned char*)malloc(cfg->maxdetphoton * (sizeof(RandType) * RAND_BUF_LEN)); } - cfg->energytot = 0.f; - cfg->energyesc = 0.f; + cfg->energytot = (double*)calloc(cfg->srcnum, sizeof(double)); + cfg->energyesc = (double*)calloc(cfg->srcnum, sizeof(double)); cfg->runtime = 0; } #pragma omp barrier @@ -472,7 +475,7 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo *progress = 0; Pseed = (uint*)malloc(sizeof(uint) * gpu[gpuid].autothread * RAND_SEED_WORD_LEN); - energy = (float*)calloc(sizeof(float), gpu[gpuid].autothread << 1); + energy = (float*)calloc(sizeof(float) * cfg->srcnum, gpu[gpuid].autothread << 1); for (j = 0; j < gpu[gpuid].autothread * RAND_SEED_WORD_LEN; j++) { Pseed[j] = rand(); @@ -499,9 +502,9 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo cudaMemcpyHostToDevice)); CUDA_ASSERT(cudaMalloc((void**)&genergy, - sizeof(float) * (gpu[gpuid].autothread << 1))); + sizeof(float) * (gpu[gpuid].autothread << 1) * cfg->srcnum)); CUDA_ASSERT(cudaMemcpy(genergy, energy, - sizeof(float) * (gpu[gpuid].autothread << 1), + sizeof(float) * (gpu[gpuid].autothread << 1) * cfg->srcnum, cudaMemcpyHostToDevice)); CUDA_ASSERT(cudaMalloc((void**)&gdetected, sizeof(uint))); @@ -591,6 +594,9 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo MMC_FPRINTF(cfg->flog, "lauching mcx_main_loop for time window [%.1fns %.1fns] ...\n", twindow0 * 1e9, twindow1 * 1e9); + + MMC_FPRINTF(cfg->flog, "requesting %ld bytes of shared memory\n", sharedmemsize); + mcx_fflush(cfg->flog); // total number of repetition for the simulations, results will be @@ -650,16 +656,21 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo reporter.raytet += rep.raytet; reporter.jumpdebug += rep.jumpdebug; - energy = (float*)calloc(sizeof(float), gpu[gpuid].autothread << 1); + energy = (float*)calloc(sizeof(float) * cfg->srcnum, gpu[gpuid].autothread << 1); CUDA_ASSERT(cudaMemcpy(energy, genergy, - sizeof(float) * (gpu[gpuid].autothread << 1), + sizeof(float) * (gpu[gpuid].autothread << 1) * cfg->srcnum, cudaMemcpyDeviceToHost)); #pragma omp critical { + for (i = 0; i < gpu[gpuid].autothread; i++) { - cfg->energyesc += energy[(i << 1)]; - cfg->energytot += energy[(i << 1) + 1]; + for (j = 0; j < (uint) cfg->srcnum; j++) { + cfg->energyesc[j] += energy[(i << 1) * cfg->srcnum + j]; + cfg->energytot[j] += energy[((i << 1) + 1) * cfg->srcnum + j]; + energyesc += energy[(i << 1) * cfg->srcnum + j]; + energytot += energy[((i << 1) + 1) * cfg->srcnum + j]; + } } } @@ -784,7 +795,7 @@ are more than what your have specified (%d), please use the --maxjumpdebug optio #pragma omp master { - int i, j; + int i, j, srcid; if (cfg->exportfield) { if (cfg->basisorder == 0 || cfg->method == rtBLBadouelGrid) { @@ -792,14 +803,18 @@ are more than what your have specified (%d), please use the --maxjumpdebug optio #pragma omp atomic cfg->exportfield[i] += field[i]; } else { - for (i = 0; i < cfg->maxgate; i++) + for (i = 0; i < cfg->maxgate; i++) { for (j = 0; j < mesh->ne; j++) { - float ww = field[i * mesh->ne + j] * 0.25f; + for (srcid = 0; srcid < cfg->srcnum; srcid++) { + float ww = field[(i * mesh->ne + j) * cfg->srcnum + srcid] * 0.25f; + int k; - for (int k = 0; k < mesh->elemlen; k++) - cfg->exportfield[i * mesh->nn + - mesh->elem[j * mesh->elemlen + k] - 1] += ww; + for (k = 0; k < mesh->elemlen; k++) { + cfg->exportfield[(i * mesh->nn + mesh->elem[j * mesh->elemlen + k] - 1) * cfg->srcnum + srcid] += ww; + } + } } + } } } @@ -810,11 +825,17 @@ are more than what your have specified (%d), please use the --maxjumpdebug optio } if (cfg->isnormalized) { - MMC_FPRINTF(cfg->flog, "normalizing raw data ...\t"); - mcx_fflush(cfg->flog); + double cur_normalizer, sum_normalizer = 0.0, energyabs = 0.0; + + for (j = 0; j < cfg->srcnum; j++) { + energyabs = cfg->energytot[j] - cfg->energyesc[j]; + cur_normalizer = mesh_normalize(mesh, cfg, energyabs, cfg->energytot[j], j); + sum_normalizer += cur_normalizer; + MMCDEBUG(cfg, dlTime, (cfg->flog, "source %d\ttotal simulated energy: %f\tabsorbed: " S_BOLD "" S_BLUE "%5.5f%%" S_RESET "\tnormalizor=%g\n", + j + 1, cfg->energytot[j], 100.f * energyabs / cfg->energytot[j], cur_normalizer)); + } - cfg->energyabs = cfg->energytot - cfg->energyesc; - mesh_normalize(mesh, cfg, cfg->energyabs, cfg->energytot, 0); + cfg->his.normalizer = sum_normalizer / cfg->srcnum; // average normalizer value for all simulated sources } #ifndef MCX_CONTAINER @@ -868,8 +889,8 @@ are more than what your have specified (%d), please use the --maxjumpdebug optio MMC_FPRINTF(cfg->flog, "total simulated energy: %.2f\tabsorbed: %5.5f%%\n(loss due to " "initial specular reflection is excluded in the total)\n", - cfg->energytot, - (cfg->energytot - cfg->energyesc) / cfg->energytot * 100.f); + energytot, + (energytot - energyesc) / energytot * 100.f); mcx_fflush(cfg->flog); } #pragma omp barrier @@ -913,9 +934,15 @@ are more than what your have specified (%d), please use the --maxjumpdebug optio CUDA_ASSERT(cudaFree(greporter)); #pragma omp master + { + if (gpu) { + free(gpu); + } - if (gpu) { - free(gpu); + free(cfg->energytot); + free(cfg->energyesc); + cfg->energytot = NULL; + cfg->energyesc = NULL; } free(field); diff --git a/src/mmc_utils.c b/src/mmc_utils.c index 404a149..e87fb09 100644 --- a/src/mmc_utils.c +++ b/src/mmc_utils.c @@ -321,9 +321,8 @@ void mcx_initcfg(mcconfig* cfg) { cfg->exportdetected = NULL; cfg->exportseed = NULL; cfg->detectedcount = 0; - cfg->energytot = 0.f; - cfg->energyabs = 0.f; - cfg->energyesc = 0.f; + cfg->energytot = NULL; + cfg->energyesc = NULL; cfg->runtime = 0; cfg->autopilot = 1; cfg->gpuid = 0; @@ -421,6 +420,14 @@ void mcx_clearcfg(mcconfig* cfg) { free(cfg->roidata); } + if (cfg->energytot) { + free(cfg->energytot); + } + + if (cfg->energyesc) { + free(cfg->energyesc); + } + #ifndef MCX_EMBED_CL if (cfg->clsource && cfg->clsource != (char*)mmc_core_cl) { @@ -1582,6 +1589,8 @@ int mcx_loadjson(cJSON* root, mcconfig* cfg) { if (ndim == 3 && dims[2] > 1 && dims[0] > 1 && cfg->srctype == MCX_SRC_PATTERN) { cfg->srcnum = dims[0]; } + + mcx_convertrow2col(cfg->srcpattern, (uint3*)dims); } else { int nx = FIND_JSON_KEY("Nx", "Optode.Source.Pattern.Nx", subitem, 0, valueint); int ny = FIND_JSON_KEY("Ny", "Optode.Source.Pattern.Ny", subitem, 0, valueint); @@ -2776,6 +2785,35 @@ int mcx_isbinstr(const char* str) { return 1; } +/** + * @brief Convert a row-major (C/C++) array to a column-major (MATLAB/FORTRAN) array + * + * @param[in,out] vol: a 3D array (wrapped in 1D) to be converted + * @param[in] dim: the dimensions of the 3D array + */ + +void mcx_convertrow2col(float* vol, uint3* dim) { + uint x, y, z; + unsigned int dimxy, dimyz; + float* newvol = NULL; + + if (vol == NULL || dim->x == 0 || dim->y == 0 || dim->z == 0) { + return; + } + + newvol = (float*)malloc(sizeof(float) * dim->x * dim->y * dim->z); + dimxy = dim->x * dim->y; + dimyz = dim->y * dim->z; + + for (x = 0; x < dim->x; x++) + for (y = 0; y < dim->y; y++) + for (z = 0; z < dim->z; z++) { + newvol[z * dimxy + y * dim->x + x] = vol[x * dimyz + y * dim->z + z]; + } + + memcpy(vol, newvol, sizeof(float) * dim->x * dim->y * dim->z); + free(newvol); +} /** * @brief Validate all input fields, and warn incompatible inputs diff --git a/src/mmc_utils.h b/src/mmc_utils.h index 0e0487a..ffd7aa2 100644 --- a/src/mmc_utils.h +++ b/src/mmc_utils.h @@ -276,7 +276,8 @@ typedef struct MMC_config { unsigned char* exportseed; /*memory buffer when returning the RNG seed to matlab*/ float* exportdetected; /*memory buffer when returning the partial length info to external programs such as matlab*/ float* exportdebugdata; /**