diff --git a/mmclab/mmc2json.m b/mmclab/mmc2json.m
index 7f5d821..90f1839 100644
--- a/mmclab/mmc2json.m
+++ b/mmclab/mmc2json.m
@@ -13,9 +13,6 @@
 %         mcxpreview supports the cfg input for both mcxlab and mmclab.
 %    filestub: the filestub is the name stub for all output files,including
 %         filestub.json: the JSON input file
-%         filestub_vol.bin: the volume file if cfg.vol is defined
-%         filestub_shapes.json: the domain shape file if cfg.shapes is defined
-%         filestub_pattern.bin: the domain shape file if cfg.pattern is defined
 %
 %         if filestub ends with '.json', then mmc2json saves the mesh data
 %         in the single-file mode, and the mesh information is stored
@@ -58,16 +55,7 @@
     end
 end
 if (isfield(cfg, 'srcpattern') && ~isempty(cfg.srcpattern))
-    Optode.Source.Pattern.Nx = size(cfg.srcpattern, 1);
-    Optode.Source.Pattern.Ny = size(cfg.srcpattern, 2);
-    Optode.Source.Pattern.Nz = size(cfg.srcpattern, 3);
-    Optode.Source.Pattern.Data = single(cfg.srcpattern);
-    if (~singlefile)
-        Optode.Source.Pattern.Data = [filestub '_pattern.bin'];
-        fid = fopen(Optode.Source.Pattern.Data, 'wb');
-        fwrite(fid, cfg.srcpattern, 'float32');
-        fclose(fid);
-    end
+    Optode.Source.Pattern = single(cfg.srcpattern);
 end
 
 %% define the domain and optical properties
diff --git a/src/mmc_cl_host.c b/src/mmc_cl_host.c
index 8dc0a1b..b98f30a 100644
--- a/src/mmc_cl_host.c
+++ b/src/mmc_cl_host.c
@@ -92,8 +92,8 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
     cl_mem* gweight, *gdref, *gdetphoton, *gseed, *genergy, *greporter, *gdebugdata;     /*read-write buffers*/
     cl_mem* gprogress = NULL, *gdetected = NULL, *gphotonseed = NULL; /*write-only buffers*/
 
-    cl_uint meshlen = ((cfg->method == rtBLBadouelGrid) ? cfg->crop0.z : mesh->ne) * cfg->srcnum;
-    cfg->crop0.w = meshlen * cfg->maxgate; // offset for the second buffer
+    cl_uint meshlen = ((cfg->method == rtBLBadouelGrid) ? cfg->crop0.z : mesh->ne) * cfg->srcnum;    /**< total output data length in float count per time-frame */
+    cfg->crop0.w = meshlen * cfg->maxgate;    /**< total output data length, before double-buffer expansion */
 
     cl_float*  field, *dref = NULL;
 
@@ -107,6 +107,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
 
     GPUInfo* gpu = NULL;
     float4* propdet;
+    double energytot = 0.0, energyesc = 0.0;
 
     MCXParam param = {{{cfg->srcpos.x, cfg->srcpos.y, cfg->srcpos.z}}, {{cfg->srcdir.x, cfg->srcdir.y, cfg->srcdir.z}},
         cfg->tstart, cfg->tend, (uint)cfg->isreflect, (uint)cfg->issavedet, (uint)cfg->issaveexit,
@@ -124,7 +125,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
         mesh->nn, mesh->ne, mesh->nf, {{mesh->nmin.x, mesh->nmin.y, mesh->nmin.z}}, cfg->nout,
         cfg->roulettesize, cfg->srcnum, {{cfg->crop0.x, cfg->crop0.y, cfg->crop0.z, cfg->crop0.w}},
         mesh->srcelemlen, {{cfg->bary0.x, cfg->bary0.y, cfg->bary0.z, cfg->bary0.w}},
-        cfg->e0, cfg->isextdet, meshlen, (mesh->prop + 1 + cfg->isextdet) + cfg->detnum,
+        cfg->e0, cfg->isextdet, (meshlen / cfg->srcnum), (mesh->prop + 1 + cfg->isextdet) + cfg->detnum,
         (MIN((MAX_PROP - param.maxpropdet), ((mesh->ne) << 2)) >> 2), /*max count of elem normal data in const mem*/
         cfg->issaveseed, cfg->seed, cfg->maxjumpdebug
     };
@@ -146,6 +147,8 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
 
     param.reclen = sharedmemsize / sizeof(float);
 
+    sharedmemsize += sizeof(cl_float) * (cfg->srcnum << 1);   /**< store energyesc/energytot */
+
     if (cfg->srctype == stPattern && cfg->srcnum > 1) {
         sharedmemsize += sizeof(cl_float) * cfg->srcnum;
     }
@@ -256,7 +259,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
         Pphotonseed = (RandType*)calloc(cfg->maxdetphoton, (sizeof(RandType) * RAND_BUF_LEN));
     }
 
-    fieldlen = meshlen * cfg->maxgate;
+    fieldlen = cfg->crop0.w;  /**< total float counts of the output buffer, before double-buffer expansion (x2) for improving saving accuracy */
 
     if (cfg->seed > 0) {
         srand(cfg->seed);
@@ -310,7 +313,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
         OCL_ASSERT(((gparam[i] = clCreateBuffer(mcxcontext, RO_MEM, sizeof(MCXParam), &param, &status), status)));
 
         Pseed = (cl_uint*)malloc(sizeof(cl_uint) * gpu[i].autothread * RAND_SEED_WORD_LEN);
-        energy = (cl_float*)calloc(sizeof(cl_float), gpu[i].autothread << 1);
+        energy = (cl_float*)calloc(sizeof(cl_float) * cfg->srcnum, gpu[i].autothread << 1);
 
         for (j = 0; j < gpu[i].autothread * RAND_SEED_WORD_LEN; j++) {
             Pseed[j] = rand();
@@ -331,7 +334,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
             OCL_ASSERT(((gdebugdata[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(float) * (debuglen * cfg->maxjumpdebug), cfg->exportdebugdata, &status), status)));
         }
 
-        OCL_ASSERT(((genergy[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(float) * (gpu[i].autothread << 1), energy, &status), status)));
+        OCL_ASSERT(((genergy[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(float) * (gpu[i].autothread << 1) * cfg->srcnum, energy, &status), status)));
         OCL_ASSERT(((gdetected[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(cl_uint), &detected, &status), status)));
         OCL_ASSERT(((greporter[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(MCXReporter), &reporter, &status), status)));
 
@@ -509,6 +512,8 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
         MMC_FPRINTF(cfg->flog, "- [device %d(%d): %s] threadph=%d oddphotons=%d np=%.1f nthread=%d nblock=%d repetition=%d\n", i, gpu[i].id, gpu[i].name, threadphoton, oddphotons,
                     cfg->nphoton * cfg->workload[i] / fullload, (int)gpu[i].autothread, (int)gpu[i].autoblock, cfg->respin);
 
+        MMC_FPRINTF(cfg->flog, "requesting %d bytes of shared memory\n", sharedmemsize * (int)gpu[i].autoblock);
+
         OCL_ASSERT(((mcxkernel[i] = clCreateKernel(mcxprogram, "mmc_main_loop", &status), status)));
         OCL_ASSERT((clSetKernelArg(mcxkernel[i], 0, sizeof(cl_uint), (void*)&threadphoton)));
         OCL_ASSERT((clSetKernelArg(mcxkernel[i], 1, sizeof(cl_uint), (void*)&oddphotons)));
@@ -556,8 +561,10 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
         cfg->exportdebugdata = (float*)calloc(sizeof(float), (debuglen * cfg->maxjumpdebug));
     }
 
-    cfg->energytot = 0.f;
-    cfg->energyesc = 0.f;
+    cfg->energytot = (double*)calloc(cfg->srcnum, sizeof(double));
+    cfg->energyesc = (double*)calloc(cfg->srcnum, sizeof(double));
+    energytot = 0.0;
+    energyesc = 0.0;
     cfg->runtime = 0;
 
     //simulate for all time-gates in maxgate groups per run
@@ -636,13 +643,17 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
                 reporter.raytet += rep.raytet;
                 reporter.jumpdebug += rep.jumpdebug;
 
-                energy = (cl_float*)calloc(sizeof(cl_float), gpu[devid].autothread << 1);
-                OCL_ASSERT((clEnqueueReadBuffer(mcxqueue[devid], genergy[devid], CL_TRUE, 0, sizeof(cl_float) * (gpu[devid].autothread << 1),
+                energy = (cl_float*)calloc(sizeof(cl_float) * cfg->srcnum, gpu[devid].autothread << 1);
+                OCL_ASSERT((clEnqueueReadBuffer(mcxqueue[devid], genergy[devid], CL_TRUE, 0, sizeof(cl_float) * (gpu[devid].autothread << 1) * cfg->srcnum,
                                                 energy, 0, NULL, NULL)));
 
                 for (i = 0; i < gpu[devid].autothread; i++) {
-                    cfg->energyesc += energy[(i << 1)];
-                    cfg->energytot += energy[(i << 1) + 1];
+                    for (j = 0; j < cfg->srcnum; j++) {
+                        cfg->energyesc[j] += energy[(i << 1) * cfg->srcnum + j];
+                        cfg->energytot[j] += energy[((i << 1) + 1) * cfg->srcnum + j];
+                        energyesc += energy[(i << 1) * cfg->srcnum + j];
+                        energytot += energy[((i << 1) + 1) * cfg->srcnum + j];
+                    }
                 }
 
                 free(energy);
@@ -707,6 +718,7 @@ is more than what your have specified (%d), please use the -H option to specify
                     OCL_ASSERT((clEnqueueReadBuffer(mcxqueue[devid], gdref[devid], CL_TRUE, 0, sizeof(float)*nflen,
                                                     rawdref, 0, NULL, NULL)));
 
+                    //TODO: saving dref has not yet adopting double-buffer
                     for (i = 0; i < nflen; i++) { //accumulate field, can be done in the GPU
                         dref[i] += rawdref[i];    //+rawfield[i+fieldlen];
                     }
@@ -724,7 +736,7 @@ is more than what your have specified (%d), please use the -H option to specify
                     mcx_fflush(cfg->flog);
 
                     for (i = 0; i < fieldlen; i++) { //accumulate field, can be done in the GPU
-                        field[i] += rawfield[i] + rawfield[i + fieldlen];    //+rawfield[i+fieldlen];
+                        field[i] += rawfield[i] + rawfield[i + fieldlen];
                     }
 
                     free(rawfield);
@@ -754,15 +766,20 @@ is more than what your have specified (%d), please use the -H option to specify
                 cfg->exportfield[i] += field[i];
             }
         } else {
-            for (i = 0; i < cfg->maxgate; i++)
+            int srcid;
+
+            for (i = 0; i < cfg->maxgate; i++) {
                 for (j = 0; j < mesh->ne; j++) {
-                    float ww = field[i * mesh->ne + j] * 0.25f;
-                    int k;
+                    for (srcid = 0; srcid < cfg->srcnum; srcid++) {
+                        float ww = field[(i * mesh->ne + j) * cfg->srcnum + srcid] * 0.25f;
+                        int k;
 
-                    for (k = 0; k < mesh->elemlen; k++) {
-                        cfg->exportfield[i * mesh->nn + mesh->elem[j * mesh->elemlen + k] - 1] += ww;
+                        for (k = 0; k < mesh->elemlen; k++) {
+                            cfg->exportfield[(i * mesh->nn + mesh->elem[j * mesh->elemlen + k] - 1) * cfg->srcnum + srcid] += ww;
+                        }
                     }
                 }
+            }
         }
     }
 
@@ -773,11 +790,17 @@ is more than what your have specified (%d), please use the -H option to specify
     }
 
     if (cfg->isnormalized) {
-        MMC_FPRINTF(cfg->flog, "normalizing raw data ...\t");
-        mcx_fflush(cfg->flog);
+        double cur_normalizer, sum_normalizer = 0.0, energyabs = 0.0;
+
+        for (j = 0; j < cfg->srcnum; j++) {
+            energyabs =  cfg->energytot[j] - cfg->energyesc[j];
+            cur_normalizer = mesh_normalize(mesh, cfg, energyabs, cfg->energytot[j], j);
+            sum_normalizer += cur_normalizer;
+            MMCDEBUG(cfg, dlTime, (cfg->flog, "source %d\ttotal simulated energy: %f\tabsorbed: "S_BOLD""S_BLUE"%5.5f%%"S_RESET"\tnormalizor=%g\n",
+                                   j + 1, cfg->energytot[j], 100.f * energyabs / cfg->energytot[j], cur_normalizer));
+        }
 
-        cfg->energyabs = cfg->energytot - cfg->energyesc;
-        mesh_normalize(mesh, cfg, cfg->energyabs, cfg->energytot, 0);
+        cfg->his.normalizer = sum_normalizer / cfg->srcnum; // average normalizer value for all simulated sources
     }
 
 #ifndef MCX_CONTAINER
@@ -817,7 +840,7 @@ is more than what your have specified (%d), please use the -H option to specify
     MMC_FPRINTF(cfg->flog, "simulated %zu photons (%zu) with %d devices (ray-tet %.0f)\nMCX simulation speed: %.2f photon/ms\n",
                 cfg->nphoton, cfg->nphoton, workdev, reporter.raytet, (double)cfg->nphoton / toc);
     MMC_FPRINTF(cfg->flog, "total simulated energy: %.2f\tabsorbed: %5.5f%%\n(loss due to initial specular reflection is excluded in the total)\n",
-                cfg->energytot, (cfg->energytot - cfg->energyesc) / cfg->energytot * 100.f);
+                energytot, (energytot - energyesc) / energytot * 100.f);
     mcx_fflush(cfg->flog);
 
     OCL_ASSERT(clReleaseMemObject(gprogress[0]));
@@ -899,6 +922,10 @@ is more than what your have specified (%d), please use the -H option to specify
     free(mcxkernel);
 
     free(waittoread);
+    free(cfg->energytot);
+    free(cfg->energyesc);
+    cfg->energytot = NULL;
+    cfg->energyesc = NULL;
 
     if (gpu) {
         free(gpu);
diff --git a/src/mmc_core.cl b/src/mmc_core.cl
index 2f95201..0bc30ba 100644
--- a/src/mmc_core.cl
+++ b/src/mmc_core.cl
@@ -47,6 +47,9 @@ inline __device__ __host__ int get_global_id(int idx) {
 inline __device__ __host__ int get_local_id(int idx) {
     return (idx == 0) ? threadIdx.x : ( (idx == 1) ? threadIdx.y : threadIdx.z );
 }
+inline __device__ __host__ int get_local_size(int idx) {
+    return (idx == 0) ? blockDim.x : ( (idx == 1) ? blockDim.y : blockDim.z );
+}
 inline __device__ __host__ float3 operator *(float3 a, float3 b) {
     return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
 }
@@ -480,7 +483,6 @@ __device__ inline float atomicadd(volatile __global float* address, const float
 
 #endif
 
-#ifdef MCX_SAVE_DETECTORS
 __device__ void clearpath(__local float* p, int len) {
     uint i;
 
@@ -489,6 +491,7 @@ __device__ void clearpath(__local float* p, int len) {
     }
 }
 
+#ifdef MCX_SAVE_DETECTORS
 __device__ uint finddetector(float3* p0, __constant float4* gmed, __constant MCXParam* gcfg) {
     uint i;
 
@@ -1443,7 +1446,7 @@ __device__ void launchnewphoton(__constant MCXParam* gcfg, ray* r, __global floa
 
 __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXParam* gcfg, __global float3* node, __global int* elem, __global float* weight, __global float* dref,
                           __global int* type, __global int* facenb,  __global int* srcelem, __global float4* normal, __constant Medium* gmed,
-                          __global float* n_det, __global uint* detectedphoton, float* energytot, float* energyesc, __private RandType* ran, int* raytet, __global float* srcpattern,
+                          __global float* n_det, __global uint* detectedphoton, __local float* energytot, __local float* energyesc, __private RandType* ran, int* raytet, __global float* srcpattern,
                           __global float* replayweight, __global float* replaytime, __global RandType* photonseed, __global MCXReporter* reporter, __global float* gdebugdata) {
 
     int oldeid, fixcount = 0;
@@ -1464,12 +1467,10 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP
 
     /*initialize the photon parameters*/
     launchnewphoton(gcfg, &r, node, elem, srcelem, ran, srcpattern);
-    *energytot += r.weight;
+
 #ifdef MCX_SAVE_DETECTORS
 
     if (GPU_PARAM(gcfg, issavedet)) {
-        clearpath(ppath, GPU_PARAM(gcfg, reclen));  /*clear shared memory for saving history of a new photon*/
-
         if (GPU_PARAM(gcfg, srctype) != stPattern || GPU_PARAM(gcfg, srcnum) == 1) {
             ppath[GPU_PARAM(gcfg, reclen) - 1] = r.weight; /*last record in partialpath is the initial photon weight*/
         } else if (GPU_PARAM(gcfg, srctype) == stPattern) {
@@ -1479,9 +1480,12 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP
 
 #endif
 
-    if (GPU_PARAM(gcfg, srctype) == stPattern && GPU_PARAM(gcfg, srcnum) > 1) {
-        for (oldeid = 0; oldeid > GPU_PARAM(gcfg, srcnum); oldeid++) {
+    if (GPU_PARAM(gcfg, srcnum) == 1) {
+        *energytot += r.weight;
+    } else {
+        for (oldeid = 0; oldeid < GPU_PARAM(gcfg, srcnum); oldeid++) {
             ppath[GPU_PARAM(gcfg, reclen) + oldeid] = srcpattern[r.posidx * GPU_PARAM(gcfg, srcnum) + oldeid];
+            energytot[oldeid] += r.weight * ppath[GPU_PARAM(gcfg, reclen) + oldeid];
         }
     }
 
@@ -1700,7 +1704,9 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP
         savedebugdata(&r, id, reporter, gdebugdata, gcfg);
     }
 
-    *energyesc += r.weight;
+    for (oldeid = 0; oldeid < GPU_PARAM(gcfg, srcnum); oldeid++) {
+        energyesc[oldeid] += r.weight * ppath[GPU_PARAM(gcfg, reclen) + oldeid];
+    }
 }
 
 __kernel void mmc_main_loop(const int nphoton, const int ophoton,
@@ -1714,7 +1720,6 @@ __kernel void mmc_main_loop(const int nphoton, const int ophoton,
 
     RandType t[RAND_BUF_LEN];
     int idx = get_global_id(0);
-    float  energyesc = 0.f, energytot = 0.f;
     int raytet = 0;
 
 #ifdef __NVCC__
@@ -1725,6 +1730,9 @@ __kernel void mmc_main_loop(const int nphoton, const int ophoton,
         gpu_rng_init(t, n_seed, idx);
     }
 
+    clearpath(sharedmem, get_local_size(0) * ((GPU_PARAM(gcfg, srcnum) << 1) +
+              (GPU_PARAM(gcfg, reclen) + (GPU_PARAM(gcfg, srcnum) > 1) * GPU_PARAM(gcfg, srcnum))));
+
     /*launch photons*/
     for (int i = 0; i < nphoton + (idx < ophoton); i++) {
         if (GPU_PARAM(gcfg, seed) == SEED_FROM_FILE)
@@ -1732,14 +1740,17 @@ __kernel void mmc_main_loop(const int nphoton, const int ophoton,
                 t[j] = replayseed[(idx * nphoton + MIN(idx, ophoton) + i) * RAND_BUF_LEN + j];
             }
 
-        onephoton(idx * nphoton + MIN(idx, ophoton) + i, sharedmem + get_local_id(0) *
-                  (GPU_PARAM(gcfg, reclen) + (GPU_PARAM(gcfg, srcnum) > 1) * GPU_PARAM(gcfg, srcnum)), gcfg, node, elem,
-                  weight, dref, type, facenb, srcelem, normal, gmed, n_det, detectedphoton, &energytot, &energyesc, t, &raytet,
+        onephoton(idx * nphoton + MIN(idx, ophoton) + i, sharedmem + get_local_size(0) * (GPU_PARAM(gcfg, srcnum) << 1) +
+                  get_local_id(0) * (GPU_PARAM(gcfg, reclen) + (GPU_PARAM(gcfg, srcnum) > 1) * GPU_PARAM(gcfg, srcnum)), gcfg, node, elem,
+                  weight, dref, type, facenb, srcelem, normal, gmed, n_det, detectedphoton, sharedmem + get_local_id(0) * GPU_PARAM(gcfg, srcnum),
+                  sharedmem + (get_local_size(0) + get_local_id(0)) * GPU_PARAM(gcfg, srcnum), t, &raytet,
                   srcpattern, replayweight, replaytime, photonseed, reporter, gdebugdata);
     }
 
-    energy[idx << 1] = energyesc;
-    energy[1 + (idx << 1)] = energytot;
+    for (int i = 0; i < GPU_PARAM(gcfg, srcnum); i++) {
+        energy[(idx << 1) * GPU_PARAM(gcfg, srcnum) + i] += sharedmem[(get_local_size(0) + get_local_id(0)) * GPU_PARAM(gcfg, srcnum) + i];
+        energy[((idx << 1) + 1) * GPU_PARAM(gcfg, srcnum) + i] += sharedmem[get_local_id(0) * GPU_PARAM(gcfg, srcnum) + i];
+    }
 
     if (GPU_PARAM(gcfg, debuglevel) & MCX_DEBUG_PROGRESS && progress) {
         atomic_inc(progress);
diff --git a/src/mmc_cu_host.cu b/src/mmc_cu_host.cu
index f14102d..d6290a9 100644
--- a/src/mmc_cu_host.cu
+++ b/src/mmc_cu_host.cu
@@ -238,6 +238,7 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo
     uint hostdetreclen = detreclen + 1;
     // launch mcxkernel
     size_t sharedmemsize = 0;
+    double energytot = 0.0, energyesc = 0.0;
 
     MCXParam param = {make_float3(cfg->srcpos.x, cfg->srcpos.y, cfg->srcpos.z),
                       make_float3(cfg->srcdir.x, cfg->srcdir.y, cfg->srcdir.z),
@@ -280,7 +281,7 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo
                       cfg->bary0,
                       cfg->e0,
                       cfg->isextdet,
-                      (int)meshlen,
+                      (int)(meshlen / cfg->srcnum),
                       (uint)(mesh->prop + 1 + cfg->isextdet) + cfg->detnum,
                       (uint)(MIN((MAX_PROP - (mesh->prop + 1 + cfg->isextdet) - cfg->detnum), ((mesh->ne) << 2)) >> 2), /*max count of elem normal data in const mem*/
                       cfg->issaveseed,
@@ -296,6 +297,8 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo
 
     param.reclen = sharedmemsize / sizeof(float);  //< the shared memory buffer length associated with detected photon
 
+    sharedmemsize += sizeof(float) * (cfg->srcnum << 1);   /**< store energyesc/energytot */
+
     if (cfg->srctype == stPattern && cfg->srcnum > 1) {
         sharedmemsize += sizeof(float) * cfg->srcnum;
     }
@@ -332,8 +335,8 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo
             cfg->exportseed = (unsigned char*)malloc(cfg->maxdetphoton * (sizeof(RandType) * RAND_BUF_LEN));
         }
 
-        cfg->energytot = 0.f;
-        cfg->energyesc = 0.f;
+        cfg->energytot = (double*)calloc(cfg->srcnum, sizeof(double));
+        cfg->energyesc = (double*)calloc(cfg->srcnum, sizeof(double));
         cfg->runtime = 0;
     }
     #pragma omp barrier
@@ -472,7 +475,7 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo
     *progress = 0;
 
     Pseed = (uint*)malloc(sizeof(uint) * gpu[gpuid].autothread * RAND_SEED_WORD_LEN);
-    energy = (float*)calloc(sizeof(float), gpu[gpuid].autothread << 1);
+    energy = (float*)calloc(sizeof(float) * cfg->srcnum, gpu[gpuid].autothread << 1);
 
     for (j = 0; j < gpu[gpuid].autothread * RAND_SEED_WORD_LEN; j++) {
         Pseed[j] = rand();
@@ -499,9 +502,9 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo
                            cudaMemcpyHostToDevice));
 
     CUDA_ASSERT(cudaMalloc((void**)&genergy,
-                           sizeof(float) * (gpu[gpuid].autothread << 1)));
+                           sizeof(float) * (gpu[gpuid].autothread << 1) * cfg->srcnum));
     CUDA_ASSERT(cudaMemcpy(genergy, energy,
-                           sizeof(float) * (gpu[gpuid].autothread << 1),
+                           sizeof(float) * (gpu[gpuid].autothread << 1) * cfg->srcnum,
                            cudaMemcpyHostToDevice));
 
     CUDA_ASSERT(cudaMalloc((void**)&gdetected, sizeof(uint)));
@@ -591,6 +594,9 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo
         MMC_FPRINTF(cfg->flog,
                     "lauching mcx_main_loop for time window [%.1fns %.1fns] ...\n",
                     twindow0 * 1e9, twindow1 * 1e9);
+
+        MMC_FPRINTF(cfg->flog, "requesting %ld bytes of shared memory\n", sharedmemsize);
+
         mcx_fflush(cfg->flog);
 
         // total number of repetition for the simulations, results will be
@@ -650,16 +656,21 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo
             reporter.raytet += rep.raytet;
             reporter.jumpdebug += rep.jumpdebug;
 
-            energy = (float*)calloc(sizeof(float), gpu[gpuid].autothread << 1);
+            energy = (float*)calloc(sizeof(float) * cfg->srcnum, gpu[gpuid].autothread << 1);
 
             CUDA_ASSERT(cudaMemcpy(energy, genergy,
-                                   sizeof(float) * (gpu[gpuid].autothread << 1),
+                                   sizeof(float) * (gpu[gpuid].autothread << 1) * cfg->srcnum,
                                    cudaMemcpyDeviceToHost));
             #pragma omp critical
             {
+
                 for (i = 0; i < gpu[gpuid].autothread; i++) {
-                    cfg->energyesc += energy[(i << 1)];
-                    cfg->energytot += energy[(i << 1) + 1];
+                    for (j = 0; j < (uint) cfg->srcnum; j++) {
+                        cfg->energyesc[j] += energy[(i << 1) * cfg->srcnum + j];
+                        cfg->energytot[j] += energy[((i << 1) + 1) * cfg->srcnum + j];
+                        energyesc += energy[(i << 1) * cfg->srcnum + j];
+                        energytot += energy[((i << 1) + 1) * cfg->srcnum + j];
+                    }
                 }
             }
 
@@ -784,7 +795,7 @@ are more than what your have specified (%d), please use the --maxjumpdebug optio
 
     #pragma omp master
     {
-        int i, j;
+        int i, j, srcid;
 
         if (cfg->exportfield) {
             if (cfg->basisorder == 0 || cfg->method == rtBLBadouelGrid) {
@@ -792,14 +803,18 @@ are more than what your have specified (%d), please use the --maxjumpdebug optio
                     #pragma omp atomic
                     cfg->exportfield[i] += field[i];
             } else {
-                for (i = 0; i < cfg->maxgate; i++)
+                for (i = 0; i < cfg->maxgate; i++) {
                     for (j = 0; j < mesh->ne; j++) {
-                        float ww = field[i * mesh->ne + j] * 0.25f;
+                        for (srcid = 0; srcid < cfg->srcnum; srcid++) {
+                            float ww = field[(i * mesh->ne + j) * cfg->srcnum + srcid] * 0.25f;
+                            int k;
 
-                        for (int k = 0; k < mesh->elemlen; k++)
-                            cfg->exportfield[i * mesh->nn +
-                                               mesh->elem[j * mesh->elemlen + k] - 1] += ww;
+                            for (k = 0; k < mesh->elemlen; k++) {
+                                cfg->exportfield[(i * mesh->nn + mesh->elem[j * mesh->elemlen + k] - 1) * cfg->srcnum + srcid] += ww;
+                            }
+                        }
                     }
+                }
             }
         }
 
@@ -810,11 +825,17 @@ are more than what your have specified (%d), please use the --maxjumpdebug optio
         }
 
         if (cfg->isnormalized) {
-            MMC_FPRINTF(cfg->flog, "normalizing raw data ...\t");
-            mcx_fflush(cfg->flog);
+            double cur_normalizer, sum_normalizer = 0.0, energyabs = 0.0;
+
+            for (j = 0; j < cfg->srcnum; j++) {
+                energyabs =  cfg->energytot[j] - cfg->energyesc[j];
+                cur_normalizer = mesh_normalize(mesh, cfg, energyabs, cfg->energytot[j], j);
+                sum_normalizer += cur_normalizer;
+                MMCDEBUG(cfg, dlTime, (cfg->flog, "source %d\ttotal simulated energy: %f\tabsorbed: " S_BOLD "" S_BLUE "%5.5f%%" S_RESET "\tnormalizor=%g\n",
+                                       j + 1, cfg->energytot[j], 100.f * energyabs / cfg->energytot[j], cur_normalizer));
+            }
 
-            cfg->energyabs = cfg->energytot - cfg->energyesc;
-            mesh_normalize(mesh, cfg, cfg->energyabs, cfg->energytot, 0);
+            cfg->his.normalizer = sum_normalizer / cfg->srcnum; // average normalizer value for all simulated sources
         }
 
 #ifndef MCX_CONTAINER
@@ -868,8 +889,8 @@ are more than what your have specified (%d), please use the --maxjumpdebug optio
         MMC_FPRINTF(cfg->flog,
                     "total simulated energy: %.2f\tabsorbed: %5.5f%%\n(loss due to "
                     "initial specular reflection is excluded in the total)\n",
-                    cfg->energytot,
-                    (cfg->energytot - cfg->energyesc) / cfg->energytot * 100.f);
+                    energytot,
+                    (energytot - energyesc) / energytot * 100.f);
         mcx_fflush(cfg->flog);
     }
     #pragma omp barrier
@@ -913,9 +934,15 @@ are more than what your have specified (%d), please use the --maxjumpdebug optio
     CUDA_ASSERT(cudaFree(greporter));
 
     #pragma omp master
+    {
+        if (gpu) {
+            free(gpu);
+        }
 
-    if (gpu) {
-        free(gpu);
+        free(cfg->energytot);
+        free(cfg->energyesc);
+        cfg->energytot = NULL;
+        cfg->energyesc = NULL;
     }
 
     free(field);
diff --git a/src/mmc_utils.c b/src/mmc_utils.c
index 404a149..e87fb09 100644
--- a/src/mmc_utils.c
+++ b/src/mmc_utils.c
@@ -321,9 +321,8 @@ void mcx_initcfg(mcconfig* cfg) {
     cfg->exportdetected = NULL;
     cfg->exportseed = NULL;
     cfg->detectedcount = 0;
-    cfg->energytot = 0.f;
-    cfg->energyabs = 0.f;
-    cfg->energyesc = 0.f;
+    cfg->energytot = NULL;
+    cfg->energyesc = NULL;
     cfg->runtime = 0;
     cfg->autopilot = 1;
     cfg->gpuid = 0;
@@ -421,6 +420,14 @@ void mcx_clearcfg(mcconfig* cfg) {
         free(cfg->roidata);
     }
 
+    if (cfg->energytot) {
+        free(cfg->energytot);
+    }
+
+    if (cfg->energyesc) {
+        free(cfg->energyesc);
+    }
+
 #ifndef MCX_EMBED_CL
 
     if (cfg->clsource && cfg->clsource != (char*)mmc_core_cl) {
@@ -1582,6 +1589,8 @@ int mcx_loadjson(cJSON* root, mcconfig* cfg) {
                     if (ndim == 3 && dims[2] > 1 && dims[0] > 1 && cfg->srctype == MCX_SRC_PATTERN) {
                         cfg->srcnum = dims[0];
                     }
+
+                    mcx_convertrow2col(cfg->srcpattern, (uint3*)dims);
                 } else {
                     int nx = FIND_JSON_KEY("Nx", "Optode.Source.Pattern.Nx", subitem, 0, valueint);
                     int ny = FIND_JSON_KEY("Ny", "Optode.Source.Pattern.Ny", subitem, 0, valueint);
@@ -2776,6 +2785,35 @@ int mcx_isbinstr(const char* str) {
     return 1;
 }
 
+/**
+ * @brief Convert a row-major (C/C++) array to a column-major (MATLAB/FORTRAN) array
+ *
+ * @param[in,out] vol: a 3D array (wrapped in 1D) to be converted
+ * @param[in] dim: the dimensions of the 3D array
+ */
+
+void  mcx_convertrow2col(float* vol, uint3* dim) {
+    uint x, y, z;
+    unsigned int dimxy, dimyz;
+    float* newvol = NULL;
+
+    if (vol == NULL || dim->x == 0 || dim->y == 0 || dim->z == 0) {
+        return;
+    }
+
+    newvol = (float*)malloc(sizeof(float) * dim->x * dim->y * dim->z);
+    dimxy = dim->x * dim->y;
+    dimyz = dim->y * dim->z;
+
+    for (x = 0; x < dim->x; x++)
+        for (y = 0; y < dim->y; y++)
+            for (z = 0; z < dim->z; z++) {
+                newvol[z * dimxy + y * dim->x + x] = vol[x * dimyz + y * dim->z + z];
+            }
+
+    memcpy(vol, newvol, sizeof(float) * dim->x * dim->y * dim->z);
+    free(newvol);
+}
 
 /**
  * @brief Validate all input fields, and warn incompatible inputs
diff --git a/src/mmc_utils.h b/src/mmc_utils.h
index 0e0487a..ffd7aa2 100644
--- a/src/mmc_utils.h
+++ b/src/mmc_utils.h
@@ -276,7 +276,8 @@ typedef struct MMC_config {
     unsigned char* exportseed;     /*memory buffer when returning the RNG seed to matlab*/
     float* exportdetected;         /*memory buffer when returning the partial length info to external programs such as matlab*/
     float* exportdebugdata;        /**<pointer to the buffer where the photon trajectory data are stored*/
-    double energytot, energyabs, energyesc;
+    double* energytot;             /**<total energy launched for each source, a buffer of length srcnum */
+    double* energyesc;             /**<total energy escaped for each source, a buffer of length srcnum */
     unsigned int detectedcount;    /**<total number of detected photons*/
     unsigned int runtime;          /**<total simulation runtime in ms*/
     char autopilot;                /**<1 optimal setting for dedicated card, 2, for non dedicated card*/
@@ -333,6 +334,7 @@ void mcx_convertcol2row4d(unsigned int** vol, uint4* dim);
 void mcx_savejdata(char* filename, mcconfig* cfg);
 int  mcx_jdataencode(void* vol,  int ndim, uint* dims, char* type, int byte, int zipid, void* obj, int isubj, int iscol, mcconfig* cfg);
 int  mcx_jdatadecode(void** vol, int* ndim, uint* dims, int maxdim, char** type, cJSON* obj, mcconfig* cfg);
+void mcx_convertrow2col(float* vol, uint3* dim);
 void mcx_savejnii(OutputType* vol, int ndim, uint* dims, float* voxelsize, char* name, int isfloat, int iscol, mcconfig* cfg);
 void mcx_savebnii(OutputType* vol, int ndim, uint* dims, float* voxelsize, char* name, int isfloat, int iscol, mcconfig* cfg);
 void mcx_savejdet(float* ppath, void* seeds, uint count, int doappend, mcconfig* cfg);