Skip to content

Commit

Permalink
Merge pull request #142 from bragadeesh/develop
Browse files: browse the repository at this point in the history
Add radix 11/13 to support transform sizes that are powers of 11/13
  • Loading branch information
bragadeesh committed Apr 13, 2016
2 parents 4e67415 + 237f0d0 commit b7da307
Show file tree
Hide file tree
Showing 14 changed files with 1,142 additions and 278 deletions.
5 changes: 1 addition & 4 deletions src/library/action.transpose.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,6 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::generateKernel(FFTRepo& fftRep
}
}
OPENCL_V(clfft_transpose_generator::genTransposeKernelLeadingDimensionBatched(this->signature, programCode, lwSize, reShapeFactor), _T("genTransposeKernel() failed!"));
//std::cout << programCode << std::endl;//TIMMY
}
else if (this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED)
{
Expand All @@ -247,7 +246,6 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::generateKernel(FFTRepo& fftRep
}
}
OPENCL_V(clfft_transpose_generator::genTransposeKernelBatched(this->signature, programCode, lwSize, reShapeFactor), _T("genTransposeKernel() failed!"));
//std::cout << programCode << std::endl;//TIMMY
}
else
{
Expand Down Expand Up @@ -283,7 +281,6 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::generateKernel(FFTRepo& fftRep
*/
//general swap kernel takes care of all ratio
OPENCL_V(clfft_transpose_generator::genSwapKernelGeneral(this->signature, programCode, kernelFuncName, lwSize, reShapeFactor), _T("genSwapKernel() failed!"));
//std::cout << programCode << std::endl;//TIMMY
}

cl_int status = CL_SUCCESS;
Expand Down Expand Up @@ -720,7 +717,7 @@ clfftStatus FFTGeneratedTransposeSquareAction::generateKernel(FFTRepo& fftRepo,
{
OPENCL_V(fftRepo.setProgramEntryPoints(Transpose_SQUARE, this->getSignatureData(), "transpose_square", "transpose_square", Device, QueueContext), _T("fftRepo.setProgramEntryPoint() failed!"));
}
//std::cout << programCode << std::endl;//TIMMY

return CLFFT_SUCCESS;
}

Expand Down
139 changes: 109 additions & 30 deletions src/library/generator.stockham.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -396,7 +396,7 @@ namespace StockhamGenerator
return;
}

size_t baseRadix[] = {7,5,3,2}; // list only supported primes
size_t baseRadix[] = {13,11,7,5,3,2}; // list only supported primes
size_t baseRadixSize = sizeof(baseRadix)/sizeof(baseRadix[0]);

size_t l = length;
Expand Down Expand Up @@ -437,7 +437,19 @@ namespace StockhamGenerator
{
workGroupSize = 49;
numTrans = length >= 7*workGroupSize ? 1 : (7*workGroupSize)/length;
} else {
}
else if (primeFactorsExpanded[11] == length) // Length is pure power of 11
{
workGroupSize = 121;
numTrans = length >= 11 * workGroupSize ? 1 : (11 * workGroupSize) / length;
}
else if (primeFactorsExpanded[13] == length) // Length is pure power of 13
{
workGroupSize = 169;
numTrans = length >= 13 * workGroupSize ? 1 : (13 * workGroupSize) / length;
}
else
{
size_t leastNumPerWI = 1; // least number of elements in one work item
size_t maxWorkGroupSize = MAX_WGS; // maximum work group size desired

Expand Down Expand Up @@ -470,7 +482,14 @@ namespace StockhamGenerator
leastNumPerWI = 70; maxWorkGroupSize = 36;
} else if (primeFactorsExpanded[3] * primeFactorsExpanded[5] * primeFactorsExpanded[7] == length) {
leastNumPerWI =105; maxWorkGroupSize = 24;
} else {
}
else if (primeFactorsExpanded[2] * primeFactorsExpanded[11] == length) {
leastNumPerWI = 22; maxWorkGroupSize = 128;
}
else if (primeFactorsExpanded[2] * primeFactorsExpanded[13] == length) {
leastNumPerWI = 26; maxWorkGroupSize = 128;
}
else {
leastNumPerWI =210; maxWorkGroupSize = 12;
}
if (pr==P_DOUBLE)
Expand Down Expand Up @@ -2025,7 +2044,7 @@ namespace StockhamGenerator
fft_postCallback = postcallbackParam;
}

void GeneratePass( bool fwd, std::string &passStr, bool fft_3StepTwiddle,
void GeneratePass( bool fwd, std::string &passStr, bool fft_3StepTwiddle, bool twiddleFront,
bool inInterleaved, bool outInterleaved,
bool inReal, bool outReal,
size_t inStride, size_t outStride, double scale,
Expand Down Expand Up @@ -2495,7 +2514,7 @@ namespace StockhamGenerator

// 3-step twiddle multiplies done in the front
bool tw3Done = false;
if(fft_3StepTwiddle && (position == 0))
if(fft_3StepTwiddle && twiddleFront)
{
tw3Done = true;
if(linearRegs)
Expand Down Expand Up @@ -3019,7 +3038,7 @@ namespace StockhamGenerator
else
{
// Possible radices
size_t cRad[] = {10,8,7,6,5,4,3,2,1}; // Must be in descending order
size_t cRad[] = {13,11,10,8,7,6,5,4,3,2,1}; // Must be in descending order
size_t cRadSize = (sizeof(cRad)/sizeof(cRad[0]));

// Generate the radix and pass objects
Expand Down Expand Up @@ -3233,32 +3252,12 @@ namespace StockhamGenerator

std::string sfx = FloatSuffix<PR>();

// Base type
str += "#define fptype "; str += RegBaseType<PR>(1); str += "\n\n";

// Vector type
str += "#define fvect2 "; str += RegBaseType<PR>(2); str += "\n\n";

//constants
str += "#define C8Q 0.70710678118654752440084436210485"; str += sfx; str += "\n";

str += "#define C5QA 0.30901699437494742410229341718282"; str += sfx; str += "\n";
str += "#define C5QB 0.95105651629515357211643933337938"; str += sfx; str += "\n";
str += "#define C5QC 0.50000000000000000000000000000000"; str += sfx; str += "\n";
str += "#define C5QD 0.58778525229247312916870595463907"; str += sfx; str += "\n";
str += "#define C5QE 0.80901699437494742410229341718282"; str += sfx; str += "\n";

str += "#define C3QA 0.50000000000000000000000000000000"; str += sfx; str += "\n";
str += "#define C3QB 0.86602540378443864676372317075294"; str += sfx; str += "\n";

str += "#define C7Q1 -1.16666666666666651863693004997913"; str += sfx; str += "\n";
str += "#define C7Q2 0.79015646852540022404554065360571"; str += sfx; str += "\n";
str += "#define C7Q3 0.05585426728964774240049351305970"; str += sfx; str += "\n";
str += "#define C7Q4 0.73430220123575240531721419756650"; str += sfx; str += "\n";
str += "#define C7Q5 0.44095855184409837868031445395900"; str += sfx; str += "\n";
str += "#define C7Q6 0.34087293062393136944265847887436"; str += sfx; str += "\n";
str += "#define C7Q7 -0.53396936033772524066165487965918"; str += sfx; str += "\n";
str += "#define C7Q8 0.87484229096165666561546458979137"; str += sfx; str += "\n";

str += "\n";

bool cReg = linearRegs ? true : false;

// Generate butterflies for all unique radices
Expand All @@ -3269,6 +3268,86 @@ namespace StockhamGenerator
uradices.sort();
uradices.unique();


//constants
if (length%8 == 0)
{
str += "#define C8Q 0.70710678118654752440084436210485"; str += sfx; str += "\n";
}

if (length % 5 == 0)
{
str += "#define C5QA 0.30901699437494742410229341718282"; str += sfx; str += "\n";
str += "#define C5QB 0.95105651629515357211643933337938"; str += sfx; str += "\n";
str += "#define C5QC 0.50000000000000000000000000000000"; str += sfx; str += "\n";
str += "#define C5QD 0.58778525229247312916870595463907"; str += sfx; str += "\n";
str += "#define C5QE 0.80901699437494742410229341718282"; str += sfx; str += "\n";
}

if (length % 3 == 0)
{
str += "#define C3QA 0.50000000000000000000000000000000"; str += sfx; str += "\n";
str += "#define C3QB 0.86602540378443864676372317075294"; str += sfx; str += "\n";
}

if (length % 7 == 0)
{
str += "#define C7Q1 -1.16666666666666651863693004997913"; str += sfx; str += "\n";
str += "#define C7Q2 0.79015646852540022404554065360571"; str += sfx; str += "\n";
str += "#define C7Q3 0.05585426728964774240049351305970"; str += sfx; str += "\n";
str += "#define C7Q4 0.73430220123575240531721419756650"; str += sfx; str += "\n";
str += "#define C7Q5 0.44095855184409837868031445395900"; str += sfx; str += "\n";
str += "#define C7Q6 0.34087293062393136944265847887436"; str += sfx; str += "\n";
str += "#define C7Q7 -0.53396936033772524066165487965918"; str += sfx; str += "\n";
str += "#define C7Q8 0.87484229096165666561546458979137"; str += sfx; str += "\n";
}

if (length % 11 == 0)
{
str += "#define b11_0 0.9898214418809327"; str += sfx; str += "\n";
str += "#define b11_1 0.9594929736144973"; str += sfx; str += "\n";
str += "#define b11_2 0.9189859472289947"; str += sfx; str += "\n";
str += "#define b11_3 0.8767688310025893"; str += sfx; str += "\n";
str += "#define b11_4 0.8308300260037728"; str += sfx; str += "\n";
str += "#define b11_5 0.7784344533346518"; str += sfx; str += "\n";
str += "#define b11_6 0.7153703234534297"; str += sfx; str += "\n";
str += "#define b11_7 0.6343562706824244"; str += sfx; str += "\n";
str += "#define b11_8 0.3425847256816375"; str += sfx; str += "\n";
str += "#define b11_9 0.5211085581132027"; str += sfx; str += "\n";
}

if (length % 13 == 0)
{
str += "#define b13_0 0.9682872443619840"; str += sfx; str += "\n";
str += "#define b13_1 0.9578059925946651"; str += sfx; str += "\n";
str += "#define b13_2 0.8755023024091479"; str += sfx; str += "\n";
str += "#define b13_3 0.8660254037844386"; str += sfx; str += "\n";
str += "#define b13_4 0.8595425350987748"; str += sfx; str += "\n";
str += "#define b13_5 0.8534800018598239"; str += sfx; str += "\n";
str += "#define b13_6 0.7693388175729806"; str += sfx; str += "\n";
str += "#define b13_7 0.6865583707817543"; str += sfx; str += "\n";
str += "#define b13_8 0.6122646503767565"; str += sfx; str += "\n";
str += "#define b13_9 0.6004772719326652"; str += sfx; str += "\n";
str += "#define b13_10 0.5817047785105157"; str += sfx; str += "\n";
str += "#define b13_11 0.5751407294740031"; str += sfx; str += "\n";
str += "#define b13_12 0.5220263851612750"; str += sfx; str += "\n";
str += "#define b13_13 0.5200285718888646"; str += sfx; str += "\n";
str += "#define b13_14 0.5165207806234897"; str += sfx; str += "\n";
str += "#define b13_15 0.5149187780863157"; str += sfx; str += "\n";
str += "#define b13_16 0.5035370328637666"; str += sfx; str += "\n";
str += "#define b13_17 0.5000000000000000"; str += sfx; str += "\n";
str += "#define b13_18 0.3027756377319946"; str += sfx; str += "\n";
str += "#define b13_19 0.3014792600477098"; str += sfx; str += "\n";
str += "#define b13_20 0.3004626062886657"; str += sfx; str += "\n";
str += "#define b13_21 0.2517685164318833"; str += sfx; str += "\n";
str += "#define b13_22 0.2261094450357824"; str += sfx; str += "\n";
str += "#define b13_23 0.0833333333333333"; str += sfx; str += "\n";
str += "#define b13_24 0.0386329546443481"; str += sfx; str += "\n";
}

str += "\n";


//If pre-callback is set for the plan
std::string callbackstr;
if (params.fft_hasPreCallback)
Expand Down Expand Up @@ -3351,7 +3430,7 @@ namespace StockhamGenerator
if((p+1) != passes.end()) { outIlvd = ldsInterleaved; }
}

p->GeneratePass(fwd, str, tw3Step, inIlvd, outIlvd, inRl, outRl, ins, outs, s, gIn, gOut);
p->GeneratePass(fwd, str, tw3Step, params.fft_twiddleFront, inIlvd, outIlvd, inRl, outRl, ins, outs, s, gIn, gOut);
}

// if real transform we do only 1 direction
Expand Down
Loading

0 comments on commit b7da307

Please sign in to comment.