
#include "kernel.cuh"

// Approximation of pi to five decimal places.
// NOTE(review): not referenced by any code in the visible part of this file.
#define PI 3.14159

// Debug switch: when true, the host launch wrappers in this file print the
// measured wall-clock time (in microseconds) of each kernel launch + sync to stdout.
static bool messageprint = false;

// Element-wise integer addition: c[t] = a[t] + b[t], where t is the thread's
// threadIdx.x. The block index is not used and there is no bounds check, so the
// caller must launch no more threads in x than the arrays have elements.
__global__ void addKernel(int *c, const int *a, const int *b)
{
    const int tid = threadIdx.x;
    const int lhs = a[tid];
    const int rhs = b[tid];
    c[tid] = lhs + rhs;
}



// Multiplies a complex input (a + j*a_im) by a real, kr-selected kernel matrix b.
//
// Thread mapping: x -> frequency row, y -> Legendre sample (output column),
// z -> audio block (plane).
//
// For every thread with row < freqbinanalysis and col < legendresamples:
//   kr          = (int)dev_kr_matrix[row + freqstart]
//   c   [p,row,col] = sum_i a   [p, row + freqstart, i] * b[kr, col, i]
//   c_im[p,row,col] = sum_i a_im[p, row + freqstart, i] * b[kr, col, i]
// with i running over legendresamples.
//
// Index layout used by the code:
//   a, a_im, c, c_im : plane stride freqbinanalysis * legendresamples, row stride legendresamples
//   b                : kr stride legendresamples^2, column stride legendresamples
//
// tfheight is accepted but not used. The plane index is not bounds-checked
// (the kernel receives no plane count), so the launch must not create more
// z threads than there are planes in the buffers.
// NOTE(review): the input row is offset by freqstart while the plane stride is
// freqbinanalysis * legendresamples - confirm this matches the layout of a / a_im.
__global__ void PlanematrixMultiplicationKernel(float* c, float* c_im, float* a, float* b, float* a_im, int freqbinanalysis, int tfheight,int freqstart,int legendresamples,float* dev_kr_matrix)
{
    const int col   = blockIdx.y * blockDim.y + threadIdx.y;   // Legendre sample
    const int row   = blockIdx.x * blockDim.x + threadIdx.x;   // frequency index
    const int plane = blockIdx.z * blockDim.z + threadIdx.z;   // audio block

    // Tail threads of a rounded-up grid have nothing to do.
    if (row >= freqbinanalysis || col >= legendresamples)
        return;

    const int srcRow    = row + freqstart;
    const int kr        = (int)dev_kr_matrix[srcRow];
    const int planeBase = plane * freqbinanalysis * legendresamples;
    const int aBase     = planeBase + srcRow * legendresamples;
    const int bBase     = kr * legendresamples * legendresamples + col * legendresamples;

    float accRe = 0;
    float accIm = 0;
    for (int k = 0; k < legendresamples; ++k)
    {
        const float weight = b[bBase + k];
        accRe += a[aBase + k] * weight;
        accIm += a_im[aBase + k] * weight;
    }

    const int outIdx = planeBase + row * legendresamples + col;
    c[outIdx]    = accRe;
    c_im[outIdx] = accIm;
}
// Complex matrix product of per-plane input spectra with a kr-selected complex kernel.
//
// Thread mapping: x -> frequency row (ROW), y -> Legendre sample (COL),
// z -> audio block (PLANE_OFFSET).
//
// For every thread with ROW < noffrequencybins and COL < legendresamples:
//   kr = (int)dev_kr_matrix[ROW]
//   c + j*c_im = sum_{i < micsize} (a + j*a_im)[plane, ROW, i] * (b + j*b_im)[kr, COL, i]
//
// Index layout used by the code:
//   a, a_im : plane stride micsize * tfheight, row stride micsize
//   b, b_im : kr stride legendresamples * micsize, column stride micsize
//   c, c_im : plane stride noffrequencybins * legendresamples, row stride legendresamples
//
// freqstart is accepted but not used. The plane index is not bounds-checked
// (the kernel receives no plane count), so the launch must not create more
// z threads than there are planes.
//
// Fix: the stores to c / c_im are now inside the ROW/COL guard. Previously every
// launched thread stored its (zero) accumulator, so tail threads of a rounded-up
// grid wrote past the row/array end and could overwrite valid results.
__global__ void PlanecomplexmatrixMultiplicationKernel(float* c, float* c_im, float* a, float* b, float* a_im, float* b_im, unsigned int micsize, unsigned int tfheight, int noffrequencybins,int freqstart,int legendresamples, float* dev_kr_matrix)
{
    //For Legendre samples
    const int COL = blockIdx.y * blockDim.y + threadIdx.y;
    //For frequency axes
    const int ROW = blockIdx.x * blockDim.x + threadIdx.x;
    //For audio blocks
    const int PLANE_OFFSET = blockIdx.z * blockDim.z + threadIdx.z;

    // Threads outside the output matrix must neither read nor write.
    if (ROW >= noffrequencybins || COL >= legendresamples)
        return;

    const int kr = (int)dev_kr_matrix[ROW];

    // Loop-invariant bases (unsigned arithmetic, as in the original index expressions).
    const unsigned int aBase = PLANE_OFFSET * micsize * tfheight + ROW * micsize;
    const unsigned int bBase = kr * legendresamples * micsize + COL * micsize;

    float tmpSum = 0.0f;
    float tmpSum_im = 0.0f;

    for (unsigned int i = 0; i < micsize; i++)
    {
        const float aRe = a[aBase + i];
        const float aIm = a_im[aBase + i];
        const float bRe = b[bBase + i];
        const float bIm = b_im[bBase + i];

        // (aRe + j*aIm) * (bRe + j*bIm)
        tmpSum    += aRe * bRe - aIm * bIm;
        tmpSum_im += aRe * bIm + aIm * bRe;
    }

    const int outIdx = PLANE_OFFSET * noffrequencybins * legendresamples + ROW * legendresamples + COL;
    c[outIdx]    = tmpSum;
    c_im[outIdx] = tmpSum_im;
}

// Multiplies a complex input (a + j*a_im) by a real matrix taken from b, where the
// matrix is selected per (plane, frequency) by kr and by the index stored in hist.
//
// Thread mapping: x -> frequency row (ROW), y -> Legendre sample (COL),
// z -> audio block (PLANE_OFFSET).
//
// For every thread with ROW < freqnumber and COL < legendresamples:
//   kr    = (int)dev_kr_matrix[ROW]
//   index = (int)hist[plane * freqnumber + ROW]
//   c   [plane, ROW, COL] = sum_i a   [plane, ROW, i] * b[kr, index, i, COL]
//   c_im[plane, ROW, COL] = sum_i a_im[plane, ROW, i] * b[kr, index, i, COL]
// with i running over legendresamples.
//
// Index layout used by the code:
//   a, a_im, c, c_im : plane stride legendresamples * freqnumber, row stride legendresamples
//   b                : strides legendresamples^3 (kr), legendresamples^2 (index), legendresamples (i)
//
// The plane index is not bounds-checked (the kernel receives no plane count).
//
// Fix: the stores to c / c_im are now inside the ROW/COL guard. Previously tail
// threads of a rounded-up grid stored zeros outside their row / the array.
// The hist lookup is loop-invariant and is now read once per thread.
__global__ void ResidualcomplexmatrixMultiplicationKernel(float* c, float* c_im, float* a, float* b, float* a_im, float* hist,int legendresamples,int freqnumber, float* dev_kr_matrix)
{
    const int COL = blockIdx.y * blockDim.y + threadIdx.y;
    const int ROW = blockIdx.x * blockDim.x + threadIdx.x;
    const int PLANE_OFFSET = blockIdx.z * blockDim.z + threadIdx.z;

    // Threads outside the output matrix must neither read nor write.
    if (ROW >= freqnumber || COL >= legendresamples)
        return;

    const int kr    = (int)dev_kr_matrix[ROW];
    const int index = (int)hist[PLANE_OFFSET * freqnumber + ROW];

    const int rowBase = PLANE_OFFSET * legendresamples * freqnumber + ROW * legendresamples;
    const int bBase   = kr * legendresamples * legendresamples * legendresamples + index * legendresamples * legendresamples;

    float tmpSum = 0.0f;
    float tmpSum_im = 0.0f;

    for (int i = 0; i < legendresamples; i++)
    {
        const float weight = b[bBase + i * legendresamples + COL];
        tmpSum    += a[rowBase + i] * weight;
        tmpSum_im += a_im[rowBase + i] * weight;
    }

    // Complex coefficients of the legendre kernel
    c[rowBase + COL]    = tmpSum;
    c_im[rowBase + COL] = tmpSum_im;
}

// Projects each (plane, frequency) row of a complex input onto one real kernel row
// chosen by kr and by the index stored in hist, producing one complex value per row.
//
// Thread mapping: x -> frequency row (ROW), z -> audio block (PLANE_OFFSET).
// The y dimension is not used.
//
// For every thread with ROW < noffrequency:
//   kr    = (int)dev_kr_matrix[ROW]
//   index = (int)hist[plane * noffrequency + ROW]
//   c   [plane, ROW] = sum_i a   [plane, ROW, i] * b[kr, index, i]
//   c_im[plane, ROW] = sum_i a_im[plane, ROW, i] * b[kr, index, i]
// with i running over legendresamples.
//
// The plane index is not bounds-checked (the kernel receives no plane count).
//
// Fix: the stores to c / c_im are now inside the ROW guard. Previously threads
// with ROW >= noffrequency stored zeros into the next plane's entries (or past
// the end of the array for the last plane).
__global__ void MagcomplexmatrixMultiplicationKern(float* c, float* c_im, float* a, float* b, float* a_im, float* hist, int noffrequency,int legendresamples, float* dev_kr_matrix)
{
    const int ROW = blockIdx.x * blockDim.x + threadIdx.x;
    const int PLANE_OFFSET = blockIdx.z * blockDim.z + threadIdx.z;

    // Tail threads of a rounded-up grid have nothing to do.
    if (ROW >= noffrequency)
        return;

    const int outIdx = PLANE_OFFSET * noffrequency + ROW;

    const int kr    = (int)dev_kr_matrix[ROW];
    const int index = (int)hist[outIdx];

    const int aBase = PLANE_OFFSET * noffrequency * legendresamples + ROW * legendresamples;
    const int bBase = kr * legendresamples * legendresamples + index * legendresamples;

    float tmpSum = 0.0f;
    float tmpSum_im = 0.0f;

    for (int i = 0; i < legendresamples; i++)
    {
        const float weight = b[bBase + i];
        tmpSum    += a[aBase + i] * weight;
        tmpSum_im += a_im[aBase + i] * weight;
    }

    // Complex coefficients of the legendre kernel
    c[outIdx]    = tmpSum;
    c_im[outIdx] = tmpSum_im;
}

// Copies the complex 3-D array (a, a_im) into (c, c_im), element by element.
// Despite the name, nothing is exchanged: a / a_im are only read.
//
// Thread mapping: x -> frequency index, y -> Legendre sample, z -> block (plane).
// Element index: plane * noffrequency * legendresamples + freq * legendresamples + sample.
// Frequency and sample indices are bounds-checked; the plane index is not.
__global__ void SwapMatricesKernel(float* c, float* c_im, float* a, float* a_im, int noffrequency,int legendresamples)
{
   const int sample = blockIdx.y * blockDim.y + threadIdx.y;
   const int freq   = blockIdx.x * blockDim.x + threadIdx.x;
   const int plane  = blockIdx.z * blockDim.z + threadIdx.z;

   if (freq >= noffrequency || sample >= legendresamples)
       return;

   const int idx = plane * noffrequency * legendresamples + freq * legendresamples + sample;

   c[idx]    = a[idx];
   c_im[idx] = a_im[idx];
}

// Reconstructs, per source, a complex value for every (plane, frequency) entry by
// summing over OMP iterations the complex product
//   (dev_magnitudes + j*dev_magnitudes_im)[iter, plane, ROW]
//     * (dev_multiplication_kernel + j*dev_multiplication_kernel_imag)[kr, doa, number]
// and dividing the sum by legendresamples, where
//   kr     = (int)dev_krmatrix[ROW]
//   doa    = (int)doaestimate[SID]
//   number = (int)dev_general_histogram[iter, plane, ROW]
// Iterations whose histogram entry is >= legendresamples are skipped.
//
// Thread mapping: x -> frequency index (ROW), y -> source id (SID), z -> block (plane).
// Output index: plane * NofFrequencyBins * nofsources + ROW * nofsources + SID.
//
// dev_prevsep, dev_prevsep_im, prev_nofsources and dev_pressurelevel are accepted
// but not used: the original code carried a disabled "+ 0.7 * dev_prevsep" blend and
// a disabled pressure-level accumulation, and both branches on prev_nofsources
// stored the same value, so the branch has been collapsed.
// The plane index is not bounds-checked (BlockSize is only used as a stride here).
//
// Fix: the stores are now guarded by ROW < NofFrequencyBins and SID < nofsources.
// Previously every launched thread stored, so tail threads of a rounded-up grid
// wrote zeros into neighbouring entries or past the end of the result arrays.
__global__ void CalculateOMPSources(float* dev_result, float* dev_result_im, float* dev_prevsep, float* dev_prevsep_im, float* dev_multiplication_kernel, float* dev_multiplication_kernel_imag, float* dev_magnitudes, float* dev_magnitudes_im, float* dev_general_histogram, int BlockSize, int legendresamples, int NofFrequencyBins, int iterationcount, float* doaestimate, int nofsources, int prev_nofsources,float* dev_pressurelevel,float* dev_krmatrix)
{
    // For Number of sources
    const int SID = (blockIdx.y * blockDim.y + threadIdx.y);
    // For Frequency indices
    const int ROW = (blockIdx.x * blockDim.x + threadIdx.x);
    // For Plane Offset
    const int PLANE_OFFSET = (blockIdx.z * blockDim.z + threadIdx.z);

    // Threads outside the result matrix must neither read nor write.
    if (ROW >= NofFrequencyBins || SID >= nofsources)
        return;

    const int kr  = (int)dev_krmatrix[ROW];
    const int doa = (int)doaestimate[SID];

    // Start of the kernel row selected by (kr, doa); loop-invariant.
    const int kernelBase = kr * legendresamples * legendresamples + doa * legendresamples;

    float tmp_sum = 0.0f;
    float tmp_sum_im = 0.0f;

    for (int ijk = 0; ijk < iterationcount; ijk++)
    {
        const int id = ijk * BlockSize * NofFrequencyBins + PLANE_OFFSET * NofFrequencyBins + ROW;
        const int number = (int)dev_general_histogram[id];
        if (number < legendresamples)
        {
            const float magRe = dev_magnitudes[id];
            const float magIm = dev_magnitudes_im[id];
            const float kerRe = dev_multiplication_kernel[kernelBase + number];
            const float kerIm = dev_multiplication_kernel_imag[kernelBase + number];

            tmp_sum    += magRe * kerRe - magIm * kerIm;
            tmp_sum_im += magIm * kerRe + magRe * kerIm;
        }
    }

    const int outIdx = PLANE_OFFSET * NofFrequencyBins * nofsources + ROW * nofsources + SID;
    dev_result[outIdx]    = tmp_sum / (float)legendresamples;
    dev_result_im[outIdx] = tmp_sum_im / (float)legendresamples;
}

// For each iteration, plane and row, looks up an index in dev_general_histogram and
// stores the dot product of dev_kernels_vonmises (length N) with row `index` of
// dev_kernels (row stride N) into dev_multiplication_kernel at the same position.
//
// Thread mapping: y -> row (bounds-checked against N), z -> plane (not bounds-checked).
// The x dimension is not used. Each thread walks all `iteration` slices itself.
// Position within the histogram / output: iter * (PlaneDim * N) + plane * N + row.
__global__ void MultiplcationMatrixCalculation(float* dev_multiplication_kernel, float* dev_general_histogram, float* dev_kernels_vonmises, float* dev_kernels, int PlaneDim, int N, int iteration)
{
    const int row   = blockIdx.y * blockDim.y + threadIdx.y;
    const int plane = blockIdx.z * blockDim.z + threadIdx.z;

    if (row >= N)
        return;

    for (int iter = 0; iter < iteration; ++iter)
    {
        const int slot = iter * (PlaneDim * N) + plane * N + row;

        // The histogram stores the index as a float; it is truncated to int here.
        const int atom = dev_general_histogram[slot];
        const int kernelBase = atom * N;

        float dot = 0.0;
        for (int k = 0; k < N; ++k)
        {
            dot += dev_kernels_vonmises[k] * dev_kernels[kernelBase + k];
        }

        dev_multiplication_kernel[slot] = dot;
    }
}
    
// Copies the complex 2-D array (a, a_im) into (c, c_im), element by element.
// Despite the name, nothing is exchanged: a / a_im are only read.
//
// Thread mapping: x -> element within a plane (bounds-checked against N),
// z -> plane (not bounds-checked). Element index: plane * N + element.
__global__ void SwapMatricesKernel2D(float* c, float* c_im, float* a, float* a_im, int N)
{
    const int elem  = blockIdx.x * blockDim.x + threadIdx.x;
    const int plane = blockIdx.z * blockDim.z + threadIdx.z;

    if (elem >= N)
        return;

    const int idx = plane * N + elem;

    c[idx]    = a[idx];
    c_im[idx] = a_im[idx];
}


// For each (plane, frequency) row of the complex array (c_next, c_next_im), finds
// the position of the entry with the largest magnitude sqrt(re^2 + im^2) and writes
// that position (as a float) to a[plane * freqanalysis + row].
// Ties keep the earliest position because only a strictly larger value replaces the maximum.
//
// Thread mapping: x -> frequency row (bounds-checked against freqanalysis),
// z -> plane (not bounds-checked). Each thread scans its whole row of
// legendresamples entries; entry 0 is always read.
__global__ void PlaneMaxValuedKernel(float* c_next, float* c_next_im, float* a, int freqanalysis, int legendresamples)
{
    const int row   = blockIdx.x * blockDim.x + threadIdx.x;
    const int plane = blockIdx.z * blockDim.z + threadIdx.z;

    if (row >= freqanalysis)
        return;

    // First element of this thread's row.
    const int base = plane * freqanalysis * legendresamples + row * legendresamples;

    int best = 0;
    float bestMag = sqrt(c_next[base] * c_next[base] + c_next_im[base] * c_next_im[base]);

    for (int k = 1; k < legendresamples; ++k)
    {
        const float re = c_next[base + k];
        const float im = c_next_im[base + k];
        const float mag = sqrt(re * re + im * im);
        if (mag > bestMag)
        {
            bestMag = mag;
            best = k;
        }
    }

    a[plane * freqanalysis + row] = best;
}



// For each (plane, frequency) row, computes the ratio of the Euclidean norms of
// two complex rows and stores it in RENThist[plane * noffreqbins + row]:
//   ratio = sqrt(sum_i |a + j*a_im|^2) / sqrt(sum_i |c + j*c_im|^2)
// with i running over legendresamples. No check is made for a zero denominator.
//
// Thread mapping: x -> frequency row (bounds-checked against noffreqbins),
// z -> plane (not bounds-checked). hist is accepted but not used.
__global__ void RENTHistKernel(float* c, float* c_im, float* a, float* a_im, float* hist, float* RENThist, int legendresamples,int noffreqbins)
{
    const int row   = blockIdx.x * blockDim.x + threadIdx.x;
    const int plane = blockIdx.z * blockDim.z + threadIdx.z;

    if (row >= noffreqbins)
        return;

    // First element of this thread's row in all four input arrays.
    const int base = plane * legendresamples * noffreqbins + row * legendresamples;

    float srfEnergy = 0;
    float resEnergy = 0;

    for (int k = 0; k < legendresamples; ++k)
    {
        const float aRe = a[base + k];
        const float aIm = a_im[base + k];
        const float cRe = c[base + k];
        const float cIm = c_im[base + k];

        resEnergy += aRe * aRe + aIm * aIm;
        srfEnergy += cRe * cRe + cIm * cIm;
    }

    RENThist[plane * noffreqbins + row] = sqrt(resEnergy) / sqrt(srfEnergy);
}

// Host wrapper: launches RENTHistKernel to compute the RENT histogram (per-row norm
// ratio of (a, a_im) to (c, c_im)) and waits for it to finish.
//
// Launch layout: x covers noffreqbins (at most 16 threads per block),
// y is unused (1), z covers BlockSize planes.
//
// Returns the launch error if the launch itself failed, otherwise the status of
// cudaDeviceSynchronize().
//
// Fixes relative to the previous version:
//  - x used truncating division, so the last noffreqbins % 16 bins were never
//    processed; it now rounds up (the kernel bounds-checks its row).
//  - z used BlockSize / 2, dropping the last plane for odd BlockSize; z now uses
//    2 threads only when BlockSize is even, so exactly BlockSize planes run.
//  - a launch error was overwritten by the synchronize status and could be
//    reported as success; it is now returned immediately.
//  - the timing value is a 64-bit count and was printed with %d.
//  - error messages named addKernel instead of the kernel actually launched.
cudaError_t RENTHistogramKernel(float* c, float* c_im, float* a, float* a_im, float* hist, float* RENThist, unsigned int legendresamples, unsigned int BlockSize, unsigned int noffreqbins)
{
    const unsigned int kTile = 16;   // maximum threads per block along x

    dim3 threadsPerBlock(noffreqbins, 1, BlockSize);
    dim3 blocksPerGrid(1, 1, 1);

    if (noffreqbins > kTile)
    {
        threadsPerBlock.x = kTile;
        blocksPerGrid.x = (noffreqbins + kTile - 1) / kTile;
    }

    if (BlockSize > 1)
    {
        // The kernel cannot bounds-check the plane index, so z must cover
        // exactly BlockSize planes.
        threadsPerBlock.z = (BlockSize % 2 == 0) ? 2 : 1;
        blocksPerGrid.z = BlockSize / threadsPerBlock.z;
    }

    const std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();

    RENTHistKernel<<<blocksPerGrid, threadsPerBlock>>>(c, c_im, a, a_im, hist, RENThist, legendresamples, noffreqbins);

    // Launch-configuration errors are reported (and cleared) here.
    cudaError_t cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "RENTHistKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        return cudaStatus;
    }

    // Errors raised while the kernel executes are reported by the synchronize call.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching RENTHistKernel!\n", static_cast<int>(cudaStatus));
    }

    const std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
    if (messageprint)
        printf("RENTHistogram(us) = %lld \n", static_cast<long long>(std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count()));

    return cudaStatus;
}

// Host wrapper: launches PlanematrixMultiplicationKernel (complex input times real,
// kr-selected kernel matrix) on device buffers and waits for it to finish.
//
// Caller note carried over from the original comment - the size arguments are
// typically: inp->FrequencybinAnalysis, inp->NofAudioBlocks,
// inp->NofAudioSamplesInBlock, inp->FrequencyStartAnalysis,
// inp->NofLegendreSamplingPoints.
//
// Launch layout: x covers freqbinanalysis, y covers legendresamples (at most 16
// threads per block each), z covers BlockSize planes.
//
// Returns the launch error if the launch itself failed, otherwise the status of
// cudaDeviceSynchronize().
//
// Fixes relative to the previous version:
//  - for 2 <= legendresamples <= 16 the y dimension stayed at 1 thread, so only
//    output column 0 was computed; y now covers every column.
//  - for odd BlockSize > 1 the z dimension was rounded up to an even count and
//    the kernel (which cannot bounds-check planes) accessed a non-existent plane;
//    z now uses 2 threads only when BlockSize is even.
//  - a launch error was overwritten by the synchronize status and could be
//    reported as success; it is now returned immediately.
//  - the timing value is a 64-bit count and was printed with %d.
//  - error messages named addKernel instead of the kernel actually launched.
cudaError_t Device_DmultiplyWithCuda(float* dev_c, float* dev_c_im, float* dev_a, float* dev_b, float* dev_a_im, unsigned int freqbinanalysis, unsigned int BlockSize,unsigned int tfheight, unsigned int freqstart, unsigned int legendresamples, float* dev_kr_matrix)
{
    const unsigned int kTile = 16;   // maximum threads per block along x and y

    dim3 threadsPerBlock(freqbinanalysis, 1, BlockSize);
    dim3 blocksPerGrid(1, 1, 1);

    if (freqbinanalysis > kTile)
    {
        threadsPerBlock.x = kTile;
        blocksPerGrid.x = (freqbinanalysis + kTile - 1) / kTile;
    }

    if (legendresamples > kTile)
    {
        threadsPerBlock.y = kTile;
        blocksPerGrid.y = (legendresamples + kTile - 1) / kTile;
    }
    else if (legendresamples > 1)
    {
        // Small column counts fit in a single block along y.
        threadsPerBlock.y = legendresamples;
    }

    if (BlockSize > 1)
    {
        // z must cover exactly BlockSize planes (no plane guard in the kernel).
        threadsPerBlock.z = (BlockSize % 2 == 0) ? 2 : 1;
        blocksPerGrid.z = BlockSize / threadsPerBlock.z;
    }

    const std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();

    PlanematrixMultiplicationKernel<<<blocksPerGrid, threadsPerBlock>>>(dev_c, dev_c_im, dev_a, dev_b, dev_a_im, freqbinanalysis, tfheight, freqstart, legendresamples, dev_kr_matrix);

    // Launch-configuration errors are reported (and cleared) here.
    cudaError_t cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "PlanematrixMultiplicationKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        return cudaStatus;
    }

    // Errors raised while the kernel executes are reported by the synchronize call.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching PlanematrixMultiplicationKernel!\n", static_cast<int>(cudaStatus));
    }

    const std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
    if (messageprint)
        printf("Multiplication(us) = %lld \n", static_cast<long long>(std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count()));

    return cudaStatus;
}

// Host wrapper: launches PlanecomplexmatrixMultiplicationKernel (complex input times
// complex, kr-selected kernel matrix) on device buffers and waits for it to finish.
//
// Caller note carried over from the original comment - the size arguments are
// typically: inp->NofChannels, inp->NofAudioSamplesInBlock,
// inp->FrequencybinAnalysis, inp->NofAudioBlocks, inp->FrequencyStartAnalysis,
// inp->NofLegendreSamplingPoints.
//
// Launch layout: x covers frequencybinanalysis, y covers legendresamples (at most
// 16 threads per block each), z covers BlockSize planes.
//
// Returns the launch error if the launch itself failed, otherwise the status of
// cudaDeviceSynchronize().
//
// Fixes relative to the previous version:
//  - for odd BlockSize > 1 the z dimension was rounded up to an even count and
//    the kernel (which cannot bounds-check planes) accessed a non-existent plane;
//    z now uses 2 threads only when BlockSize is even.
//  - a launch error was overwritten by the synchronize status and could be
//    reported as success; it is now returned immediately.
//  - the timing value is a 64-bit count and was printed with %d.
//  - error messages named addKernel instead of the kernel actually launched.
cudaError_t Device_DComplexmultiplyWithCuda(float* dev_c, float* dev_c_im, float* dev_a, float* dev_b, float* dev_a_im, float* dev_b_im, unsigned int micsize, unsigned int tfheight, unsigned int frequencybinanalysis, unsigned int BlockSize, unsigned int freqstart, unsigned int legendresamples, float* dev_kr_matrix)
{
    const unsigned int kTile = 16;   // maximum threads per block along x and y

    dim3 threadsPerBlock(frequencybinanalysis, legendresamples, BlockSize);
    dim3 blocksPerGrid(1, 1, 1);

    if (frequencybinanalysis > kTile)
    {
        threadsPerBlock.x = kTile;
        blocksPerGrid.x = (frequencybinanalysis + kTile - 1) / kTile;
    }

    if (legendresamples > kTile)
    {
        threadsPerBlock.y = kTile;
        blocksPerGrid.y = (legendresamples + kTile - 1) / kTile;
    }

    if (BlockSize > 1)
    {
        // z must cover exactly BlockSize planes (no plane guard in the kernel).
        threadsPerBlock.z = (BlockSize % 2 == 0) ? 2 : 1;
        blocksPerGrid.z = BlockSize / threadsPerBlock.z;
    }

    const std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();

    PlanecomplexmatrixMultiplicationKernel<<<blocksPerGrid, threadsPerBlock>>>(dev_c, dev_c_im, dev_a, dev_b, dev_a_im, dev_b_im, micsize, tfheight, frequencybinanalysis, freqstart, legendresamples, dev_kr_matrix);

    // Launch-configuration errors are reported (and cleared) here.
    cudaError_t cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "PlanecomplexmatrixMultiplicationKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        return cudaStatus;
    }

    // Errors raised while the kernel executes are reported by the synchronize call.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching PlanecomplexmatrixMultiplicationKernel!\n", static_cast<int>(cudaStatus));
    }

    const std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
    if (messageprint)
        printf("ComplexMultiplication(us) = %lld \n", static_cast<long long>(std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count()));

    return cudaStatus;
}

// Host wrapper: launches ResidualcomplexmatrixMultiplicationKernel (complex input
// times a real matrix selected by kr and by the index in hist) and waits for it
// to finish.
//
// Launch layout: x covers freqnumber, y covers legendresamples (at most 16 threads
// per block each), z covers BlockSize planes.
//
// Returns the launch error if the launch itself failed, otherwise the status of
// cudaDeviceSynchronize().
//
// Fixes relative to the previous version:
//  - for odd BlockSize > 1 the z dimension was rounded up to an even count and
//    the kernel (which cannot bounds-check planes) accessed a non-existent plane;
//    z now uses 2 threads only when BlockSize is even.
//  - a launch error was overwritten by the synchronize status and could be
//    reported as success; it is now returned immediately.
//  - the timing value is a 64-bit count and was printed with %d.
//  - error messages named addKernel instead of the kernel actually launched.
cudaError_t RescomplexmatrixMultiplicationKernel(float* dev_c, float* dev_c_im, float* dev_a, float* dev_b, float* dev_a_im, float* hist, unsigned int freqnumber, unsigned int legendresamples, unsigned int BlockSize, float* dev_kr_matrix)
{
    const unsigned int kTile = 16;   // maximum threads per block along x and y

    dim3 threadsPerBlock(freqnumber, legendresamples, BlockSize);
    dim3 blocksPerGrid(1, 1, 1);

    if (freqnumber > kTile)
    {
        threadsPerBlock.x = kTile;
        blocksPerGrid.x = (freqnumber + kTile - 1) / kTile;
    }

    if (legendresamples > kTile)
    {
        threadsPerBlock.y = kTile;
        blocksPerGrid.y = (legendresamples + kTile - 1) / kTile;
    }

    if (BlockSize > 1)
    {
        // z must cover exactly BlockSize planes (no plane guard in the kernel).
        threadsPerBlock.z = (BlockSize % 2 == 0) ? 2 : 1;
        blocksPerGrid.z = BlockSize / threadsPerBlock.z;
    }

    const std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();

    ResidualcomplexmatrixMultiplicationKernel<<<blocksPerGrid, threadsPerBlock>>>(dev_c, dev_c_im, dev_a, dev_b, dev_a_im, hist, legendresamples, freqnumber, dev_kr_matrix);

    // Launch-configuration errors are reported (and cleared) here.
    cudaError_t cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "ResidualcomplexmatrixMultiplicationKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        return cudaStatus;
    }

    // Errors raised while the kernel executes are reported by the synchronize call.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching ResidualcomplexmatrixMultiplicationKernel!\n", static_cast<int>(cudaStatus));
    }

    const std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
    if (messageprint)
        printf("ResidualVector(us) = %lld \n", static_cast<long long>(std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count()));

    return cudaStatus;
}


// Host wrapper: launches MagcomplexmatrixMultiplicationKern (one complex projection
// value per plane and frequency) and waits for it to finish.
//
// Launch layout: x covers noffrequency (at most 16 threads per block),
// y is unused (1), z covers BlockSize planes.
//
// Returns the launch error if the launch itself failed, otherwise the status of
// cudaDeviceSynchronize().
//
// Fixes relative to the previous version:
//  - for odd BlockSize > 1 the z dimension was rounded up to an even count and
//    the kernel (which cannot bounds-check planes) accessed a non-existent plane;
//    z now uses 2 threads only when BlockSize is even.
//  - a launch error was overwritten by the synchronize status and could be
//    reported as success; it is now returned immediately.
//  - the timing value is a 64-bit count and was printed with %d.
//  - error messages named addKernel instead of the kernel actually launched.
cudaError_t MagcomplexmatrixMultiplicationKernel(float* dev_c, float* dev_c_im, float* dev_a, float* dev_b, float* dev_a_im, float* hist, unsigned int noffrequency, unsigned int legendresamples, unsigned int BlockSize, float* dev_kr_matrix)
{
    const unsigned int kTile = 16;   // maximum threads per block along x

    // The kernel does not use the y dimension, so it is fixed at 1.
    dim3 threadsPerBlock(noffrequency, 1, BlockSize);
    dim3 blocksPerGrid(1, 1, 1);

    if (noffrequency > kTile)
    {
        threadsPerBlock.x = kTile;
        blocksPerGrid.x = (noffrequency + kTile - 1) / kTile;
    }

    if (BlockSize > 1)
    {
        // z must cover exactly BlockSize planes (no plane guard in the kernel).
        threadsPerBlock.z = (BlockSize % 2 == 0) ? 2 : 1;
        blocksPerGrid.z = BlockSize / threadsPerBlock.z;
    }

    const std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();

    MagcomplexmatrixMultiplicationKern<<<blocksPerGrid, threadsPerBlock>>>(dev_c, dev_c_im, dev_a, dev_b, dev_a_im, hist, noffrequency, legendresamples, dev_kr_matrix);

    // Launch-configuration errors are reported (and cleared) here.
    cudaError_t cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "MagcomplexmatrixMultiplicationKern launch failed: %s\n", cudaGetErrorString(cudaStatus));
        return cudaStatus;
    }

    // Errors raised while the kernel executes are reported by the synchronize call.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching MagcomplexmatrixMultiplicationKern!\n", static_cast<int>(cudaStatus));
    }

    const std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
    if (messageprint)
        printf("MagnitudeCalc(us) = %lld \n", static_cast<long long>(std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count()));

    return cudaStatus;
}
// Host wrapper: launches SwapMatricesKernel, which copies the complex 3-D array
// (a, a_im) into (c, c_im), and waits for it to finish. Despite the name the
// source arrays are not modified.
//
// Launch layout: x covers noffrequency, y covers legendresamples (at most 16
// threads per block each), z covers BlockSize planes.
//
// Returns the launch error if the launch itself failed, otherwise the status of
// cudaDeviceSynchronize().
//
// Fixes relative to the previous version:
//  - for odd BlockSize > 1 the z dimension was rounded up to an even count and
//    the kernel (which cannot bounds-check planes) copied a non-existent plane;
//    z now uses 2 threads only when BlockSize is even.
//  - a launch error was overwritten by the synchronize status and could be
//    reported as success; it is now returned immediately.
//  - the timing value is a 64-bit count and was printed with %d.
//  - error messages named addKernel instead of the kernel actually launched.
cudaError_t Swapmatrices(float* c, float* c_im, float* a, float* a_im, unsigned int noffrequency, unsigned int BlockSize, unsigned int legendresamples)
{
    const unsigned int kTile = 16;   // maximum threads per block along x and y

    dim3 threadsPerBlock(noffrequency, legendresamples, 1);
    dim3 blocksPerGrid(1, 1, 1);

    if (noffrequency > kTile)
    {
        threadsPerBlock.x = kTile;
        blocksPerGrid.x = (noffrequency + kTile - 1) / kTile;
    }

    if (legendresamples > kTile)
    {
        threadsPerBlock.y = kTile;
        blocksPerGrid.y = (legendresamples + kTile - 1) / kTile;
    }

    if (BlockSize > 1)
    {
        // z must cover exactly BlockSize planes (no plane guard in the kernel).
        threadsPerBlock.z = (BlockSize % 2 == 0) ? 2 : 1;
        blocksPerGrid.z = BlockSize / threadsPerBlock.z;
    }

    const std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();

    SwapMatricesKernel<<<blocksPerGrid, threadsPerBlock>>>(c, c_im, a, a_im, noffrequency, legendresamples);

    // Launch-configuration errors are reported (and cleared) here.
    cudaError_t cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "SwapMatricesKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        return cudaStatus;
    }

    // Errors raised while the kernel executes are reported by the synchronize call.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching SwapMatricesKernel!\n", static_cast<int>(cudaStatus));
    }

    const std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
    if (messageprint)
        printf("MatrixSwap(us) = %lld \n", static_cast<long long>(std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count()));

    return cudaStatus;
}
// Host wrapper: launches CalculateOMPSources, which reconstructs one complex value
// per (plane, frequency, source) from the OMP magnitudes and the kernel entries
// selected by each source's DOA estimate, and waits for it to finish.
//
// Example call carried over from the original comment (its argument list
// predates the current signature):
//   OMPSeparatedSources(dev_result, dev_result_im, dev_multiplication_kernel,
//       dev_magnitudes_final, dev_magnitudes_im_final, dev_general_histogram,
//       inp->NofAudioBlocks, inp->NofLegendreSamplingPoints,
//       inp->FrequencybinAnalysis, iterationnumber, source_doas[sid].hpnumber);
//
// Launch layout: x covers NofFrequencyBins (at most 16 threads per block),
// y covers nofsources (one source per block), z covers BlockSize planes.
// WindowLength is accepted for interface compatibility but no longer used.
//
// Returns the launch error if the launch itself failed, otherwise the status of
// cudaDeviceSynchronize().
//
// Fixes relative to the previous version:
//  - the x dimension was chosen by testing WindowLength although the kernel
//    indexes by NofFrequencyBins, so the launch could cover too few or too many
//    frequency bins when the two differ; x is now sized from NofFrequencyBins.
//  - for odd BlockSize > 1 the z dimension was rounded up to an even count and
//    the kernel (which cannot bounds-check planes) accessed a non-existent plane;
//    z now uses 2 threads only when BlockSize is even.
//  - a launch error was overwritten by the synchronize status and could be
//    reported as success; it is now returned immediately.
//  - the timing value is a 64-bit count and was printed with %d, under a label
//    copied from the magnitude wrapper.
//  - error messages named addKernel instead of the kernel actually launched.
cudaError_t OMPSeparatedSources(float* dev_result, float* dev_result_im, float* dev_prev_result, float* dev_prev_result_im, float* dev_multiplication_kernel, float* dev_multiplication_kernel_imag, float* dev_magnitudes, float* dev_magnitudes_im, float* dev_general_histogram, unsigned int BlockSize, unsigned int legendresamples, unsigned int NofFrequencyBins, unsigned int WindowLength, unsigned int nofiterations, float* sourcedoa, unsigned int nofsources, unsigned int prev_nofsources, float* dev_pressurelevel, float* dev_krmatrix)
{
    const unsigned int kTile = 16;   // maximum threads per block along x

    (void)WindowLength;   // kept in the signature for existing callers

    dim3 threadsPerBlock(NofFrequencyBins, nofsources, 1);
    dim3 blocksPerGrid(1, 1, 1);

    if (NofFrequencyBins > kTile)
    {
        threadsPerBlock.x = kTile;
        blocksPerGrid.x = (NofFrequencyBins + kTile - 1) / kTile;
    }

    if (nofsources > 1)
    {
        // One source per block along y.
        threadsPerBlock.y = 1;
        blocksPerGrid.y = nofsources;
    }

    if (BlockSize > 1)
    {
        // z must cover exactly BlockSize planes (no plane guard in the kernel).
        threadsPerBlock.z = (BlockSize % 2 == 0) ? 2 : 1;
        blocksPerGrid.z = BlockSize / threadsPerBlock.z;
    }

    const std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();

    CalculateOMPSources<<<blocksPerGrid, threadsPerBlock>>>(dev_result, dev_result_im, dev_prev_result, dev_prev_result_im, dev_multiplication_kernel, dev_multiplication_kernel_imag, dev_magnitudes, dev_magnitudes_im, dev_general_histogram, BlockSize, legendresamples, NofFrequencyBins, nofiterations, sourcedoa, nofsources, prev_nofsources, dev_pressurelevel, dev_krmatrix);

    // Launch-configuration errors are reported (and cleared) here.
    cudaError_t cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "CalculateOMPSources launch failed: %s\n", cudaGetErrorString(cudaStatus));
        return cudaStatus;
    }

    // Errors raised while the kernel executes are reported by the synchronize call.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching CalculateOMPSources!\n", static_cast<int>(cudaStatus));
    }

    const std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
    if (messageprint)
        printf("OMPSeparatedSources(us) = %lld \n", static_cast<long long>(std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count()));

    return cudaStatus;
}

// Host wrapper: launches SwapMatricesKernel2D, which copies the complex 2-D array
// (a, a_im) into (c, c_im), and waits for it to finish. Despite the name the
// source arrays are not modified.
//
// Launch layout: x covers `size` elements per plane (at most 16 threads per
// block), y is unused (1), z covers BlockSize planes.
//
// Returns the launch error if the launch itself failed, otherwise the status of
// cudaDeviceSynchronize().
//
// Fixes relative to the previous version:
//  - for odd BlockSize > 1 the z dimension was rounded up to an even count and
//    the kernel (which cannot bounds-check planes) copied a non-existent plane;
//    z now uses 2 threads only when BlockSize is even.
//  - a launch error was overwritten by the synchronize status and could be
//    reported as success; it is now returned immediately.
//  - the timing value is a 64-bit count and was printed with %d.
//  - error messages named addKernel instead of the kernel actually launched.
cudaError_t Swapmatrices2D(float* c, float* c_im, float* a, float* a_im, unsigned int size, unsigned int BlockSize)
{
    const unsigned int kTile = 16;   // maximum threads per block along x

    dim3 threadsPerBlock(size, 1, 1);
    dim3 blocksPerGrid(1, 1, 1);

    // Dimension x is used for the frequency
    if (size > kTile)
    {
        threadsPerBlock.x = kTile;
        blocksPerGrid.x = (size + kTile - 1) / kTile;
    }

    // Dimension z is used for the block size
    if (BlockSize > 1)
    {
        // z must cover exactly BlockSize planes (no plane guard in the kernel).
        threadsPerBlock.z = (BlockSize % 2 == 0) ? 2 : 1;
        blocksPerGrid.z = BlockSize / threadsPerBlock.z;
    }

    const std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();

    SwapMatricesKernel2D<<<blocksPerGrid, threadsPerBlock>>>(c, c_im, a, a_im, size);

    // Launch-configuration errors are reported (and cleared) here.
    cudaError_t cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "SwapMatricesKernel2D launch failed: %s\n", cudaGetErrorString(cudaStatus));
        return cudaStatus;
    }

    // Errors raised while the kernel executes are reported by the synchronize call.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching SwapMatricesKernel2D!\n", static_cast<int>(cudaStatus));
    }

    const std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
    if (messageprint)
        printf("MatrixSwap(us) = %lld \n", static_cast<long long>(std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count()));

    return cudaStatus;
}

// Host wrapper: launches PlaneMaxValuedKernel, which writes to dev_a, for every
// plane and frequency, the position of the largest-magnitude entry of the complex
// row (dev_next, dev_next_im), and waits for it to finish.
//
// Launch layout: x covers freqanalysis (at most 16 threads per block),
// y is unused (1), z covers BlockSize planes.
//
// Returns the launch error if the launch itself failed, otherwise the status of
// cudaDeviceSynchronize().
//
// Fixes relative to the previous version:
//  - x used truncating division, so the last freqanalysis % 16 bins were never
//    processed; it now rounds up (the kernel bounds-checks its row).
//  - z used BlockSize / 2, dropping the last plane for odd BlockSize; z now uses
//    2 threads only when BlockSize is even, so exactly BlockSize planes run.
//  - a launch error was overwritten by the synchronize status and could be
//    reported as success; it is now returned immediately.
//  - the timing value is a 64-bit count and was printed with %d.
//  - error messages named addKernel instead of the kernel actually launched.
cudaError_t Device_GetHistogram(float* dev_next, float* dev_next_im, float* dev_a,  unsigned int freqanalysis, unsigned int legendresamples, unsigned int BlockSize)
{
    const unsigned int kTile = 16;   // maximum threads per block along x

    // The kernel scans each row itself, so the y dimension is fixed at 1.
    dim3 threadsPerBlock(freqanalysis, 1, 1);
    dim3 blocksPerGrid(1, 1, 1);

    if (freqanalysis > kTile)
    {
        threadsPerBlock.x = kTile;
        blocksPerGrid.x = (freqanalysis + kTile - 1) / kTile;
    }

    if (BlockSize > 1)
    {
        // z must cover exactly BlockSize planes (no plane guard in the kernel).
        threadsPerBlock.z = (BlockSize % 2 == 0) ? 2 : 1;
        blocksPerGrid.z = BlockSize / threadsPerBlock.z;
    }

    const std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();

    PlaneMaxValuedKernel<<<blocksPerGrid, threadsPerBlock>>>(dev_next, dev_next_im, dev_a, freqanalysis, legendresamples);

    // Launch-configuration errors are reported (and cleared) here.
    cudaError_t cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "PlaneMaxValuedKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        return cudaStatus;
    }

    // Errors raised while the kernel executes are reported by the synchronize call.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching PlaneMaxValuedKernel!\n", static_cast<int>(cudaStatus));
    }

    const std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();

    if (messageprint)
        printf("GetHistogram(us) = %lld \n", static_cast<long long>(std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count()));

    return cudaStatus;
}
