#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

#include "Utils.hpp"

using namespace std;

// Scales a vector to unit Euclidean length using a single thread.
//
// u is taken by value, so the caller's vector is never modified; the
// normalized copy is returned.
//
// If the computed norm is exactly zero (empty vector, all-zero vector, or
// squares that underflow to zero), the copy is returned unchanged instead of
// being filled with NaN/inf by a division by zero.
std::vector<double> normalizationSequential(std::vector<double> u) {
    double sumSquares = 0.0;
    for (const double value : u) {
        sumSquares += value * value;
    }

    const double vectorLength = std::sqrt(sumSquares);
    if (vectorLength == 0.0) {
        // A zero-length vector has no direction; dividing would produce NaNs.
        return u;
    }

    for (double& value : u) {
        value /= vectorLength;
    }

    return u;
}

// Scales a vector to unit Euclidean length using two OpenMP sections.
//
// The vector is split into a first half [0, firstHalfEnd) and a second half
// [firstHalfEnd, size). One section handles each half, first to accumulate
// the sum of squares and then to divide the elements by the norm. For odd
// sizes the first half holds the extra element.
//
// u is taken by value, so the caller's vector is never modified; the
// normalized copy is returned. If the computed norm is exactly zero (which
// includes the empty vector) the copy is returned unchanged instead of being
// filled with NaN/inf by a division by zero.
std::vector<double> normalizationParallelSections(std::vector<double> u) {
    const std::size_t size = u.size();
    // Number of elements in the first half. Written as size - size / 2 so that
    // an empty vector yields 0 without any unsigned underflow.
    const std::size_t firstHalfEnd = size - size / 2;

    double leftHalfSumSquares = 0.0;
    double rightHalfSumSquares = 0.0;
    #pragma omp parallel num_threads(2)
    {
        #pragma omp sections
        {
            #pragma omp section
            {
                // Accumulate locally and publish the result with one write.
                double halfSumSquares = 0.0;
                for (std::size_t i = 0; i < firstHalfEnd; ++i) {
                    halfSumSquares += u[i] * u[i];
                }
                leftHalfSumSquares = halfSumSquares;
            }

            #pragma omp section
            {
                double halfSumSquares = 0.0;
                for (std::size_t i = firstHalfEnd; i < size; ++i) {
                    halfSumSquares += u[i] * u[i];
                }
                rightHalfSumSquares = halfSumSquares;
            }
        }
        // The sections construct above has no nowait clause, so both partial
        // sums are complete before any thread continues past this point.

        const double vectorLength = std::sqrt(leftHalfSumSquares + rightHalfSumSquares);

        // Every thread computes the same vectorLength from the same shared
        // sums, so all threads agree on whether to enter the sections below.
        if (vectorLength != 0.0) {
            #pragma omp sections
            {
                #pragma omp section
                {
                    for (std::size_t i = 0; i < firstHalfEnd; ++i) {
                        u[i] /= vectorLength;
                    }
                }

                #pragma omp section
                {
                    for (std::size_t i = firstHalfEnd; i < size; ++i) {
                        u[i] /= vectorLength;
                    }
                }
            }
        }
    }

    return u;
}

// Scales a vector to unit Euclidean length with OpenMP parallel loops, adding
// every squared element to the shared sum inside a critical section.
//
// The critical section is entered once per element. NOTE(review): this looks
// like a deliberate baseline for comparison with the "Local" variant, which
// enters the critical section once per thread - confirm before optimizing.
//
// u is taken by value, so the caller's vector is never modified; the
// normalized copy is returned. If the computed norm is exactly zero (which
// includes the empty vector) the copy is returned unchanged instead of being
// filled with NaN/inf by a division by zero.
std::vector<double> normalizationParallelForAndCriticalGlobal(std::vector<double> u) {
    // Hoisted so the loop bounds are plain loop-invariant values.
    const std::size_t size = u.size();

    double sumSquares = 0.0;
    #pragma omp parallel for
    for (std::size_t i = 0; i < size; ++i) {
        #pragma omp critical
        sumSquares += u[i] * u[i];
    }

    const double vectorLength = std::sqrt(sumSquares);
    if (vectorLength == 0.0) {
        // A zero-length vector has no direction; dividing would produce NaNs.
        return u;
    }

    #pragma omp parallel for
    for (std::size_t i = 0; i < size; ++i) {
        u[i] /= vectorLength;
    }

    return u;
}

// Scales a vector to unit Euclidean length with OpenMP parallel loops. Each
// thread accumulates its share of the squares in a private variable and adds
// that partial sum to the shared total inside a critical section, so the
// critical section is entered once per thread rather than once per element.
//
// u is taken by value, so the caller's vector is never modified; the
// normalized copy is returned. If the computed norm is exactly zero (which
// includes the empty vector) the copy is returned unchanged instead of being
// filled with NaN/inf by a division by zero.
std::vector<double> normalizationParallelForAndCriticalLocal(std::vector<double> u) {
    // Hoisted so the loop bounds are plain loop-invariant values.
    const std::size_t size = u.size();

    double sumSquares = 0.0;
    #pragma omp parallel
    {
        // Declared inside the parallel region, hence private to each thread.
        double localSumSquares = 0.0;
        #pragma omp for
        for (std::size_t i = 0; i < size; ++i) {
            localSumSquares += u[i] * u[i];
        }

        #pragma omp critical
        sumSquares += localSumSquares;
    }

    const double vectorLength = std::sqrt(sumSquares);
    if (vectorLength == 0.0) {
        // A zero-length vector has no direction; dividing would produce NaNs.
        return u;
    }

    #pragma omp parallel for
    for (std::size_t i = 0; i < size; ++i) {
        u[i] /= vectorLength;
    }

    return u;
}

// Scales a vector to unit Euclidean length with OpenMP parallel loops, using
// a reduction(+) clause to combine the per-thread sums of squares.
//
// u is taken by value, so the caller's vector is never modified; the
// normalized copy is returned. If the computed norm is exactly zero (which
// includes the empty vector) the copy is returned unchanged instead of being
// filled with NaN/inf by a division by zero.
std::vector<double> normalizationParallelForAndReduction(std::vector<double> u) {
    // Hoisted so the loop bounds are plain loop-invariant values.
    const std::size_t size = u.size();

    double sumSquares = 0.0;
    #pragma omp parallel for reduction(+:sumSquares)
    for (std::size_t i = 0; i < size; ++i) {
        sumSquares += u[i] * u[i];
    }

    const double vectorLength = std::sqrt(sumSquares);
    if (vectorLength == 0.0) {
        // A zero-length vector has no direction; dividing would produce NaNs.
        return u;
    }

    #pragma omp parallel for
    for (std::size_t i = 0; i < size; ++i) {
        u[i] /= vectorLength;
    }

    return u;
}

// Scales a vector to unit Euclidean length with OpenMP parallel loops, adding
// every squared element to the shared sum with an atomic update.
//
// The atomic update is performed once per element. NOTE(review): this looks
// like a deliberate baseline for comparison with the "Local" variant, which
// performs one atomic update per thread - confirm before optimizing.
//
// u is taken by value, so the caller's vector is never modified; the
// normalized copy is returned. If the computed norm is exactly zero (which
// includes the empty vector) the copy is returned unchanged instead of being
// filled with NaN/inf by a division by zero.
std::vector<double> normalizationParallelForAndAtomicGlobal(std::vector<double> u) {
    // Hoisted so the loop bounds are plain loop-invariant values.
    const std::size_t size = u.size();

    double sumSquares = 0.0;
    #pragma omp parallel for
    for (std::size_t i = 0; i < size; ++i) {
        #pragma omp atomic update
        sumSquares += u[i] * u[i];
    }

    const double vectorLength = std::sqrt(sumSquares);
    if (vectorLength == 0.0) {
        // A zero-length vector has no direction; dividing would produce NaNs.
        return u;
    }

    #pragma omp parallel for
    for (std::size_t i = 0; i < size; ++i) {
        u[i] /= vectorLength;
    }

    return u;
}

// Scales a vector to unit Euclidean length with OpenMP parallel loops. Each
// thread accumulates its share of the squares in a private variable and adds
// that partial sum to the shared total with a single atomic update, so the
// atomic is executed once per thread rather than once per element.
//
// u is taken by value, so the caller's vector is never modified; the
// normalized copy is returned. If the computed norm is exactly zero (which
// includes the empty vector) the copy is returned unchanged instead of being
// filled with NaN/inf by a division by zero.
std::vector<double> normalizationParallelForAndAtomicLocal(std::vector<double> u) {
    // Hoisted so the loop bounds are plain loop-invariant values.
    const std::size_t size = u.size();

    double sumSquares = 0.0;
    #pragma omp parallel
    {
        // Declared inside the parallel region, hence private to each thread.
        double localSumSquares = 0.0;
        #pragma omp for
        for (std::size_t i = 0; i < size; ++i) {
            localSumSquares += u[i] * u[i];
        }

        #pragma omp atomic update
        sumSquares += localSumSquares;
    }

    const double vectorLength = std::sqrt(sumSquares);
    if (vectorLength == 0.0) {
        // A zero-length vector has no direction; dividing would produce NaNs.
        return u;
    }

    #pragma omp parallel for
    for (std::size_t i = 0; i < size; ++i) {
        u[i] /= vectorLength;
    }

    return u;
}

// Scales a vector to unit Euclidean length with OpenMP "parallel for simd"
// loops; the sum of squares is combined with a reduction(+) clause.
//
// u is taken by value, so the caller's vector is never modified; the
// normalized copy is returned. If the computed norm is exactly zero (which
// includes the empty vector) the copy is returned unchanged instead of being
// filled with NaN/inf by a division by zero.
std::vector<double> normalizationParallelSimd(std::vector<double> u) {
    double sumSquares = 0.0;

    // It is recommended not to use a function call for loop bounds, so the
    // size is read once. It is kept as std::size_t to avoid narrowing to int.
    const std::size_t uSize = u.size();

    #pragma omp parallel for simd reduction(+:sumSquares)
    for (std::size_t i = 0; i < uSize; ++i) {
        sumSquares += u[i] * u[i];
    }

    const double vectorLength = std::sqrt(sumSquares);
    if (vectorLength == 0.0) {
        // A zero-length vector has no direction; dividing would produce NaNs.
        return u;
    }

    #pragma omp parallel for simd
    for (std::size_t i = 0; i < uSize; ++i) {
        u[i] /= vectorLength;
    }

    return u;
}

int main() {
    vector<double> u = generateRandomVector(50000000);

    cout << "Length of the input vector: " << computeVectorLength(u) << endl;

    {
        Stopwatch sw;
        sw.start();
        auto uNorm = normalizationSequential(u);
        sw.stop();
        cout << "Sequential: " << sw.duration().count() << " ms, length " << computeVectorLength(uNorm) << endl;
    }

    {
        Stopwatch sw;
        sw.start();
        auto uNorm = normalizationParallelSections(u);
        sw.stop();
        cout << "Parallel sections: " << sw.duration().count() << " ms, length " << computeVectorLength(uNorm) << endl;
    }

    {
        Stopwatch sw;
        sw.start();
        auto uNorm = normalizationParallelForAndCriticalGlobal(u);
        sw.stop();
        cout << "Parallel for + critical section (global): " << sw.duration().count() << " ms, length " << computeVectorLength(uNorm) << endl;
    }

    {
        Stopwatch sw;
        sw.start();
        auto uNorm = normalizationParallelForAndCriticalLocal(u);
        sw.stop();
        cout << "Parallel for + critical section (local): " << sw.duration().count() << " ms, length " << computeVectorLength(uNorm) << endl;
    }

    {
        Stopwatch sw;
        sw.start();
        auto uNorm = normalizationParallelForAndAtomicGlobal(u);
        sw.stop();
        cout << "Parallel for + atomic update (global): " << sw.duration().count() << " ms, length " << computeVectorLength(uNorm) << endl;
    }

    {
        Stopwatch sw;
        sw.start();
        auto uNorm = normalizationParallelForAndAtomicLocal(u);
        sw.stop();
        cout << "Parallel for + atomic update (local): " << sw.duration().count() << " ms, length " << computeVectorLength(uNorm) << endl;
    }

    {
        Stopwatch sw;
        sw.start();
        auto uNorm = normalizationParallelSimd(u);
        sw.stop();
        cout << "Parallel simd: " << sw.duration().count() << " ms, length " << computeVectorLength(uNorm) << endl;
    }

    {
        Stopwatch sw;
        sw.start();
        auto uNorm = normalizationParallelForAndReduction(u);
        sw.stop();
        cout << "Parallel for + reduction: " << sw.duration().count() << " ms, length " << computeVectorLength(uNorm) << endl;
    }

    return 0;
}
