#include <chrono>
#include <cmath>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <string>

#include <omp.h>

using namespace std;
using namespace std::chrono;

/**
 * Allocates the row storage of three matrices.
 *
 * For each row index in [0, size) a new array of `size` floats is allocated
 * and stored in `a`, `l` and `u` (in that order). The element values are NOT
 * initialized, and the three row-pointer arrays must already have room for
 * `size` entries.
 *
 * The `omp for` directive below is not enclosed in a parallel region inside
 * this function, so whether the rows are shared among threads depends on the
 * context the function is called from.
 *
 * @param a    row-pointer array of the first matrix
 * @param l    row-pointer array of the second matrix
 * @param u    row-pointer array of the third matrix
 * @param size number of rows to allocate and number of floats per row
 */
inline void initialize_matrices(float **a, float **l, float **u, int size) {
    #pragma omp for schedule(static)
    for (int row = 0; row < size; row++) {
        // Same allocation order as before: a, then l, then u.
        float **const matrices[] = {a, l, u};
        for (float **matrix : matrices) {
            matrix[row] = new float[size];
        }
    }
}

/**
 * Validates an LU decomposition by checking that L * U reproduces A.
 *
 * Every entry of the product L * U is recomputed (accumulated in double
 * precision) and compared with the matching entry of `a`. For each mismatch a
 * line starting with "Error index is: " followed by the signed difference is
 * printed; the function finally prints "Success" or "Failed".
 *
 * @param a    original matrix (size x size, row pointers)
 * @param l    lower-triangular factor (size x size, row pointers)
 * @param u    upper-triangular factor (size x size, row pointers)
 * @param size number of rows and columns
 * @return true when every entry of L * U matches A within the tolerance,
 *         false otherwise (including when the product is NaN or infinite)
 */
bool matrix_validation(float **a, float **l, float **u, int size) {
    // The factors are single-precision, so an entry can only be expected to
    // match up to rounding error proportional to the magnitude of the terms
    // summed for it. The previous absolute threshold of 1e8 could never be
    // exceeded, which made the validation always succeed.
    constexpr double kRelativeTolerance = 1e-4;

    bool valid = true;
    std::cout << "Validation results: " << std::endl;
    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++) {
            double product = 0.0;
            // Magnitude against which the rounding error is measured.
            double scale = std::fabs(static_cast<double>(a[i][j]));
            for (int k = 0; k < size; k++) {
                const double term = static_cast<double>(l[i][k]) * static_cast<double>(u[k][j]);
                product += term;
                scale += std::fabs(term);
            }

            const double difference = product - static_cast<double>(a[i][j]);
            // A NaN or infinite difference must count as a mismatch; a plain
            // "difference > tolerance" test is false for NaN and would accept it.
            const bool matches = std::isfinite(difference)
                                 && std::fabs(difference) <= kRelativeTolerance * scale;
            if (!matches) {
                valid = false;
                std::cout << "Error index is: " << static_cast<float>(difference) << '\n';
            }
        }
    }

    std::cout << (valid ? "Success" : "Failed") << std::endl;
    return valid;
}

/**
 * Prints a square matrix to standard output.
 *
 * Writes a "Matrix <name>: " header line followed by one line per row, each
 * element in fixed notation with two decimals and followed by a space.
 *
 * @param m           matrix to print (size x size, row pointers)
 * @param size        number of rows and columns
 * @param matrix_name label shown in the header line
 */
void print_matrix(float **m, int size, std::string matrix_name) {
    // std::fixed and std::setprecision are sticky on the stream. Remember the
    // current formatting and restore it afterwards so later output (e.g. a
    // timing value) is not silently truncated to two decimals.
    const std::ios_base::fmtflags saved_flags = std::cout.flags();
    const std::streamsize saved_precision = std::cout.precision();

    std::cout << "Matrix " << matrix_name << ": " << '\n';
    std::cout << std::fixed << std::setprecision(2);
    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++) {
            std::cout << m[i][j] << " ";
        }
        std::cout << '\n';
    }

    std::cout.flags(saved_flags);
    std::cout.precision(saved_precision);
    std::cout.flush();
}

/**
 * Program entry point.
 *
 * Reads a matrix size from standard input, fills a size x size matrix A with
 * pseudo-random values in [0, 1], computes its LU decomposition (L with a unit
 * diagonal, no pivoting), prints the elapsed time and validates the result by
 * comparing L * U with A. Matrices of size 8 or smaller are also printed.
 *
 * @return 0 after a completed run (regardless of the validation outcome),
 *         1 when the entered size is not a positive integer
 */
int main(int argc, char **argv) {
    static_cast<void>(argc);
    static_cast<void>(argv);

    // Read and validate the matrix size: a failed read or a non-positive
    // value cannot be used as an allocation size.
    int size = 0;
    std::cout << "Enter matrix size: ";
    if (!(std::cin >> size) || size <= 0) {
        std::cerr << "Matrix size must be a positive integer" << std::endl;
        return 1;
    }

    float **a = new float *[size];
    float **l = new float *[size];
    float **u = new float *[size];

    initialize_matrices(a, l, u, size);

    // Fill A with pseudo-random values in [0, 1].
    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++) {
            a[i][j] = static_cast<float>(std::rand()) / static_cast<float>(RAND_MAX);
        }
    }

    if (size <= 8)
        print_matrix(a, size, "A");

    #if defined(_OPENMP)
    omp_set_num_threads(16);
    #endif

    // steady_clock is monotonic, which is what interval timing needs.
    const auto t_start = std::chrono::steady_clock::now();

    // LU decomposition of A, one row of U and one column of L per iteration:
    //   - U gets zeros below the diagonal,
    //   - L gets zeros above the diagonal and ones on the diagonal,
    //   - the remaining entries use the values computed in earlier iterations.
    // NOTE: no pivoting is performed, so a zero pivot u[iteration][iteration]
    // produces inf/NaN entries in L.
    // TODO: the loops over c and r are still serial; the assignment asks for
    // an OpenMP parallelization using simd operations and a dynamic schedule.
    for (int iteration = 0; iteration < size; iteration++) {
        // Row `iteration` of U: entries left of the diagonal are zero.
        for (int c = 0; c < iteration; c++) {
            u[iteration][c] = 0;
        }
        // Diagonal and entries right of it need the already computed
        // sub-row of L and sub-column of U.
        for (int c = iteration; c < size; c++) {
            float sum = 0.0f;
            for (int k = 0; k < iteration; k++) {
                sum += l[iteration][k] * u[k][c];
            }
            u[iteration][c] = a[iteration][c] - sum;
        }

        // Column `iteration` of L: zeros above the diagonal, one on it.
        for (int r = 0; r < iteration; r++) {
            l[r][iteration] = 0;
        }
        l[iteration][iteration] = 1;

        // Entries below the diagonal are divided by the pivot computed above.
        const float pivot = u[iteration][iteration];
        for (int r = iteration + 1; r < size; r++) {
            float sum = 0.0f;
            for (int k = 0; k < iteration; k++) {
                sum += l[r][k] * u[k][iteration];
            }
            l[r][iteration] = (a[r][iteration] - sum) / pivot;
        }
    }

    const auto t_end = std::chrono::steady_clock::now();
    const double elapsed_seconds =
            std::chrono::duration_cast<std::chrono::duration<double>>(t_end - t_start).count();
    std::cout << "LU Decomposition time: " << elapsed_seconds << " seconds" << std::endl;

    if (matrix_validation(a, l, u, size) && size <= 8) {
        print_matrix(l, size, "L");
        print_matrix(u, size, "U");
    }

    // Release the rows allocated by initialize_matrices and the row-pointer
    // arrays allocated above.
    for (int i = 0; i < size; i++) {
        delete[] a[i];
        delete[] l[i];
        delete[] u[i];
    }
    delete[] a;
    delete[] l;
    delete[] u;

    return 0;
}
