#include <chrono>
#include <cmath>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <string>

#include <omp.h>

using namespace std;
using namespace std::chrono;


// Function to create a size x size matrix with all values initialized to 0.0.
//
// The matrix is returned as an array of `size` row pointers, each pointing to a
// separately allocated array of `size` floats. Both levels are allocated with
// new[], so the caller owns the memory and must release every row and then the
// row-pointer array with delete[].
float** create_empty_matrix(int size) {
    float **m = new float *[size];
    // Allocation is deliberately serial: a work-sharing loop here would leave
    // rows unallocated if this function were ever called from inside a parallel
    // region (each thread would own a private `m` and fill only part of it).
    for (int i = 0; i < size; i++) {
        // The trailing () value-initializes the row, i.e. sets every float to 0.0f;
        // a plain `new float[size]` would leave the elements indeterminate.
        m[i] = new float[size]();
    }
    return m;
}

// Function to create a size x size matrix with all values randomly initialized.
//
// Every element is std::rand() divided by RAND_MAX, i.e. a pseudo-random value
// in [0, 1]. The generator is not seeded here, so the sequence depends on
// whatever seed the program has (or has not) set before the call.
// Storage comes from create_empty_matrix(); the caller owns the result.
float** create_random_matrix(int size) {
    float **m = create_empty_matrix(size);
    const float rand_max = static_cast<float>(RAND_MAX);
    // Filled serially on purpose: std::rand() is not guaranteed to be
    // thread-safe, so it must not be called from a work-sharing loop.
    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++) {
            m[i][j] = static_cast<float>(std::rand()) / rand_max;
        }
    }
    return m;
}

// Checks that the product L * U reproduces A element by element.
//
// Parameters:
//   a, l, u - size x size matrices stored as arrays of row pointers
//   size    - number of rows/columns of each matrix
// Returns:
//   true when every element of L * U differs from the matching element of A by
//   at most 1e-3 in absolute value, false otherwise.
// Side effects:
//   Prints a "Validation results: " header, one line per out-of-tolerance
//   element, and a final "Success"/"Failed" verdict to std::cout.
bool matrix_validation(float **a, float **l, float **u, int size) {
    // Largest absolute deviation that is still treated as a match.
    const double tolerance = 1e-3;

    bool all_match = true;
    std::cout << "Validation results: " << '\n';
    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++) {
            // Compute (L * U)[i][j] in a local accumulator that starts at zero.
            // No scratch matrix is allocated, so nothing can be read
            // uninitialized and nothing has to be freed afterwards.
            float product = 0.0f;
            for (int k = 0; k < size; k++) {
                product += l[i][k] * u[k][j];
            }

            // std::fabs guarantees the floating-point overload is used; an
            // unqualified abs() could resolve to the integer version.
            const float deviation = std::fabs(a[i][j] - product);
            if (deviation > tolerance) {
                all_match = false;
                std::cout << "Epsilon higher than ignored: " << deviation << '\n';
            }
        }
    }

    std::cout << (all_match ? "Success" : "Failed") << std::endl;
    return all_match;
}

// Writes a labelled, human-readable dump of a square matrix to std::cout.
//
// Parameters:
//   m           - size x size matrix stored as an array of row pointers
//   size        - number of rows/columns to print
//   matrix_name - label shown in the "Matrix <name>: " header line
// Each element is printed in fixed notation with two decimals and followed by
// a single space; every row ends with a newline.
void print_matrix(float **m, int size, std::string matrix_name) {
    std::cout << "Matrix " << matrix_name << ": " << std::endl;
    for (int row = 0; row < size; ++row) {
        const float *values = m[row];
        // Formatting flags are sticky, so applying them once per row is enough.
        std::cout << std::fixed << std::setprecision(2);
        for (int col = 0; col < size; ++col) {
            std::cout << values[col] << " ";
        }
        std::cout << std::endl;
    }
}

// Program entry point.
//
// Reads a matrix size from standard input, builds a random size x size matrix
// A, factorizes it as A = L * U (L with a unit diagonal, U upper triangular)
// using OpenMP work-sharing loops, reports the elapsed time and validates the
// result. For sizes up to 8 the matrices are also printed.
//
// Returns 0 on completion, 1 when the entered size is not a positive integer.
int main(int argc, char **argv) {
    (void)argc;
    (void)argv;

    // user input section
    int size = 0;
    std::cout << "Enter matrix size: ";
    if (!(std::cin >> size) || size <= 0) {
        // A failed read or a non-positive size cannot describe a matrix; a
        // negative value would additionally make the new[] allocations throw.
        std::cerr << "Matrix size must be a positive integer." << std::endl;
        return 1;
    }

    float **a = create_random_matrix(size);
    float **l = create_empty_matrix(size);
    float **u = create_empty_matrix(size);

    // Releases a matrix made of `size` new[]-allocated rows plus the
    // new[]-allocated array of row pointers.
    const auto destroy_matrix = [size](float **m) {
        for (int i = 0; i < size; i++) {
            delete[] m[i];
        }
        delete[] m;
    };

    if (size <= 8)
        print_matrix(a, size, "A");

    // Number of loop iterations handed to a thread per dynamic scheduling request.
    const int chunk_size = 1;
    // steady_clock is monotonic, which makes it the right clock for intervals.
    const auto t_start = std::chrono::steady_clock::now();

    #pragma omp parallel shared(a, l, u)
    {
        // Every thread walks the same sequence of steps. Step `iteration`
        // produces row `iteration` of U and then column `iteration` of L. The
        // implicit barrier at the end of each work-sharing loop guarantees that
        // a step only starts after everything it reads has been written.
        for (int iteration = 0; iteration < size; iteration++) {
            // calculates row elements in U matrix; elements of the row are independent
            #pragma omp for schedule(dynamic, chunk_size)
            for (int c = 0; c < size; c++) {
                // values under diagonal
                if (c < iteration) {
                    u[iteration][c] = 0;
                    continue;
                }

                // rest of values
                float sum = 0.0f;
                // needs the results of the previous steps: the already known part of
                // row `iteration` of L and of column `c` of U
                #pragma omp simd reduction(+:sum)
                for (int k = 0; k < iteration; k++) {
                    sum += l[iteration][k] * u[k][c];
                }
                // update value
                u[iteration][c] = a[iteration][c] - sum;
            }

            // calculates column elements in L matrix; elements of the column are independent
            #pragma omp for schedule(dynamic, chunk_size)
            for (int r = 0; r < size; r++) {
                // values above diagonal
                if (r < iteration) {
                    l[r][iteration] = 0;
                    continue;
                }
                // value on diagonal
                if (r == iteration) {
                    l[r][iteration] = 1;
                    continue;
                }

                // rest of values
                float sum = 0.0f;
                // needs the results of the previous steps: the already known part of
                // row `r` of L and of column `iteration` of U
                #pragma omp simd reduction(+:sum)
                for (int k = 0; k < iteration; k++) {
                    sum += l[r][k] * u[k][iteration];
                }
                // update value
                // NOTE: no pivoting is performed, so a zero (or tiny) pivot
                // u[iteration][iteration] yields inf/NaN or large rounding errors.
                l[r][iteration] = (a[r][iteration] - sum) / u[iteration][iteration];
            }
        }
    }

    const auto t_end = std::chrono::steady_clock::now();
    const double time =
        std::chrono::duration_cast<std::chrono::duration<double>>(t_end - t_start).count();
    std::cout << "LU Decomposition time: " << time << " seconds" << std::endl;
    if (matrix_validation(a, l, u, size) && size <= 8) {
        print_matrix(l, size, "L");
        print_matrix(u, size, "U");
    }

    destroy_matrix(a);
    destroy_matrix(l);
    destroy_matrix(u);
    return 0;
}
