/* ============================ */
/* === test_stencil2_with.c === */
/* ============================ */


#include <stdio.h> // pour affichage
#include <stdlib.h>
#include <math.h>

#include <x86intrin.h>

//#include "nrdef.h"
#include "nrutil.h"
#include "stencil.h"
#include "stencil2_with.h"
#include "macro_debug.h"

/*
 * Functional test of the add3 stencil variants on an n x n input.
 *
 * The input X (and the scratch matrix T used by the two "sep" variants) is
 * allocated with a border of b = 1 cell on each side; the outputs cover
 * [0..n-1] x [0..n-1] only. Every variant is run on the same X, then:
 *   - X, Y_bas and Y_reg are displayed,
 *   - each result is compared with Y_bas (the basic version),
 *   - each result is handed to verbose_check_f32matrix together with the
 *     parameters that were used to initialise X.
 *
 * n : number of rows and of columns of the matrices (border excluded).
 */
// -------------------------
void test_add3_matrix(int n)
// -------------------------
{
    int b = 1; // border width
    int h = n;
    int w = n;
    puts("----------------------------------------------");
    printf("n = %d\n", n);

    float **X;
    float **Y_bas, **Y_reg, **Y_rot, **Y_red, **Y_ilu3, **Y_ilu3r, **Y_elu2r, **Y_elu2rf, **Y_ilu3_elu2rf;
    float **Y_sep0, **Y_sep1;
    float **T; // scratch matrix for the sep variants

    float x0, xstep, ystep;

    char *format = "%5.0f";

    PUTS("malloc");
    X = f32matrix(0-b, h-1+b, 0-b, w-1+b);
    T = f32matrix(0-b, h-1+b, 0-b, w-1+b);

    // Each output is allocated exactly once: a second f32matrix() call on the
    // same variable would overwrite the pointer and leak the first matrix.
    Y_bas         = f32matrix(0, h-1, 0, w-1);
    Y_reg         = f32matrix(0, h-1, 0, w-1);
    Y_rot         = f32matrix(0, h-1, 0, w-1);
    Y_red         = f32matrix(0, h-1, 0, w-1);
    Y_ilu3        = f32matrix(0, h-1, 0, w-1);
    Y_ilu3r       = f32matrix(0, h-1, 0, w-1);
    Y_elu2r       = f32matrix(0, h-1, 0, w-1);
    Y_elu2rf      = f32matrix(0, h-1, 0, w-1);
    Y_ilu3_elu2rf = f32matrix(0, h-1, 0, w-1);

    Y_sep0        = f32matrix(0, h-1, 0, w-1);
    Y_sep1        = f32matrix(0, h-1, 0, w-1);

    PUTS("zero_matrix");
    // X and T are cleared border included, so the border cells hold 0.
    zero_f32matrix(X, 0-b, h-1+b, 0-b, w-1+b);
    zero_f32matrix(T, 0-b, h-1+b, 0-b, w-1+b);

    zero_f32matrix(Y_bas,         0, h-1, 0, w-1);
    zero_f32matrix(Y_reg,         0, h-1, 0, w-1);
    zero_f32matrix(Y_rot,         0, h-1, 0, w-1);
    zero_f32matrix(Y_red,         0, h-1, 0, w-1);
    zero_f32matrix(Y_ilu3,        0, h-1, 0, w-1);
    zero_f32matrix(Y_ilu3r,       0, h-1, 0, w-1);
    zero_f32matrix(Y_elu2r,       0, h-1, 0, w-1);
    zero_f32matrix(Y_elu2rf,      0, h-1, 0, w-1);
    zero_f32matrix(Y_ilu3_elu2rf, 0, h-1, 0, w-1);

    zero_f32matrix(Y_sep0,        0, h-1, 0, w-1);
    zero_f32matrix(Y_sep1,        0, h-1, 0, w-1);

    // Initialisation parameters, reused below by verbose_check_f32matrix.
    // (ystep = 2 and ystep = 10 were the other values tried here.)
    x0 = 0; xstep = 1; ystep = 1;

    // Only the interior of X is initialised; the border stays at 0.
    init_param_f32matrix(X, 0, h-1, 0, w-1, x0, xstep, ystep);

    add3_f32matrix_basic               (X, 0, h-1, 0, w-1, Y_bas);
    add3_f32matrix_reg                 (X, 0, h-1, 0, w-1, Y_reg);
    add3_f32matrix_rot                 (X, 0, h-1, 0, w-1, Y_rot);
    add3_f32matrix_red                 (X, 0, h-1, 0, w-1, Y_red);
    add3_f32matrix_ilu3                (X, 0, h-1, 0, w-1, Y_ilu3);
    add3_f32matrix_ilu3_red            (X, 0, h-1, 0, w-1, Y_ilu3r);
    add3_f32matrix_elu2_red            (X, 0, h-1, 0, w-1, Y_elu2r);
    add3_f32matrix_elu2_red_factor     (X, 0, h-1, 0, w-1, Y_elu2rf);
    add3_f32matrix_ilu3_elu2_red_factor(X, 0, h-1, 0, w-1, Y_ilu3_elu2rf);

    add3_f32matrix_sep0(X, 0, h-1, 0, w-1, Y_sep0, T);
    add3_f32matrix_sep1(X, 0, h-1, 0, w-1, Y_sep1, T);

    display_f32matrix(X,     0, h-1, 0, w-1, format, "X    ");
    display_f32matrix(Y_bas, 0, h-1, 0, w-1, format, "Y bas");
    display_f32matrix(Y_reg, 0, h-1, 0, w-1, format, "Y reg");

    // One entry per variant under test. Pairing each matrix with its label in
    // a single table guarantees that the compare pass and the check pass look
    // at the matrix the label announces.
    struct {
        float **m;
        char   *label;
    } results[] = {
        { Y_reg,         "Y reg"                        },
        { Y_rot,         "Y rot"                        },
        { Y_red,         "Y red"                        },
        { Y_ilu3,        "Y ilu3"                       },
        { Y_ilu3r,       "Y ilu3 + red"                 },
        { Y_elu2r,       "Y elu2 + red"                 },
        { Y_elu2rf,      "Y elu2 + red + factor"        },
        { Y_ilu3_elu2rf, "Y ilu3 + elu2 + red + factor" },
        { Y_sep0,        "Y sep0"                       },
        { Y_sep1,        "Y sep1"                       },
    };
    int nresults = (int)(sizeof results / sizeof results[0]);

    // Compare every variant with the basic version. The return value is not
    // used by this test, so it is discarded explicitly.
    for (int i = 0; i < nresults; i++) {
        (void)compare_f32matrix(Y_bas, 0, h-1, 0, w-1, results[i].m, results[i].label);
    }
    putchar('\n');

    // Check every variant against the initialisation parameters of X.
    for (int i = 0; i < nresults; i++) {
        verbose_check_f32matrix(results[i].m, 0, h-1, 0, w-1, x0, xstep, ystep, results[i].label);
    }
    putchar('\n');

    PUTS("free_matrix");

    free_f32matrix(X,             0-b, h-1+b, 0-b, w-1+b);
    free_f32matrix(T,             0-b, h-1+b, 0-b, w-1+b);

    free_f32matrix(Y_bas,         0,   h-1,   0,   w-1);
    free_f32matrix(Y_reg,         0,   h-1,   0,   w-1);
    free_f32matrix(Y_rot,         0,   h-1,   0,   w-1);
    free_f32matrix(Y_red,         0,   h-1,   0,   w-1);
    free_f32matrix(Y_ilu3,        0,   h-1,   0,   w-1);
    free_f32matrix(Y_ilu3r,       0,   h-1,   0,   w-1);
    free_f32matrix(Y_elu2r,       0,   h-1,   0,   w-1);
    free_f32matrix(Y_elu2rf,      0,   h-1,   0,   w-1);
    free_f32matrix(Y_ilu3_elu2rf, 0,   h-1,   0,   w-1);

    free_f32matrix(Y_sep0,        0,   h-1,   0,   w-1);
    free_f32matrix(Y_sep1,        0,   h-1,   0,   w-1);
}
/*
 * Benchmark of the add3 stencil variants on square matrices.
 *
 * All matrices are allocated once for the largest size n1. Then, for every
 * n = n0, n0 + nstep, ... while n <= n1, resize_f32matrix is applied to each
 * of them, the input is re-initialised, every variant is run through the
 * BENCH macro and one line of results is printed.
 *
 * n0    : first matrix size.
 * n1    : largest matrix size (also the allocation size).
 * nstep : size increment; must be > 0, otherwise the function prints an
 *         error on stderr and returns without benchmarking anything.
 */
// ----------------------------------------------
void bench_add3_matrix(int n0, int n1, int nstep)
// ----------------------------------------------
{
    int b = 1; // border width

    // A non-positive step can never move n past n1: refuse it up front,
    // before anything is allocated, instead of looping forever.
    if (nstep <= 0) {
        fprintf(stderr, "bench_add3_matrix: nstep must be > 0 (got %d)\n", nstep);
        return;
    }

    puts("--------------------------------------------------");

    float **X;
    float **Y_bas, **Y_reg, **Y_rot, **Y_red, **Y_ilu3, **Y_ilu3r, **Y_elu2r, **Y_elu2rf, **Y_ilu3_elu2rf;
    float **Y_sep0, **Y_sep1;
    float **T; // scratch matrix for the sep variants

    // Results written by BENCH, one per variant.
    // NOTE(review): presumably cycles per point -- confirm in macro_debug.h.
    double cpp_bas         = 0.0;
    double cpp_reg         = 0.0;
    double cpp_rot         = 0.0;
    double cpp_sep0        = 0.0;
    double cpp_sep1        = 0.0;
    double cpp_red         = 0.0;
    double cpp_ilu3        = 0.0;
    double cpp_ilu3r       = 0.0;
    double cpp_elu2r       = 0.0;
    double cpp_elu2rf      = 0.0;
    double cpp_ilu3_elu2rf = 0.0;

    const char *format = "%8.2f";

    PUTS("malloc to max");

    X = f32matrix(0-b, n1 - 1 + b, 0 - b, n1 - 1 + b);
    T = f32matrix(0-b, n1 - 1 + b, 0 - b, n1 - 1 + b);

    Y_bas         = f32matrix(0, n1 - 1, 0, n1 - 1);
    Y_reg         = f32matrix(0, n1 - 1, 0, n1 - 1);
    Y_rot         = f32matrix(0, n1 - 1, 0, n1 - 1);

    Y_sep0        = f32matrix(0, n1 - 1, 0, n1 - 1);
    Y_sep1        = f32matrix(0, n1 - 1, 0, n1 - 1);

    Y_red         = f32matrix(0, n1 - 1, 0, n1 - 1);
    Y_ilu3        = f32matrix(0, n1 - 1, 0, n1 - 1);
    Y_ilu3r       = f32matrix(0, n1 - 1, 0, n1 - 1);
    Y_elu2r       = f32matrix(0, n1 - 1, 0, n1 - 1);
    Y_elu2rf      = f32matrix(0, n1 - 1, 0, n1 - 1);
    Y_ilu3_elu2rf = f32matrix(0, n1 - 1, 0, n1 - 1);

    for (int n = n0; n <= n1; n += nstep) {

        // Same storage for every size: only the index ranges change.
        resize_f32matrix(X, 0-b, n-1+b, 0-b, n-1+b);
        resize_f32matrix(T, 0-b, n-1+b, 0-b, n-1+b);

        resize_f32matrix(Y_bas,         0, n-1, 0, n-1);
        resize_f32matrix(Y_reg,         0, n-1, 0, n-1);
        resize_f32matrix(Y_rot,         0, n-1, 0, n-1);
        resize_f32matrix(Y_red,         0, n-1, 0, n-1);
        resize_f32matrix(Y_ilu3,        0, n-1, 0, n-1);
        resize_f32matrix(Y_ilu3r,       0, n-1, 0, n-1);
        resize_f32matrix(Y_elu2r,       0, n-1, 0, n-1);
        resize_f32matrix(Y_elu2rf,      0, n-1, 0, n-1);
        resize_f32matrix(Y_ilu3_elu2rf, 0, n-1, 0, n-1);
        resize_f32matrix(Y_sep0,        0, n-1, 0, n-1);
        resize_f32matrix(Y_sep1,        0, n-1, 0, n-1);

        zero_f32matrix(X, 0-b, n-1+b, 0-b, n-1+b);
        // T is cleared as in test_add3_matrix, so the sep variants never
        // start from uninitialised scratch data. This is outside the BENCH
        // calls below.
        zero_f32matrix(T, 0-b, n-1+b, 0-b, n-1+b);
        init_param_f32matrix(X, 0, n-1, 0, n-1, 1, 1, 1);

        zero_f32matrix(Y_bas,         0, n-1, 0, n-1);
        zero_f32matrix(Y_reg,         0, n-1, 0, n-1);
        zero_f32matrix(Y_rot,         0, n-1, 0, n-1);
        zero_f32matrix(Y_red,         0, n-1, 0, n-1);
        zero_f32matrix(Y_ilu3,        0, n-1, 0, n-1);
        zero_f32matrix(Y_ilu3r,       0, n-1, 0, n-1);
        zero_f32matrix(Y_elu2r,       0, n-1, 0, n-1);
        zero_f32matrix(Y_elu2rf,      0, n-1, 0, n-1);
        zero_f32matrix(Y_ilu3_elu2rf, 0, n-1, 0, n-1);
        zero_f32matrix(Y_sep0,        0, n-1, 0, n-1);
        zero_f32matrix(Y_sep1,        0, n-1, 0, n-1);

        BENCH(add3_f32matrix_basic               (X, 0, n-1, 0, n-1, Y_bas        ),n, cpp_bas        );
        BENCH(add3_f32matrix_reg                 (X, 0, n-1, 0, n-1, Y_reg        ),n, cpp_reg        );
        BENCH(add3_f32matrix_rot                 (X, 0, n-1, 0, n-1, Y_rot        ),n, cpp_rot        );
        BENCH(add3_f32matrix_red                 (X, 0, n-1, 0, n-1, Y_red        ),n, cpp_red        );
        BENCH(add3_f32matrix_ilu3                (X, 0, n-1, 0, n-1, Y_ilu3       ),n, cpp_ilu3       );
        BENCH(add3_f32matrix_ilu3_red            (X, 0, n-1, 0, n-1, Y_ilu3r      ),n, cpp_ilu3r      );
        BENCH(add3_f32matrix_elu2_red            (X, 0, n-1, 0, n-1, Y_elu2r      ),n, cpp_elu2r      );
        BENCH(add3_f32matrix_elu2_red_factor     (X, 0, n-1, 0, n-1, Y_elu2rf     ),n, cpp_elu2rf     );
        BENCH(add3_f32matrix_ilu3_elu2_red_factor(X, 0, n-1, 0, n-1, Y_ilu3_elu2rf),n, cpp_ilu3_elu2rf);
        BENCH(add3_f32matrix_sep0                (X, 0, n-1, 0, n-1, Y_sep0, T    ),n, cpp_sep0       );
        BENCH(add3_f32matrix_sep1                (X, 0, n-1, 0, n-1, Y_sep1, T    ),n, cpp_sep1       );

        // One output line per size, in four column groups:
        // bas/reg/rot | sep0/sep1 | red/ilu3/ilu3r | elu2r/elu2rf/ilu3_elu2rf
        printf("i = %4d", n);
        printf("   ");
        printf(format, cpp_bas        );
        printf(format, cpp_reg        );
        printf(format, cpp_rot        );
        printf("   ");
        printf(format, cpp_sep0       );
        printf(format, cpp_sep1       );
        printf("   ");
        printf(format, cpp_red        );
        printf(format, cpp_ilu3       );
        printf(format, cpp_ilu3r      );
        printf("   ");
        printf(format, cpp_elu2r      );
        printf(format, cpp_elu2rf     );
        printf(format, cpp_ilu3_elu2rf);
        putchar('\n');
    }

    PUTS("free_matrix");

    // Freed with the bounds used at allocation time (size n1).
    free_f32matrix(X,             0-b, n1-1+b, 0-b, n1-1+b);
    free_f32matrix(T,             0-b, n1-1+b, 0-b, n1-1+b);

    free_f32matrix(Y_bas,         0, n1-1, 0, n1-1);
    free_f32matrix(Y_reg,         0, n1-1, 0, n1-1);
    free_f32matrix(Y_rot,         0, n1-1, 0, n1-1);
    free_f32matrix(Y_red,         0, n1-1, 0, n1-1);
    free_f32matrix(Y_ilu3,        0, n1-1, 0, n1-1);
    free_f32matrix(Y_ilu3r,       0, n1-1, 0, n1-1);
    free_f32matrix(Y_elu2r,       0, n1-1, 0, n1-1);
    free_f32matrix(Y_elu2rf,      0, n1-1, 0, n1-1);
    free_f32matrix(Y_ilu3_elu2rf, 0, n1-1, 0, n1-1);
    free_f32matrix(Y_sep0,        0, n1-1, 0, n1-1);
    free_f32matrix(Y_sep1,        0, n1-1, 0, n1-1);
}

/*
 * Entry point of the "stencil2 with border" test program: prints a banner,
 * runs the functional test on a 6 x 6 matrix and returns 0.
 * argc and argv are not used.
 */
// ==================================================
int main_stencil2_with(int argc, const char * argv[])
// ==================================================
{
    static const char *const banner[] = {
        "--------------------------",
        "-- stencil2 with border --",
        "--------------------------",
    };
    size_t nlines = sizeof banner / sizeof banner[0];

    (void)argc;
    (void)argv;

    for (size_t k = 0; k < nlines; k++) {
        puts(banner[k]);
    }

    // Alternative runs, disabled in the original as well:
    //   test_add3_matrix(6+1), test_add3_matrix(6+2), test_add3_matrix(6+3)
    //   bench_add3_matrix(32, 256, 4), (32, 1024, 4), (32, 2048, 4)
    test_add3_matrix(6);

    return 0;
}
