
#include <algorithm>
#include <atomic>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <ctime>
#include <functional>
#include <iomanip>
#include <iostream>
#include <string>
#include <thread>
#include <vector>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <getopt.h>

#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/poll.h>

// NOTE(review): file-scope using-directive; code further down may rely on
// unqualified std names, so check every call site before removing it.
using namespace std;

// Alignment and rounding granule (bytes) for the buffers dma_test() allocates
// with posix_memalign().
#define PAGE_SIZE (4096)
// NOTE(review): not referenced anywhere in the visible source; presumably the
// size of the device's user BAR in bytes - confirm before relying on it.
#define USER_BAR_SIZE (8192)

// NOTE(review): not referenced anywhere in the visible source.
int ufd;
// Index of the DMA device under test. Defaults to 0, can be overridden with
// -d/--dev in main(), and is substituted into the /dev/pcie_dma<dev_id>_* paths.
static int dev_id = 0;

// File descriptors opened by dma_test() for the /dev/pcie_dma<dev_id>_htc_<i>,
// /dev/pcie_dma<dev_id>_cth_<i> and /dev/pcie_dma<dev_id>_events_<i> nodes.
// htc_fd is the side dma_test() writes to and cth_fd the side it reads back from.
std::vector<int> htc_fd, cth_fd, ev_fd;

/**
 * Minimal stopwatch that reports an elapsed interval in microseconds.
 *
 * Usage: call start(), run the work to be measured, call end(), then read
 * the public `duration` member.
 */
class Timer
{
public:
    /// Record the instant at which the measured interval begins.
    void start()
    {
        start_time = Clock::now();
    }

    /// Record the end instant and store the elapsed microseconds in `duration`.
    void end()
    {
        end_time = Clock::now();
        duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
    }

    /// Microseconds between the most recent start() and end() calls.
    /// Initialised to 0 so that reading it before end() is well defined.
    long long duration = 0;

private:
    // steady_clock is monotonic, so the measured interval cannot go negative
    // or jump when the system wall clock is adjusted mid-measurement.
    using Clock = std::chrono::steady_clock;

    Clock::time_point start_time{};
    Clock::time_point end_time{};
};

/**
 * Host <-> card DMA loopback self-test for the device selected by `dev_id`.
 *
 * 1. Opens /dev/pcie_dma<dev_id>_htc_<i> and /dev/pcie_dma<dev_id>_cth_<i> for
 *    every DMA channel, plus every /dev/pcie_dma<dev_id>_events_<i> node that
 *    can be opened. The descriptors are appended to the global htc_fd, cth_fd
 *    and ev_fd vectors and are left open when the function returns.
 * 2. Runs 1000 transfers. Each one writes 1..0x1FFF random bytes through the
 *    htc descriptor at a random address, reads them back through the cth
 *    descriptor from the same address and compares the two buffers.
 *
 * The process is terminated with a non-zero exit status as soon as a device
 * node cannot be opened, an allocation fails, a transfer reports an I/O error
 * or the data read back differs from the data written.
 */
void dma_test()
{
    constexpr int kDmaChannels = 1;           // htc/cth channel pairs to open
    constexpr int kMaxEventNodes = 16;        // event nodes probed: _events_0 .. _events_15
    constexpr int kIterations = 1000;         // number of random transfers
    constexpr int kMaxTransferBytes = 0x2000; // transfer sizes are drawn from [1, 0x2000)

    // Read exactly `size` bytes from `fd`, starting at device offset `base`,
    // into `buffer`. Short reads are continued from where they stopped.
    // Returns 0 on success and -EIO on a pread() error or premature end of data.
    auto read_to_buffer = [](int fd, char *buffer, uint64_t size, uint64_t base) -> int
    {
        uint64_t count = 0;
        while (count < size)
        {
            const uint64_t bytes = size - count;
            const off_t offset = static_cast<off_t>(base + count);
            const ssize_t rc = pread(fd, buffer + count, bytes, offset);
            if (rc < 0)
            {
                printf("\nDebug : read 0x%llx @ 0x%llx failed: %s.\n",
                       static_cast<unsigned long long>(bytes),
                       static_cast<unsigned long long>(offset), strerror(errno));
                return -EIO;
            }
            if (rc == 0)
            {
                // No progress is possible any more; report instead of spinning.
                printf("read underflow 0x%llx/0x%llx.\n",
                       static_cast<unsigned long long>(count),
                       static_cast<unsigned long long>(size));
                return -EIO;
            }
            // Advance by what was actually transferred, not by what was requested.
            count += static_cast<uint64_t>(rc);
        }
        return 0;
    };

    // Write exactly `size` bytes from `buffer` to `fd`, starting at device
    // offset `base`. Short writes are continued from where they stopped.
    // Returns 0 on success and -EIO on a pwrite() error or a zero-length write.
    auto write_from_buffer = [](int fd, const char *buffer, uint64_t size, uint64_t base) -> int
    {
        uint64_t count = 0;
        while (count < size)
        {
            const uint64_t bytes = size - count;
            const off_t offset = static_cast<off_t>(base + count);
            const ssize_t rc = pwrite(fd, buffer + count, bytes, offset);
            if (rc < 0)
            {
                printf("\nDebug : write 0x%llx @ 0x%llx failed: %s.\n",
                       static_cast<unsigned long long>(bytes),
                       static_cast<unsigned long long>(offset), strerror(errno));
                return -EIO;
            }
            if (rc == 0)
            {
                // No progress is possible any more; report instead of spinning.
                printf("write underflow 0x%llx/0x%llx.\n",
                       static_cast<unsigned long long>(count),
                       static_cast<unsigned long long>(size));
                return -EIO;
            }
            // Advance by what was actually transferred, not by what was requested.
            count += static_cast<uint64_t>(rc);
        }
        return 0;
    };

    // Open one mandatory channel node ("htc" or "cth") read/write, print
    // "<path>: <fd>" and return the descriptor. Exits the process on failure.
    auto open_channel = [](const char *kind, int index) -> int
    {
        char path[1024];
        snprintf(path, sizeof(path), "/dev/pcie_dma%d_%s_%d", dev_id, kind, index);
        // Check open() itself rather than probing with access() first: the probe
        // can go stale before the open and does not prove the open succeeds.
        const int fd = open(path, O_RDWR);
        if (fd < 0)
        {
            fprintf(stderr, "[Error] : Failed to open %s errno = %s\n", path, strerror(errno));
            exit(EXIT_FAILURE);
        }
        printf("%s: %d\n", path, fd);
        return fd;
    };

    // Open the channel nodes (mandatory) and the event nodes (optional) and
    // append their descriptors to the global vectors.
    auto epdma_initial = [&]()
    {
        for (int i = 0; i < kDmaChannels; i++)
        {
            htc_fd.push_back(open_channel("htc", i));
            cth_fd.push_back(open_channel("cth", i));
        }

        for (int i = 0; i < kMaxEventNodes; i++)
        {
            char path[1024];
            snprintf(path, sizeof(path), "/dev/pcie_dma%d_events_%d", dev_id, i);
            const int fd = open(path, O_RDWR);
            if (fd >= 0) // event nodes that are missing or inaccessible are skipped
                ev_fd.push_back(fd);
        }
    };

    // Allocate `bytes` bytes aligned to PAGE_SIZE; exits the process on failure.
    auto alloc_page_aligned = [](size_t bytes) -> char *
    {
        void *mem = nullptr;
        const int err = posix_memalign(&mem, PAGE_SIZE, bytes);
        if (err)
        {
            fprintf(stderr, "[Error] : Failed to allocate memory errno = %s\n", strerror(err));
            exit(err);
        }
        return static_cast<char *>(mem);
    };

    // One write / read-back / compare cycle of `size` bytes at address `base`.
    auto run_transfer = [&](uint64_t base, size_t size)
    {
        Timer timer;

        // Rounded up to the next multiple of PAGE_SIZE strictly above `size`.
        const size_t alloc_size = ((size + PAGE_SIZE) / PAGE_SIZE) * PAGE_SIZE;
        char *htc_buf = alloc_page_aligned(alloc_size);
        char *cth_buf = alloc_page_aligned(alloc_size);

        // `size` is already a byte count, so it is printed as-is next to "B".
        std::cout << "Transfer & Readback " << std::dec << std::setw(4) << std::setfill(' ') << size << " B at 0x";
        std::cout << std::hex << std::setw(8) << std::setfill('0') << base;
        fflush(stdout);

        memset(cth_buf, 0, size);
        // Fill every byte of the source buffer, including the tail when `size`
        // is not a multiple of sizeof(int), so no uninitialised byte is sent.
        for (size_t off = 0; off < size; off += sizeof(int))
        {
            const int word = rand();
            memcpy(htc_buf + off, &word, std::min(sizeof(int), size - off));
        }

        // Hand-shake flag: the reader sets it to 1 to release the writer for one
        // channel; the writer sets it back to 0 once that channel's data is written.
        std::atomic<int> turn{0};
        std::atomic<int> write_status{0};

        // The thread is joined before this lambda returns, so capturing the
        // locals by reference is safe.
        std::thread writer([&]
        {
            for (size_t i = 0; i < htc_fd.size(); i++)
            {
                while (!turn.load(std::memory_order_acquire))
                    std::this_thread::yield();

                if (write_from_buffer(htc_fd[i], htc_buf, size, base) != 0)
                    write_status.store(-EIO);

                // Release the reader after every channel; clearing the flag only
                // after the whole loop would deadlock with more than one channel.
                turn.store(0, std::memory_order_release);
            }
        });

        int read_status = 0;
        timer.start();
        for (size_t i = 0; i < htc_fd.size(); i++)
        {
            turn.store(1, std::memory_order_release);
            while (turn.load(std::memory_order_acquire) == 1)
                std::this_thread::yield();

            if (read_to_buffer(cth_fd[i], cth_buf, size, base) != 0)
                read_status = -EIO;
        }
        timer.end();
        fflush(stdout);
        writer.join();

        if (write_status.load() != 0 || read_status != 0)
        {
            printf("... failed!\n");
            fprintf(stderr, "[Error] : DMA read/write did not complete\n");
            exit(1);
        }

        if (memcmp(htc_buf, cth_buf, size) != 0)
        {
            printf("... failed!\n");
            // The buffers hold binary data, so dump exactly `size` bytes instead
            // of relying on a NUL terminator that may be absent or appear early.
            auto dump = [size](const char *title, const char *data)
            {
                std::cout << title;
                for (size_t i = 0; i < size; i++)
                    std::cout << std::hex << std::setw(2) << std::setfill('0') << (data[i] & 0xFF) << ' ';
            };
            dump("\nDEBUG - htc buffer\n", htc_buf);
            dump("\nDEBUG - cth buffer\n", cth_buf);
            std::cout << "\n";
            exit(1);
        }

        std::cout << " ... completed in " << std::dec << std::setw(4) << std::setfill(' ') << timer.duration << " us\n";

        free(htc_buf);
        free(cth_buf);
    };

    srand(static_cast<unsigned>(time(nullptr)));
    epdma_initial();

    const std::string prefix = "TEST : Random size write at random address";
    const std::string rule(prefix.size(), '*');
    std::cout << '\n'
              << rule << '\n'
              << prefix << '\n'
              << rule << '\n';

    for (int i = 0; i < kIterations; i++)
    {
        const uint64_t base = static_cast<uint64_t>(rand());
        const size_t size = static_cast<size_t>(std::max(1, rand() % kMaxTransferBytes));
        run_transfer(base, size);
    }
    std::cout << "\n";
}
/**
 * Print the command-line help text to stdout.
 *
 * @param program_name  Name shown in the "Usage:" line (normally argv[0]).
 *                      A null pointer is rendered as "<program>".
 */
void usage(char *program_name)
{
    // Streaming a null char* into an ostream is undefined behaviour, and
    // argv[0] is allowed to be null, so substitute a placeholder.
    const char *shown_name = program_name ? program_name : "<program>";
    std::cout << "Usage: " << shown_name << " <option>\n";
    std::cout << "\t-d, --dev: Device ID of the DMA. [Optional] [0-3] [Default=0] \n";
    std::cout << "\t-h, --help: Print this help message.\n";
}
/**
 * Entry point: parse the command line, then run the DMA loopback test.
 *
 * Options:
 *   -d N, --dev N, --dev=N   DMA device index, a decimal integer in 0..3.
 *   -h, --help               Print the usage text and exit successfully.
 *
 * An unknown option, a missing value or an out-of-range / non-numeric device
 * index prints the usage text and exits with EXIT_FAILURE.
 */
int main(int argc, char **argv)
{
    const option long_opts[] =
        {
            // required_argument matches the "d:" short option. With
            // optional_argument, "--dev 2" and a bare "--dev" leave optarg null.
            {"dev", required_argument, nullptr, 'd'},
            {"help", no_argument, nullptr, 'h'},
            {nullptr, 0, nullptr, 0}};

    int opt;
    while ((opt = getopt_long(argc, argv, "d:h", long_opts, nullptr)) != -1)
    {
        switch (opt)
        {
        case 'd':
        {
            // strtol with full validation: rejects empty input, trailing junk
            // ("2x"), non-numbers ("abc") and overflow instead of reading them as 0.
            errno = 0;
            char *end = nullptr;
            const long id = strtol(optarg, &end, 10);
            if (errno != 0 || end == optarg || *end != '\0' || id < 0 || id > 3)
            {
                usage(argv[0]);
                exit(EXIT_FAILURE);
            }
            dev_id = static_cast<int>(id);
            break;
        }
        case 'h':
            // Asking for help must not go on to start DMA traffic.
            usage(argv[0]);
            return 0;
        case '?':
        default:
            usage(argv[0]);
            exit(EXIT_FAILURE);
        }
    }
    dma_test();
    return 0;
}
