diff --git a/triangle_counting_host/cpp/Makefile b/triangle_counting_host/cpp/Makefile
new file mode 100644
index 0000000..aff882c
--- /dev/null
+++ b/triangle_counting_host/cpp/Makefile
@@ -0,0 +1,9 @@
+all:
+	g++ -O3 tc.cpp -std=c++11 -o tc -L. -lsds_lib
+	g++ -O3 tc_1pe.cpp -std=c++11 -o tc_1pe -L. -lsds_lib
+
+cpu:
+	g++ -O3 triangle_counting.cc tc_1pe.cpp -std=c++11 -o tc_cpu -L. -lsds_lib
+
+clean:
+	rm -rf tc tc_1pe tc_cpu
diff --git a/triangle_counting_host/cpp/libsds_lib.so b/triangle_counting_host/cpp/libsds_lib.so
new file mode 100755
index 0000000..7fb4971
Binary files /dev/null and b/triangle_counting_host/cpp/libsds_lib.so differ
diff --git a/triangle_counting_host/cpp/libxlnk_cma.h b/triangle_counting_host/cpp/libxlnk_cma.h
new file mode 100644
index 0000000..1b8998e
--- /dev/null
+++ b/triangle_counting_host/cpp/libxlnk_cma.h
@@ -0,0 +1,56 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+
+// kernel buffer pool
+#define XLNK_BUFPOOL_SIZE 100
+
+#define XLNK_DRIVER_PATH "/dev/xlnk"
+
+// count of buffers currently instantiated
+static uint32_t xlnkBufCnt = 0;
+// virtual addresses of buffers
+static void *xlnkBufPool[2 * XLNK_BUFPOOL_SIZE];
+// lengths in bytes of buffers
+static size_t xlnkBufLens[2 * XLNK_BUFPOOL_SIZE];
+// physical addresses of buffers
+static uint32_t xlnkBufPhyPool[2 * XLNK_BUFPOOL_SIZE];
+
+/*
+ * Get the virtual address referencing the physical address resulting from
+ * mmaping /dev/mem.
+ * Required to use bare-metal drivers on Linux. Returns -1 in case of error.
+ */
+unsigned long cma_mmap(unsigned long phyAddr, uint32_t len);
+/*
+ * Unmap a previously mapped memory space.
+ */
+uint32_t cma_munmap(void *buf, uint32_t len);
+/*
+ * Allocate a physically contiguous chunk of CMA memory and map it into
+ * virtual memory space. Returns this virtual pointer, or -1 on failure.
+ */
+void *cma_alloc(uint32_t len, uint32_t cacheable);
+/*
+ * Return the physical memory address corresponding to a given virtual
+ * address pointer. Returns NULL on failure.
+ */
+unsigned long cma_get_phy_addr(void *buf);
+/*
+ * Free a previously allocated CMA memory chunk.
+ */
+void cma_free(void *buf);
+/*
+ * Returns the number of available CMA memory pages which can be allocated.
+ */
+uint32_t cma_pages_available();
+/*
+ * Extra functions in case the user needs to flush or invalidate the cache.
+ */
+void cma_flush_cache(void *buf, unsigned int phys_addr, int size);
+void cma_invalidate_cache(void *buf, unsigned int phys_addr, int size);
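The header above is the userspace interface to the xlnk CMA driver; the Python scripts later in this diff reach the same allocator through pynq.Xlnk. A minimal sketch of the equivalent buffer lifecycle on the Python side (names as used by those scripts; freebuffer is assumed to be the release call):

    from pynq import Xlnk
    import numpy as np

    xlnk = Xlnk()
    # physically contiguous, so the PL can master it directly
    buf = xlnk.cma_array(shape=(1024,), dtype=np.int32)
    buf[:] = 0
    phys = buf.physical_address  # the value written into the IP's pointer registers
    buf.freebuffer()             # release the CMA block when done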
" << endl; + + mmap_space = mmap_addr + virt_offset; + } + ~accelerator() { close(mmap_file); } + + int get(int offset) { return mmap_space[offset >> 2]; } + + void set(int offset, int value) { mmap_space[offset >> 2] = value; } + + void start() { mmap_space[0x00] |= 1; } + + bool done() { return (mmap_space[0x00] & (1 << 1)); } + + bool idle() { return (mmap_space[0x00] & (1 << 2)); } + + bool ready() { return (mmap_space[0x00] & (1 << 3)); } + + int get_return() { return mmap_space[0x10 >> 2]; } + + int program(string bitfile_name) + { + char buf[4194304]; + const string BS_XDEVCFG = "/dev/xdevcfg"; + const string BS_IS_PARTIAL = "/sys/devices/soc0/amba/f8007000.devcfg/is_partial_bitstream"; + + int partial_bs_dev = open(BS_IS_PARTIAL.c_str(), O_WRONLY | O_NONBLOCK); + if (partial_bs_dev < 0) + { + printf("ERROR opening %s\n", BS_IS_PARTIAL.c_str()); + return -1; + } + int write_size = write(partial_bs_dev, "0", 1); + + int fpga_dev = open(BS_XDEVCFG.c_str(), O_WRONLY | O_NONBLOCK); + // int fpga_dev = open(BS_XDEVCFG.c_str(), O_WRONLY); + if (fpga_dev < 0) + { + printf("ERROR opening %s\n", BS_XDEVCFG.c_str()); + return -1; + } + + int bit_file = open(bitfile_name.c_str(), O_RDONLY); + if (bit_file < 0) + { + printf("ERROR opening %s\n", bitfile_name.c_str()); + return -1; + } + + int bit_file_size = read(bit_file, buf, 4194304); + write_size = write(fpga_dev, buf, bit_file_size); + + close(partial_bs_dev); + close(fpga_dev); + close(bit_file); + return 0; + } + +private: + int base_addr; + int range; + int virt_base; + int virt_offset; + int mmap_file; + int *mmap_addr; + int *mmap_space; +}; + +void read_graph(const char *filename, + std::vector *edge_list, + unsigned int num_pe, + std::vector &neighbor_list, + std::vector &offset_list) +{ + std::ifstream ifs(filename); + + int degree_count = 0; + int prev_node = 0; + int pe_idx = 0; + offset_list.push_back(0); + + if (ifs.is_open() && ifs.good()) + { + std::string str; + while (std::getline(ifs, str)) + { + if (!str.empty() && str[0] != '#') + { + std::istringstream ss(str); + int u, v; + ss >> u >> v; + if (prev_node != v) + { + offset_list.push_back(degree_count); + } + + prev_node = v; + if (u < v) + { + edge_list[pe_idx % num_pe].push_back(v); + edge_list[pe_idx % num_pe].push_back(u); + pe_idx++; + } + else + { + neighbor_list.push_back(u); + degree_count++; + } + } + } + } + ifs.close(); + offset_list.push_back(degree_count); +// num_edge = edge_list.size() / 2; +} + +int main( int argc, char** argv ) +{ + + auto t_start = std::chrono::high_resolution_clock::now(); + +// int num_edge = 0; + std::vector edge_list[7], neighbor_list, offset_list; + read_graph("../graph/soc-Epinions1_adj.tsv", edge_list, 7, neighbor_list, offset_list); + std::cout << "neighbor_list size= " << neighbor_list.size() << std::endl; + std::cout << "offset_list size= " << offset_list.size() << std::endl; + + auto t_file_done = std::chrono::high_resolution_clock::now(); + + int *edges0 = (int *)cma_alloc( edge_list[0].size()*sizeof(int), false); + int *edges1 = (int *)cma_alloc( edge_list[1].size()*sizeof(int), false); + int *edges2 = (int *)cma_alloc( edge_list[2].size()*sizeof(int), false); + int *edges3 = (int *)cma_alloc( edge_list[3].size()*sizeof(int), false); + int *edges4 = (int *)cma_alloc( edge_list[4].size()*sizeof(int), false); + int *edges5 = (int *)cma_alloc( edge_list[5].size()*sizeof(int), false); + int *edges6 = (int *)cma_alloc( edge_list[6].size()*sizeof(int), false); + int *neighbors = (int *)cma_alloc(neighbor_list.size()*sizeof(int), 
+void read_graph(const char *filename,
+                std::vector<int> *edge_list,
+                unsigned int num_pe,
+                std::vector<int> &neighbor_list,
+                std::vector<int> &offset_list)
+{
+    std::ifstream ifs(filename);
+
+    int degree_count = 0;
+    int prev_node = 0;
+    int pe_idx = 0;
+    offset_list.push_back(0);
+
+    if (ifs.is_open() && ifs.good())
+    {
+        std::string str;
+        while (std::getline(ifs, str))
+        {
+            if (!str.empty() && str[0] != '#')
+            {
+                std::istringstream ss(str);
+                int u, v;
+                ss >> u >> v;
+                if (prev_node != v)
+                {
+                    offset_list.push_back(degree_count);
+                }
+
+                prev_node = v;
+                if (u < v)
+                {
+                    edge_list[pe_idx % num_pe].push_back(v);
+                    edge_list[pe_idx % num_pe].push_back(u);
+                    pe_idx++;
+                }
+                else
+                {
+                    neighbor_list.push_back(u);
+                    degree_count++;
+                }
+            }
+        }
+    }
+    ifs.close();
+    offset_list.push_back(degree_count);
+// num_edge = edge_list.size() / 2;
+}
+
+int main( int argc, char** argv )
+{
+
+    auto t_start = std::chrono::high_resolution_clock::now();
+
+// int num_edge = 0;
+    std::vector<int> edge_list[7], neighbor_list, offset_list;
+    read_graph("../graph/soc-Epinions1_adj.tsv", edge_list, 7, neighbor_list, offset_list);
+    std::cout << "neighbor_list size= " << neighbor_list.size() << std::endl;
+    std::cout << "offset_list size= " << offset_list.size() << std::endl;
+
+    auto t_file_done = std::chrono::high_resolution_clock::now();
+
+    int *edges0 = (int *)cma_alloc( edge_list[0].size()*sizeof(int), false);
+    int *edges1 = (int *)cma_alloc( edge_list[1].size()*sizeof(int), false);
+    int *edges2 = (int *)cma_alloc( edge_list[2].size()*sizeof(int), false);
+    int *edges3 = (int *)cma_alloc( edge_list[3].size()*sizeof(int), false);
+    int *edges4 = (int *)cma_alloc( edge_list[4].size()*sizeof(int), false);
+    int *edges5 = (int *)cma_alloc( edge_list[5].size()*sizeof(int), false);
+    int *edges6 = (int *)cma_alloc( edge_list[6].size()*sizeof(int), false);
+    int *neighbors = (int *)cma_alloc(neighbor_list.size()*sizeof(int), false);
+    int *offsets = (int *)cma_alloc( offset_list.size()*sizeof(int), false);
+    int *progress = (int *)cma_alloc( 5*sizeof(int), false);
+
+    auto t_malloc_done = std::chrono::high_resolution_clock::now();
+
+    std::memcpy(edges0   , edge_list[0].data(),  edge_list[0].size()*sizeof(int));
+    std::memcpy(edges1   , edge_list[1].data(),  edge_list[1].size()*sizeof(int));
+    std::memcpy(edges2   , edge_list[2].data(),  edge_list[2].size()*sizeof(int));
+    std::memcpy(edges3   , edge_list[3].data(),  edge_list[3].size()*sizeof(int));
+    std::memcpy(edges4   , edge_list[4].data(),  edge_list[4].size()*sizeof(int));
+    std::memcpy(edges5   , edge_list[5].data(),  edge_list[5].size()*sizeof(int));
+    std::memcpy(edges6   , edge_list[6].data(),  edge_list[6].size()*sizeof(int));
+    std::memcpy(neighbors, neighbor_list.data(), neighbor_list.size()*sizeof(int));
+    std::memcpy(offsets  , offset_list.data(),   offset_list.size()*sizeof(int));
+
+    auto t_memcpy_done = std::chrono::high_resolution_clock::now();
+
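+// Seven identical PEs are mapped at 0x43C00000..0x43C60000 (0x10000 stride).
+// program() pushes the full bitstream through /dev/xdevcfg, so calling it on
+// acc0 alone configures all seven instances.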
" << std::endl; + + while(!acc2.done()) + { + tik++; + if ((tik % 10000) == 0) std::cout << "."; + } + std::cout << "acc2 done! " << std::endl; + + while(!acc3.done()) + { + tik++; + if ((tik % 10000) == 0) std::cout << "."; + } + std::cout << "acc3 done! " << std::endl; + + while(!acc4.done()) + { + tik++; + if ((tik % 10000) == 0) std::cout << "."; + } + std::cout << "acc4 done! " << std::endl; + + while(!acc5.done()) + { + tik++; + if ((tik % 10000) == 0) std::cout << "."; + } + std::cout << "acc5 done! " << std::endl; + + while(!acc6.done()) + { + tik++; + if ((tik % 10000) == 0) std::cout << "."; + } + std::cout << "acc6 done! " << std::endl; + + auto t_acc_finish = std::chrono::high_resolution_clock::now(); + + cout << "\ndone execute.." << endl; + + std::cout << "result = " << acc0.get_return() + acc1.get_return() + acc2.get_return() + acc3.get_return() + + acc4.get_return() + acc5.get_return() + acc6.get_return() << std::endl; + + std::cout << "acc0 result = " << acc0.get_return() << std::endl; + std::cout << "acc1 result = " << acc1.get_return() << std::endl; + std::cout << "acc2 result = " << acc2.get_return() << std::endl; + std::cout << "acc3 result = " << acc3.get_return() << std::endl; + std::cout << "acc4 result = " << acc4.get_return() << std::endl; + std::cout << "acc5 result = " << acc5.get_return() << std::endl; + std::cout << "acc6 result = " << acc6.get_return() << std::endl; + + std::chrono::duration total_io_time = t_file_done - t_start; + std::chrono::duration total_malloc_time = t_malloc_done - t_file_done; + std::chrono::duration total_memcpy_time = t_memcpy_done - t_malloc_done; + std::chrono::duration total_program_time = t_program_done - t_memcpy_done; + std::chrono::duration total_exec_time = t_acc_finish - t_acc_start; + std::cout << "File IO time: " << total_io_time.count() << "s" << std::endl; + std::cout << "CMA alloc time: " << total_malloc_time.count() << "s" << std::endl; + std::cout << "Memcpy time: " << total_memcpy_time.count() << "s" << std::endl; + std::cout << "FPGA program time: " << total_program_time.count() << "s" << std::endl; + std::cout << "Kernel exec time: " << total_exec_time.count() << "s" << std::endl; + + cma_free(edges0); + cma_free(edges1); + cma_free(edges2); + cma_free(edges3); + cma_free(edges4); + cma_free(edges5); + cma_free(edges6); + cma_free(neighbors); + cma_free(offsets); + cma_free(progress); + + return 0; +} diff --git a/triangle_counting_host/cpp/tc_1pe.cpp b/triangle_counting_host/cpp/tc_1pe.cpp new file mode 100644 index 0000000..fbbdedd --- /dev/null +++ b/triangle_counting_host/cpp/tc_1pe.cpp @@ -0,0 +1,214 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // for high_resolution_clock + +extern "C" +{ +#include "libxlnk_cma.h" +} + +using namespace std; + +class accelerator +{ +public: + accelerator(int base_addr=0x43C00000, int range=0x00010000) : base_addr(base_addr), range(range) + { + // virt_base = base_addr & ~(getpagesize() - 1); + virt_base = base_addr & ~(sysconf(_SC_PAGE_SIZE) - 1); + virt_offset = base_addr - virt_base; + mmap_file = open("/dev/mem", O_RDWR | O_SYNC); + if (mmap_file == -1) + cout << "Unable to open /dev/mem" << endl; + mmap_addr = (int*)mmap(NULL, range + virt_offset, PROT_READ | PROT_WRITE, + MAP_SHARED, mmap_file, virt_base); + if (mmap_addr == MAP_FAILED) + cout << "mmap fails. 
" << endl; + + mmap_space = mmap_addr + virt_offset; + } + ~accelerator() { close(mmap_file); } + + int get(int offset) { return mmap_space[offset >> 2]; } + + void set(int offset, int value) { mmap_space[offset >> 2] = value; } + + void start() { mmap_space[0x00] |= 1; } + + bool done() { return (mmap_space[0x00] & (1 << 1)); } + + bool idle() { return (mmap_space[0x00] & (1 << 2)); } + + bool ready() { return (mmap_space[0x00] & (1 << 3)); } + + int get_return() { return mmap_space[0x10 >> 2]; } + + int program(string bitfile_name) + { + char buf[4194304]; + const string BS_XDEVCFG = "/dev/xdevcfg"; + const string BS_IS_PARTIAL = "/sys/devices/soc0/amba/f8007000.devcfg/is_partial_bitstream"; + + int partial_bs_dev = open(BS_IS_PARTIAL.c_str(), O_WRONLY | O_NONBLOCK); + if (partial_bs_dev < 0) + { + printf("ERROR opening %s\n", BS_IS_PARTIAL.c_str()); + return -1; + } + int write_size = write(partial_bs_dev, "0", 1); + + int fpga_dev = open(BS_XDEVCFG.c_str(), O_WRONLY | O_NONBLOCK); + // int fpga_dev = open(BS_XDEVCFG.c_str(), O_WRONLY); + if (fpga_dev < 0) + { + printf("ERROR opening %s\n", BS_XDEVCFG.c_str()); + return -1; + } + + int bit_file = open(bitfile_name.c_str(), O_RDONLY); + if (bit_file < 0) + { + printf("ERROR opening %s\n", bitfile_name.c_str()); + return -1; + } + + int bit_file_size = read(bit_file, buf, 4194304); + write_size = write(fpga_dev, buf, bit_file_size); + + close(partial_bs_dev); + close(fpga_dev); + close(bit_file); + return 0; + } + +private: + int base_addr; + int range; + int virt_base; + int virt_offset; + int mmap_file; + int *mmap_addr; + int *mmap_space; +}; + +void read_graph(const char *filename, + std::vector &edge_list, + std::vector &neighbor_list, + std::vector &offset_list, + int &num_edge) +{ + std::ifstream ifs(filename); + + int degree_count = 0; + int prev_node = 0; + offset_list.push_back(0); + + if (ifs.is_open() && ifs.good()) + { + std::string str; + while (std::getline(ifs, str)) + { + if (!str.empty() && str[0] != '#') + { + std::istringstream ss(str); + int u, v; + ss >> u >> v; + if (prev_node != v) + { + offset_list.push_back(degree_count); + } + + prev_node = v; + if (u < v) + { + edge_list.push_back(v); + edge_list.push_back(u); + } + else + { + neighbor_list.push_back(u); + degree_count++; + } + } + } + } + ifs.close(); + offset_list.push_back(degree_count); + num_edge = edge_list.size() / 2; +} + +int main( int argc, char** argv ) +{ + + auto t_start = std::chrono::high_resolution_clock::now(); + + int num_edge = 0; + std::vector edge_list, neighbor_list, offset_list; + read_graph("../graph/soc-Epinions1_adj.tsv", edge_list, neighbor_list, offset_list, num_edge); + std::cout << "neighbor_list size= " << neighbor_list.size() << std::endl; + std::cout << "offset_list size= " << offset_list.size() << std::endl; + std::cout << "edge_list size= " << edge_list.size() << std::endl; + std::cout << "initialized num_edge = " << num_edge << std::endl; + + int *edges = (int *)cma_alloc( edge_list.size()*sizeof(int), false); + int *neighbors = (int *)cma_alloc(neighbor_list.size()*sizeof(int), false); + int *offsets = (int *)cma_alloc( offset_list.size()*sizeof(int), false); + int *progress = (int *)cma_alloc( 5*sizeof(int), false); + + std::memcpy(edges , edge_list.data(), edge_list.size()*sizeof(int)); + std::memcpy(neighbors, neighbor_list.data(), neighbor_list.size()*sizeof(int)); + std::memcpy(offsets , offset_list.data(), offset_list.size()*sizeof(int)); + + accelerator acc; + acc.program("/home/xilinx/code/tc/triangle_counting.bit"); 
diff --git a/triangle_counting_host/python/graph_parser.py b/triangle_counting_host/python/graph_parser.py
new file mode 100644
index 0000000..5a42a09
--- /dev/null
+++ b/triangle_counting_host/python/graph_parser.py
@@ -0,0 +1,37 @@
+
+neighbor_list = []
+offset_list = [0]
+edge_list = []
+
+graph_file = open("graph/test.tsv")
+lines = graph_file.readlines()
+
+degree_count = 0
+prev_node = 0
+
+for line in lines:
+    node_a, node_b, _ = map(int, line.split())
+    if prev_node != node_b:
+        offset_list.append(degree_count)
+
+    prev_node = node_b
+    if node_a < node_b:
+        edge_list.extend([node_b, node_a])
+    else:
+        neighbor_list.append(node_a)
+        degree_count += 1
+
+offset_list.append(degree_count)
+
+graph_file.close()
+
+print("neighbor_list size = ", len(neighbor_list))
+print("offset_list size = ", len(offset_list))
+print("edge_list size = ", len(edge_list))
+
+f = open("test_parsed.tsv", "w")
+f.write("%d %d %d\n" % (len(neighbor_list), len(offset_list), len(edge_list)))
+f.write(" ".join(str(e) for e in neighbor_list) + "\n")
+f.write(" ".join(str(e) for e in offset_list) + "\n")
+f.write(" ".join(str(e) for e in edge_list) + "\n")
+f.close()
diff --git a/triangle_counting_host/python/intersect_host.py b/triangle_counting_host/python/intersect_host.py
new file mode 100644
index 0000000..b9afa3a
--- /dev/null
+++ b/triangle_counting_host/python/intersect_host.py
@@ -0,0 +1,52 @@
+# coding: utf-8
+
+import sys
+import numpy as np
+import os
+import time
+from datetime import datetime
+from pynq import Xlnk
+from pynq import Overlay
+
+# load our design overlay
+overlay = Overlay('intersect_hw.bit')
+print("intersect_hw.bit loaded")
+
+myIP = overlay.intersect_0
+
+xlnk = Xlnk()
+
+t1 = time.time()
+
+input_a = xlnk.cma_array(shape=(4096,), dtype=np.int32)
+input_b = xlnk.cma_array(shape=(4096,), dtype=np.int32)
+
+for i in range(4096):
+    input_a[i] = i
+    input_b[i] = i + 1
+
+myIP.write(0x18, input_a.physical_address)
+myIP.write(0x20, input_b.physical_address)
+
+myIP.write(0x28, 2)
+myIP.write(0x30, 2)
+
+
+t2 = time.time()
+t = t2 - t1
+print("Preparing input data time: ", str(t))
+
+isready = 0
+myIP.write(0x00, 1)
+
+while( isready != 6 ):
+    isready = myIP.read(0x00)
+
+t3 = time.time()
+t = t3 - t2
+#tbatch = tbatch + t
+#print("Computation finished")
+print("PL Time: ", str(t))
+
+print("Return value: ", myIP.read(0x10))
+
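The `isready != 6` loops in these scripts test the same bits the C++ hosts read through done() and idle(): ap_done is bit 1 (0x2) and ap_idle is bit 2 (0x4) of the control register, so a value of 6 means the run has finished and the core has returned to idle.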
diff --git a/triangle_counting_host/python/tc_host.py b/triangle_counting_host/python/tc_host.py
new file mode 100644
index 0000000..046f90d
--- /dev/null
+++ b/triangle_counting_host/python/tc_host.py
@@ -0,0 +1,101 @@
+# coding: utf-8
+
+import sys
+import numpy as np
+import os
+import time
+from datetime import datetime
+from pynq import Xlnk
+from pynq import Overlay
+
+# load our design overlay
+overlay = Overlay('triangle_counting.bit')
+print("triangle_counting.bit loaded")
+
+myIP = overlay.triangle_counting_0
+
+t0 = time.time()
+
+neighbor_list = []
+offset_list = [0]
+edge_list = []
+
+graph_file = open("graph/soc-Epinions1_adj.tsv")
+# graph_file = open("graph/test.tsv")
+lines = graph_file.readlines()
+
+degree_count = 0
+prev_node = 0
+
+for line in lines:
+    node_a, node_b, _ = map(int, line.split())
+    if prev_node != node_b:
+        offset_list.append(degree_count)
+
+    prev_node = node_b
+    if node_a < node_b:
+        edge_list.extend([node_b, node_a])
+    else:
+        neighbor_list.append(node_a)
+        degree_count += 1
+
+offset_list.append(degree_count)
+
+print("neighbor_list size= ", len(neighbor_list))
+print("offset_list size= ", len(offset_list))
+print("edge_list size= ", len(edge_list))
+
+t1 = time.time()
+
+print("Finished reading graph file. ")
+t = t1 - t0
+print("Reading input file time: ", str(t))
+
+xlnk = Xlnk()
+
+neighbor = xlnk.cma_array(shape=(len(neighbor_list),), dtype=np.int32)
+offset = xlnk.cma_array(shape=(len(offset_list),), dtype=np.int32)
+edge = xlnk.cma_array(shape=(len(edge_list),), dtype=np.int32)
+progress = xlnk.cma_array(shape=(5,), dtype=np.int32)
+
+neighbor[:] = neighbor_list
+offset[:] = offset_list
+edge[:] = edge_list
+
+# neighbor[:] = [2, 4, 5, 3, 4, 5, 4, 5, 5]
+# offset[:] = [0, 0, 3, 6, 8, 9, 9]
+# edge[:] = [5, 4, 5, 3, 5, 2, 5, 1, 4, 3, 4, 2, 4, 1, 3, 2, 2, 1]
+
+myIP.write(0x18, neighbor.physical_address)
+myIP.write(0x20, offset.physical_address)
+myIP.write(0x28, edge.physical_address)
+myIP.write(0x30, len(edge_list))
+myIP.write(0x38, progress.physical_address)
+
+# for i in range(neighbor.size):
+#     print("neighbor[%d] = %d" % (i, neighbor[i]))
+
+# for i in range(offset.size):
+#     print("offset[%d] = %d" % (i, offset[i]))
+
+# for i in range(edge.size):
+#     print("edge[%d] = %d" % (i, edge[i]))
+
+t2 = time.time()
+t = t2 - t1
+print("Preparing input data time: ", str(t))
+
+isready = 0
+myIP.write(0x00, 1)
+
+while( isready != 6 ):
+#     print(progress[0], progress[1], progress[2], progress[3], progress[4])
+    isready = myIP.read(0x00)
+
+t3 = time.time()
+t = t3 - t2
+#tbatch = tbatch + t
+#print("Computation finished")
+print("PL Time: ", str(t))
+
+print("Return value: ", myIP.read(0x10))
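tc_host_opt_4.py below drives four identical PEs with copy-pasted register writes. The same configuration can be expressed as a loop over the instances (a sketch reusing the script's own names), which keeps the host independent of the PE count:

    accs = [overlay.triangle_counting_0, overlay.triangle_counting_1,
            overlay.triangle_counting_2, overlay.triangle_counting_3]
    edges = [edge1, edge2, edge3, edge4]
    for acc, e in zip(accs, edges):
        acc.write(0x18, neighbor.physical_address)
        acc.write(0x20, offset.physical_address)
        acc.write(0x28, e.physical_address)
        acc.write(0x30, len(e))
        acc.write(0x38, progress.physical_address)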
diff --git a/triangle_counting_host/python/tc_host_opt_4.py b/triangle_counting_host/python/tc_host_opt_4.py
new file mode 100644
index 0000000..9e295d2
--- /dev/null
+++ b/triangle_counting_host/python/tc_host_opt_4.py
@@ -0,0 +1,162 @@
+# coding: utf-8
+
+import sys
+import numpy as np
+import os
+import time
+import math
+from datetime import datetime
+from pynq import Xlnk
+from pynq import Overlay
+
+# load our design overlay
+overlay = Overlay('tc_opt_4.bit')
+print("tc_opt_4.bit loaded")
+
+acc0 = overlay.triangle_counting_0
+acc1 = overlay.triangle_counting_1
+acc2 = overlay.triangle_counting_2
+acc3 = overlay.triangle_counting_3
+
+t0 = time.time()
+
+neighbor_list = []
+offset_list = [0]
+edge_list = []
+
+graph_file = open("../../graph/soc-Epinions1_adj.tsv")
+# graph_file = open("graph/test.tsv")
+lines = graph_file.readlines()
+
+degree_count = 0
+prev_node = 0
+
+for line in lines:
+    node_a, node_b, _ = map(int, line.split())
+    if prev_node != node_b:
+        offset_list.append(degree_count)
+
+    prev_node = node_b
+    if node_a < node_b:
+        edge_list.extend([node_b, node_a])
+    else:
+        neighbor_list.append(node_a)
+        degree_count += 1
+
+offset_list.append(degree_count)
+
+print("neighbor_list size= ", len(neighbor_list))
+print("offset_list size= ", len(offset_list))
+print("edge_list size= ", len(edge_list))
+
+t1 = time.time()
+
+print("Finished reading graph file. ")
+t = t1 - t0
+print("Reading input file time: ", str(t))
+
+xlnk = Xlnk()
+
+num_edge = int(len(edge_list) / 2)
+num_batch = 4
+num_edge_batch = int(math.floor(float(num_edge) / num_batch))
+num_edge_last_batch = num_edge - (num_batch-1)*num_edge_batch
+
+print(num_edge)
+print(num_batch)
+print(num_edge_batch)
+print(num_edge_last_batch)
+
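+# Edges are split with floor division; the remainder lands in the last batch,
+# so num_edge_last_batch >= num_edge_batch and every edge is assigned exactly once.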
+neighbor = xlnk.cma_array(shape=(len(neighbor_list),), dtype=np.int32)
+offset = xlnk.cma_array(shape=(len(offset_list),), dtype=np.int32)
+edge1 = xlnk.cma_array(shape=(2*num_edge_batch,), dtype=np.int32)
+edge2 = xlnk.cma_array(shape=(2*num_edge_batch,), dtype=np.int32)
+edge3 = xlnk.cma_array(shape=(2*num_edge_batch,), dtype=np.int32)
+edge4 = xlnk.cma_array(shape=(2*num_edge_last_batch,), dtype=np.int32)
+progress = xlnk.cma_array(shape=(5,), dtype=np.int32)
+
+neighbor[:] = neighbor_list
+offset[:] = offset_list
+edge1[:] = edge_list[0:2*num_edge_batch]
+edge2[:] = edge_list[2*num_edge_batch:4*num_edge_batch]
+edge3[:] = edge_list[4*num_edge_batch:6*num_edge_batch]
+edge4[:] = edge_list[6*num_edge_batch:]
+
+# neighbor[:] = [2, 4, 5, 3, 4, 5, 4, 5, 5]
+# offset[:] = [0, 0, 3, 6, 8, 9, 9]
+# edge[:] = [5, 4, 5, 3, 5, 2, 5, 1, 4, 3, 4, 2, 4, 1, 3, 2, 2, 1]
+
+acc0.write(0x00018, neighbor.physical_address)
+acc0.write(0x00020, offset.physical_address)
+acc0.write(0x00028, edge1.physical_address)
+acc0.write(0x00030, 2*num_edge_batch)
+acc0.write(0x00038, progress.physical_address)
+
+acc1.write(0x00018, neighbor.physical_address)
+acc1.write(0x00020, offset.physical_address)
+acc1.write(0x00028, edge2.physical_address)
+acc1.write(0x00030, 2*num_edge_batch)
+acc1.write(0x00038, progress.physical_address)
+
+acc2.write(0x00018, neighbor.physical_address)
+acc2.write(0x00020, offset.physical_address)
+acc2.write(0x00028, edge3.physical_address)
+acc2.write(0x00030, 2*num_edge_batch)
+acc2.write(0x00038, progress.physical_address)
+
+acc3.write(0x00018, neighbor.physical_address)
+acc3.write(0x00020, offset.physical_address)
+acc3.write(0x00028, edge4.physical_address)
+acc3.write(0x00030, 2*num_edge_last_batch)
+acc3.write(0x00038, progress.physical_address)
+
+# for i in range(neighbor.size):
+#     print("neighbor[%d] = %d" % (i, neighbor[i]))
+
+# for i in range(offset.size):
+#     print("offset[%d] = %d" % (i, offset[i]))
+
+# for i in range(edge.size):
+#     print("edge[%d] = %d" % (i, edge[i]))
+
+t2 = time.time()
+t = t2 - t1
+print("Preparing input data time: ", str(t))
+
+acc0.write(0x00000, 1)
+acc1.write(0x00000, 1)
+acc2.write(0x00000, 1)
+acc3.write(0x00000, 1)
+
+isready = 0
+while( isready != 6 ):
+    isready = acc0.read(0x00000)
+
+isready = 0
+while( isready != 6 ):
+    isready = acc1.read(0x00000)
+
+isready = 0
+while( isready != 6 ):
+    isready = acc2.read(0x00000)
+
+isready = 0
+while( isready != 6 ):
+    isready = acc3.read(0x00000)
+
+t3 = time.time()
+t = t3 - t2
+#tbatch = tbatch + t
+#print("Computation finished")
+print("PL Time: ", str(t))
+
+result1 = acc0.read(0x00010)
+result2 = acc1.read(0x00010)
+result3 = acc2.read(0x00010)
+result4 = acc3.read(0x00010)
+
+print("Return value 1: ", result1)
+print("Return value 2: ", result2)
+print("Return value 3: ", result3)
+print("Return value 4: ", result4)
+print("Number of triangles: ", result1+result2+result3+result4)
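tc_host_opt_7.py repeats the same batching scheme with seven PEs, matching the seven instances the C++ host maps at 0x43C00000 through 0x43C60000. Note that every PE is handed the same progress buffer; it is only read back for debugging here, and with several concurrent writers its contents are indicative at best.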
diff --git a/triangle_counting_host/python/tc_host_opt_7.py b/triangle_counting_host/python/tc_host_opt_7.py
new file mode 100644
index 0000000..22df986
--- /dev/null
+++ b/triangle_counting_host/python/tc_host_opt_7.py
@@ -0,0 +1,210 @@
+# coding: utf-8
+
+import sys
+import numpy as np
+import os
+import time
+import math
+from datetime import datetime
+from pynq import Xlnk
+from pynq import Overlay
+
+# load our design overlay
+overlay = Overlay('tc_opt.bit')
+print("tc_opt.bit loaded")
+
+acc0 = overlay.triangle_counting_0
+acc1 = overlay.triangle_counting_1
+acc2 = overlay.triangle_counting_2
+acc3 = overlay.triangle_counting_3
+acc4 = overlay.triangle_counting_4
+acc5 = overlay.triangle_counting_5
+acc6 = overlay.triangle_counting_6
+
+t0 = time.time()
+
+neighbor_list = []
+offset_list = [0]
+edge_list = []
+
+graph_file = open("graph/soc-Epinions1_adj.tsv")
+# graph_file = open("graph/test.tsv")
+lines = graph_file.readlines()
+
+degree_count = 0
+prev_node = 0
+
+for line in lines:
+    node_a, node_b, _ = map(int, line.split())
+    if prev_node != node_b:
+        offset_list.append(degree_count)
+
+    prev_node = node_b
+    if node_a < node_b:
+        edge_list.extend([node_b, node_a])
+    else:
+        neighbor_list.append(node_a)
+        degree_count += 1
+
+offset_list.append(degree_count)
+
+print("neighbor_list size= ", len(neighbor_list))
+print("offset_list size= ", len(offset_list))
+print("edge_list size= ", len(edge_list))
+
+t1 = time.time()
+
+print("Finished reading graph file. ")
+t = t1 - t0
+print("Reading input file time: ", str(t))
+
+xlnk = Xlnk()
+
+num_edge = int(len(edge_list) / 2)
+num_batch = 7
+num_edge_batch = int(math.floor(float(num_edge) / num_batch))
+num_edge_last_batch = num_edge - (num_batch-1)*num_edge_batch
+
+print(num_edge)
+print(num_batch)
+print(num_edge_batch)
+print(num_edge_last_batch)
+
+neighbor = xlnk.cma_array(shape=(len(neighbor_list),), dtype=np.int32)
+offset = xlnk.cma_array(shape=(len(offset_list),), dtype=np.int32)
+edge1 = xlnk.cma_array(shape=(2*num_edge_batch,), dtype=np.int32)
+edge2 = xlnk.cma_array(shape=(2*num_edge_batch,), dtype=np.int32)
+edge3 = xlnk.cma_array(shape=(2*num_edge_batch,), dtype=np.int32)
+edge4 = xlnk.cma_array(shape=(2*num_edge_batch,), dtype=np.int32)
+edge5 = xlnk.cma_array(shape=(2*num_edge_batch,), dtype=np.int32)
+edge6 = xlnk.cma_array(shape=(2*num_edge_batch,), dtype=np.int32)
+edge7 = xlnk.cma_array(shape=(2*num_edge_last_batch,), dtype=np.int32)
+progress = xlnk.cma_array(shape=(5,), dtype=np.int32)
+
+neighbor[:] = neighbor_list
+offset[:] = offset_list
+edge1[:] = edge_list[0:2*num_edge_batch]
+edge2[:] = edge_list[2*num_edge_batch:4*num_edge_batch]
+edge3[:] = edge_list[4*num_edge_batch:6*num_edge_batch]
+edge4[:] = edge_list[6*num_edge_batch:8*num_edge_batch]
+edge5[:] = edge_list[8*num_edge_batch:10*num_edge_batch]
+edge6[:] = edge_list[10*num_edge_batch:12*num_edge_batch]
+edge7[:] = edge_list[12*num_edge_batch:]
+
+# neighbor[:] = [2, 4, 5, 3, 4, 5, 4, 5, 5]
+# offset[:] = [0, 0, 3, 6, 8, 9, 9]
+# edge[:] = [5, 4, 5, 3, 5, 2, 5, 1, 4, 3, 4, 2, 4, 1, 3, 2, 2, 1]
+
+acc0.write(0x18, neighbor.physical_address)
+acc0.write(0x20, offset.physical_address)
+acc0.write(0x28, edge1.physical_address)
+acc0.write(0x30, 2*num_edge_batch)
+acc0.write(0x38, progress.physical_address)
+
+acc1.write(0x18, neighbor.physical_address)
+acc1.write(0x20, offset.physical_address)
+acc1.write(0x28, edge2.physical_address)
+acc1.write(0x30, 2*num_edge_batch)
+acc1.write(0x38, progress.physical_address)
+
+acc2.write(0x18, neighbor.physical_address)
+acc2.write(0x20, offset.physical_address)
+acc2.write(0x28, edge3.physical_address)
+acc2.write(0x30, 2*num_edge_batch)
+acc2.write(0x38, progress.physical_address)
+
+acc3.write(0x18, neighbor.physical_address)
+acc3.write(0x20, offset.physical_address)
+acc3.write(0x28, edge4.physical_address)
+acc3.write(0x30, 2*num_edge_batch)
+acc3.write(0x38, progress.physical_address)
+
+acc4.write(0x18, neighbor.physical_address)
+acc4.write(0x20, offset.physical_address)
+acc4.write(0x28, edge5.physical_address)
+acc4.write(0x30, 2*num_edge_batch)
+acc4.write(0x38, progress.physical_address)
+
+acc5.write(0x18, neighbor.physical_address)
+acc5.write(0x20, offset.physical_address)
+acc5.write(0x28, edge6.physical_address)
+acc5.write(0x30, 2*num_edge_batch)
+acc5.write(0x38, progress.physical_address)
+
+acc6.write(0x18, neighbor.physical_address)
+acc6.write(0x20, offset.physical_address)
+acc6.write(0x28, edge7.physical_address)
+acc6.write(0x30, 2*num_edge_last_batch)
+acc6.write(0x38, progress.physical_address)
+
+# for i in range(neighbor.size):
+#     print("neighbor[%d] = %d" % (i, neighbor[i]))
+
+# for i in range(offset.size):
+#     print("offset[%d] = %d" % (i, offset[i]))
+
+# for i in range(edge.size):
+#     print("edge[%d] = %d" % (i, edge[i]))
+
+t2 = time.time()
+t = t2 - t1
+print("Preparing input data time: ", str(t))
+
+acc0.write(0x00, 1)
+acc1.write(0x00, 1)
+acc2.write(0x00, 1)
+acc3.write(0x00, 1)
+acc4.write(0x00, 1)
+acc5.write(0x00, 1)
+acc6.write(0x00, 1)
+
+isready = 0
+while( isready != 6 ):
+    isready = acc0.read(0x00)
+
+isready = 0
+while( isready != 6 ):
+    isready = acc1.read(0x00)
+
+isready = 0
+while( isready != 6 ):
+    isready = acc2.read(0x00)
+
+isready = 0
+while( isready != 6 ):
+    isready = acc3.read(0x00)
+
+isready = 0
+while( isready != 6 ):
+    isready = acc4.read(0x00)
+
+isready = 0
+while( isready != 6 ):
+    isready = acc5.read(0x00)
+
+isready = 0
+while( isready != 6 ):
+    isready = acc6.read(0x00)
+
+t3 = time.time()
+t = t3 - t2
+#tbatch = tbatch + t
+#print("Computation finished")
+print("PL Time: ", str(t))
+
+result1 = acc0.read(0x10)
+result2 = acc1.read(0x10)
+result3 = acc2.read(0x10)
+result4 = acc3.read(0x10)
+result5 = acc4.read(0x10)
+result6 = acc5.read(0x10)
+result7 = acc6.read(0x10)
+
+print("Return value 1: ", result1)
+print("Return value 2: ", result2)
+print("Return value 3: ", result3)
+print("Return value 4: ", result4)
+print("Return value 5: ", result5)
+print("Return value 6: ", result6)
+print("Return value 7: ", result7)
+print("Number of triangles: ", result1+result2+result3+result4+result5+result6+result7)