accel-sim · yechen3 · Feb 14, 2018 · Feb 14, 2018 · Feb 14, 2018 · Feb 14, 2018
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -0,0 +1,152 @@
+pipeline {
+    agent {
+        label "purdue-cluster"  // only schedule this pipeline on nodes labelled "purdue-cluster"
+    }
+    // Pipeline-wide options.
+    options {
+        disableConcurrentBuilds()  // do not run two builds of this job concurrently
+    }
+    stages {
+        /*
+        stage('formatting-check') {
+          steps {
+            sh '''
+              source ./env-setup/common/export_gcc_version.sh 5.3.0
+              git remote add upstream https://github.com/purdue-aalp/gpgpu-sim_distribution
+              git fetch upstream
+              if git diff --name-only upstream/dev | grep -E "[.](cc|h|cpp|hpp)$" ; then
+                git diff --name-only upstream/dev | grep -E "[.](cc|h|cpp|hpp)$" | xargs ./run-clang-format.py --clang-format-executable /home/tgrogers-raid/a/common/clang-format/6.0.1/clang-format
+              fi
+            ''' 
+          }
+        }
+        */
+        stage('env-setup') {  // fetch the cluster environment scripts that later stages source
+            steps {  // always start from a fresh clone, then switch to the cluster-ubuntu ref
+                sh 'rm -rf env-setup && git clone git@github.com:purdue-aalp/env-setup.git &&\
+                    cd env-setup && git checkout cluster-ubuntu'
+            }
+        }
+        stage('simulator-build') {  // build the simulator from the workspace root
+            steps {  // explicit bash shebang because the script uses `source`; both env scripts are sourced before make
+                sh '''#!/bin/bash
+                    source ./env-setup/11.0_env_setup.sh
+                    source `pwd`/setup_environment
+                    make -j 10'''
+            }
+        }
+        stage('simulations-build'){
+            steps{
+                sh 'rm -rf gpgpu-sim_simulations && git clone git@github.com:purdue-aalp/gpgpu-sim_simulations.git && \
+                    cd gpgpu-sim_simulations && \
+                    git pull && \
+                    ln -s /home/tgrogers-raid/a/common/data_dirs benchmarks/'
+                // A script with its own shebang is not run with -e, so cd/build failures are propagated explicitly.
+                sh '''#!/bin/bash
+                    source ./env-setup/11.0_env_setup.sh
+                    source `pwd`/setup_environment
+                    cd gpgpu-sim_simulations || exit 1
+                    source ./benchmarks/src/setup_environment
+                    make -j 10 -C ./benchmarks/src/ rodinia_2.0-ft &&
+                    make -C ./benchmarks/src data'''
+            }
+        }
+        stage('11.0 UVM Regressions'){  // rodinia_2.0-ft runs on the GTX1080Ti_UVM config
+            steps {  // run name ends in $$ (the shell's PID); launch, create the remote plot dir, then run monitor_func_test.py on the same name
+                    sh '''#!/bin/bash
+                        source ./env-setup/11.0_env_setup.sh
+                        source `pwd`/setup_environment
+                        ./gpgpu-sim_simulations/util/job_launching/run_simulations.py -B rodinia_2.0-ft -C GTX1080Ti_UVM -N regress-UVM-$$ 
+                        PLOTDIR="jenkins/${JOB_NAME}/${BUILD_NUMBER}/11.0" && ssh [email protected] mkdir -p /home/dynamo/a/tgrogers/website/gpgpu-sim-plots/$PLOTDIR
+                        ./gpgpu-sim_simulations/util/job_launching/monitor_func_test.py -v -s stats-per-app-11.0.csv -N regress-UVM-$$'''
+            }
+        }
+        stage('11.0 Regular Regressions'){  // rodinia_2.0-ft runs on the QV100 config
+            steps {  // NOTE(review): passes the same -s stats-per-app-11.0.csv as the UVM stage, presumably overwriting it; confirm this is intended
+                    sh '''#!/bin/bash
+                        source ./env-setup/11.0_env_setup.sh
+                        source `pwd`/setup_environment
+                        ./gpgpu-sim_simulations/util/job_launching/run_simulations.py -B rodinia_2.0-ft -C QV100 -N regress-$$ 
+                        PLOTDIR="jenkins/${JOB_NAME}/${BUILD_NUMBER}/11.0" && ssh [email protected] mkdir -p /home/dynamo/a/tgrogers/website/gpgpu-sim-plots/$PLOTDIR
+                        ./gpgpu-sim_simulations/util/job_launching/monitor_func_test.py -v -s stats-per-app-11.0.csv -N regress-$$'''
+            }
+        }
+        stage('correlate-delta-and-archive') {  // merge new stats with the stored baseline, plot, check results in, publish plots
+            steps {
+                    sh './gpgpu-sim_simulations/run_hw/get_hw_data.sh'
+                    sh 'rm -rf ./gpgpu-sim_simulations/util/plotting/correl-html && rm -rf gpgpu-sim-results-repo && rm -rf ./gpgpu-sim_simulations/util/plotting/htmls'  // drop outputs of earlier builds
+                    sh 'git clone git@github.com:purdue-aalp/gpgpu-sim-results-repo.git'
+                    sh '''#!/bin/bash
+                        source ./env-setup/11.0_env_setup.sh
+                        ./gpgpu-sim_simulations/util/job_launching/get_stats.py -R -K -k -B rodinia_2.0-ft -C QV100 -A > stats-per-kernel-11.0.csv'''
+                    sh 'if [ ! -d ./gpgpu-sim-results-repo/${JOB_NAME} ]; then mkdir -p ./gpgpu-sim-results-repo/${JOB_NAME}/ ; cp ./gpgpu-sim-results-repo/purdue-aalp/gpgpu-sim_distribution/dev/* ./gpgpu-sim-results-repo/${JOB_NAME}/ ; fi'  // job without stored results: seed its baseline from dev
+                    sh './gpgpu-sim_simulations/util/plotting/merge-stats.py -c ./gpgpu-sim-results-repo/${JOB_NAME}/stats-per-app-11.0.csv,./stats-per-app-11.0.csv -R > per-app-merge-11.0.csv'
+                    sh 'PLOTDIR="jenkins/${JOB_NAME}" &&\
+                        ./gpgpu-sim_simulations/util/plotting/plot-get-stats.py -c per-app-merge-11.0.csv -P cuda-11.0 &&\
+                        ./gpgpu-sim_simulations/util/plotting/merge-stats.py -c ./gpgpu-sim-results-repo/${JOB_NAME}/stats-per-kernel-11.0.csv,./stats-per-kernel-11.0.csv -R > per-kernel-merge-11.0.csv &&\
+                        ./gpgpu-sim_simulations/util/plotting/plot-correlation.py -H ./gpgpu-sim_simulations/run_hw/QUADRO-V100/device-0/9.1/ -c per-kernel-merge-11.0.csv -p cuda-11.0 | grep -B 1 "Correl=" | tee correl.11.0.txt &&\
+                        mkdir -p ./gpgpu-sim-results-repo/${JOB_NAME}/ && cp stats-per-*.csv ./gpgpu-sim-results-repo/${JOB_NAME}/ &&\
+                        cd ./gpgpu-sim-results-repo &&\
+                        git add -A . && { git diff --staged --quiet || git commit -m "Jenkins automated checkin ${JOB_NAME} Build:${BUILD_NUMBER}" ; } &&\
+                        git push'  // "git add -A" stages brand-new per-job files too; "commit -a" alone skips untracked files
+                    // Publish plots for this build and refresh "latest". NOTE(review): the ssh/scp targets read "[email protected]" in this copy (looks redacted); restore the real user@host.
+                    sh 'PLOTDIR="/home/dynamo/a/tgrogers/website/gpgpu-sim-plots/jenkins/${JOB_NAME}" &&\
+                        ssh [email protected] mkdir -p $PLOTDIR/${BUILD_NUMBER} && \
+                        scp  ./gpgpu-sim_simulations/util/plotting/correl-html/* [email protected]:$PLOTDIR/${BUILD_NUMBER} &&\
+                        scp  ./gpgpu-sim_simulations/util/plotting/htmls/* [email protected]:$PLOTDIR/${BUILD_NUMBER} &&\
+                        ssh [email protected] "cd $PLOTDIR && rm -rf latest && cp -r ${BUILD_NUMBER} latest"'
+            }
+        }
+        stage('sst-core-build') {  // build sst-core from a fresh clone and install it into ./sstcore-install
+            steps {  // build commands are &&-chained: a script with its own shebang is not run with -e
+                sh 'rm -rf sstcore-install'
+                sh 'rm -rf sst-core && git clone git@github.com:sstsimulator/sst-core.git'
+                sh '''#!/bin/bash
+                    cd sst-core &&
+                    ./autogen.sh &&
+                    ./configure --prefix=`realpath ../sstcore-install` --disable-mpi --disable-mem-pools &&
+                    make -j 10 &&
+                    make install'''
+            }
+        }
+        stage('sst-elements-build') {  // build sst-elements against the sst-core install from the previous stage
+            steps {  // build commands are &&-chained: a script with its own shebang is not run with -e
+                sh 'rm -rf sstelements-install'
+                sh 'rm -rf sst-elements && git clone git@github.com:sstsimulator/sst-elements.git'
+                // Source env_setup and setup_environment first: the configure line uses $CUDA_INSTALL_PATH and $GPGPUSIM_ROOT.
+                sh '''#!/bin/bash
+                    source ./env-setup/11.0_env_setup.sh
+                    source `pwd`/setup_environment
+                    cd sst-elements &&
+                    ./autogen.sh &&
+                    ./configure --prefix=`realpath ../sstelements-install` --with-sst-core=`realpath ../sstcore-install` --with-cuda=$CUDA_INSTALL_PATH --with-gpgpusim=$GPGPUSIM_ROOT &&
+                    make -j 10 &&
+                    make install'''
+            }
+        }
+        stage('sst balar test') {  // run the balar element tests with the installed sst-test-elements binary
+            steps {  // setup_environment is sourced with the extra argument "sst" in this stage only
+                sh '''#!/bin/bash
+                    source ./env-setup/11.0_env_setup.sh
+                    source `pwd`/setup_environment sst
+                    ./sstcore-install/bin/sst-test-elements -p ./sst-elements/src/sst/elements/balar/tests'''
+            }
+        }
+    }
+    post {
+        // Success mail: sent to the fixed address plus the culprits and the build requester.
+        success {
+            emailext to: '[email protected]',
+                subject: "[AALP Jenkins] Build ${JOB_NAME} #${BUILD_NUMBER} - Success!",
+                body: "See ${BUILD_URL}.",
+                recipientProviders: [[$class: 'CulpritsRecipientProvider'], [$class: 'RequesterRecipientProvider']]
+        }
+        // Failure mail: same recipients; the subject carries currentBuild.result.
+        failure {
+            emailext to: '[email protected]',
+                subject: "[AALP Jenkins] Build ${JOB_NAME} #${BUILD_NUMBER} - ${currentBuild.result}",
+                body: "See ${BUILD_URL}",
+                recipientProviders: [[$class: 'CulpritsRecipientProvider'], [$class: 'RequesterRecipientProvider']]
+        }
+    }
+}
diff --git a/README.md b/README.md
@@ -57,6 +57,18 @@ Complex Dynamics in Many-Core Accelerator Architectures, In Proceedings of the
 IEEE International Symposium on Performance Analysis of Systems and Software
 (ISPASS), pp. 164-174, White Plains, NY, March 28-30, 2010.
 
+If you use prefetchers and page eviction policies, please cite:
+
+Debashis Ganguly, Ziyu Zhang, Jun Yang, and Rami Melhem, Interplay between hardware prefetcher and page eviction policy in CPU-GPU unified virtual memory, In Proceedings of the 46th International Symposium on Computer Architecture (ISCA '19), New York, NY, USA, 2019.
+
+If you use access counter-based delayed migration, LFU eviction, cold vs hot data structure classification, and page migration and pinning, please cite:
+
+Debashis Ganguly, Ziyu Zhang, Jun Yang, and Rami Melhem, Adaptive Page Migration for Irregular Data-intensive Applications under GPU Memory Oversubscription, In Proceedings of the 34th IEEE International Parallel & Distributed Processing Symposium (IPDPS 2020), New Orleans, Louisiana, USA, 2020.
+
+If you use the adaptive runtime to detect patterns in CPU-GPU interconnect traffic, and the policy engine to choose and dynamically employ memory management policies, please cite:
+
+Debashis Ganguly, Rami Melhem, and Jun Yang, An Adaptive Framework for Oversubscription Management in CPU-GPU Unified Memory, In 2021 Design, Automation & Test in Europe Conference & Exhibition (DATE 2021).
+
 This file contains instructions on installing, building and running GPGPU-Sim.
 Detailed documentation on what GPGPU-Sim models, how to configure it, and a
 guide to the source code can be found here: <http://gpgpu-sim.org/manual/>.

diff --git a/bitbucket-pipelines.yml b/bitbucket-pipelines.yml
@@ -0,0 +1,15 @@
+# This is a sample build configuration for C++  Make.
+# Check our guides at https://confluence.atlassian.com/x/5Q4SMw for more examples.
+# Only use spaces to indent your .yml configuration.
+# -----
+# You can specify a custom docker image from Docker Hub as your build environment.
+image: tgrogers/gpgpu-sim_regress:latest
+
+pipelines:
+  default:  # pipeline used for branches that have no more specific pipeline definition
+    - step:  # NOTE(review): the script runs "make -j" with no job limit; confirm this is intended
+        script: # Modify the commands below to build your repository.
+          - docker run -v `pwd`:/home/runner/gpgpu-sim_distribution:rw tgrogers/gpgpu-sim_regress:latest /bin/bash -c "./start_torque.sh; chown -R runner /home/runner/gpgpu-sim_distribution; su - runner -c 'source /home/runner/gpgpu-sim_distribution/setup_environment && make -j -C /home/runner/gpgpu-sim_distribution && cd /home/runner/gpgpu-sim_simulations/ && git pull && /home/runner/gpgpu-sim_simulations/util/job_launching/run_simulations.py -c /home/runner/gpgpu-sim_simulations/util/job_launching/regression_recipies/rodinia_2.0-ft/configs.gtx1080ti.yml -N regress && /home/runner/gpgpu-sim_simulations/util/job_launching/monitor_func_test.py -v -N regress'"
+        services:
+          - docker  # needed because the script invokes "docker run"
+
diff --git a/configs/GTX480/config_fermi_islip.icnt b/configs/GTX480/config_fermi_islip.icnt
@@ -0,0 +1,70 @@
+//27*1 fly with 32 flits per packet under gpgpusim injection mode
+use_map = 0;
+flit_size = 32; 
+
+// currently we do not use this, see subnets below
+network_count = 2;
+
+// Topology
+topology = fly;
+k = 27;
+n = 1;
+
+// Routing
+
+routing_function = dest_tag;
+
+// Flow control
+
+num_vcs     = 1;
+vc_buf_size = 8;
+
+wait_for_tail_credit = 0;
+
+// Router architecture
+
+vc_allocator = islip; //separable_input_first;
+sw_allocator = islip; //separable_input_first;
+alloc_iters  = 1;
+
+credit_delay   = 0;
+routing_delay  = 0;
+vc_alloc_delay = 1;
+sw_alloc_delay = 1;
+
+input_speedup     = 2;
+output_speedup    = 1;
+internal_speedup  = 1.0;
+
+// Traffic, GPGPU-Sim does not use this
+
+traffic                = uniform;
+packet_size ={{1,2,3,4},{10,20}};
+packet_size_rate={{1,1,1,1},{2,1}};
+
+// Simulation - Don't change
+
+sim_type       = gpgpusim;
+//sim_type = latency;
+injection_rate = 0.1;
+
+subnets = 2;
+
+// Read and write are always used, regardless of the following line
+//use_read_write = 1;
+
+
+read_request_subnet = 0;
+read_reply_subnet = 1;
+write_request_subnet = 0;
+write_reply_subnet = 1;
+
+read_request_begin_vc = 0;
+read_request_end_vc = 0;
+write_request_begin_vc = 0;
+write_request_end_vc = 0;
+read_reply_begin_vc = 0;
+read_reply_end_vc = 0;
+write_reply_begin_vc = 0;
+write_reply_end_vc = 0;
+
diff --git a/configs/GTX480/gpgpusim.config b/configs/GTX480/gpgpusim.config
@@ -0,0 +1,133 @@
+# functional simulator specification
+-gpgpu_ptx_instruction_classification 0
+-gpgpu_ptx_sim_mode 0
+-gpgpu_ptx_force_max_capability 20 
+
+
+# SASS execution (only supported with CUDA >= 4.0)
+-gpgpu_ptx_convert_to_ptxplus 0
+-gpgpu_ptx_save_converted_ptxplus 0
+
+# high level architecture configuration
+-gpgpu_n_clusters 15
+-gpgpu_n_cores_per_cluster 1
+-gpgpu_n_mem 6
+-gpgpu_n_sub_partition_per_mchannel 2 
+
+# Fermi clock domains
+#-gpgpu_clock_domains <Core Clock>:<Interconnect Clock>:<L2 Clock>:<DRAM Clock>
+# In Fermi, each pipeline has 16 execution units, so the Core clock needs to be divided
+# by 2. (GPGPU-Sim simulates a warp (32 threads) in a single cycle). 1400/2 = 700
+-gpgpu_clock_domains 700.0:700.0:700.0:924.0
+
+# shader core pipeline config
+-gpgpu_shader_registers 32768
+
+# This implies a maximum of 48 warps/SM
+-gpgpu_shader_core_pipeline 1536:32 
+-gpgpu_shader_cta 8
+-gpgpu_simd_model 1 
+
+# Pipeline widths and number of FUs
+# ID_OC_SP,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_SFU,OC_EX_MEM,EX_WB
+-gpgpu_pipeline_widths 2,1,1,2,1,1,2
+-gpgpu_num_sp_units 2
+-gpgpu_num_sfu_units 1
+
+# Instruction latencies and initiation intervals
+# "ADD,MAX,MUL,MAD,DIV"
+-ptx_opcode_latency_int 4,13,4,5,145
+-ptx_opcode_initiation_int 1,2,2,1,8
+-ptx_opcode_latency_fp 4,13,4,5,39
+-ptx_opcode_initiation_fp 1,2,1,1,4
+-ptx_opcode_latency_dp 8,19,8,8,330
+-ptx_opcode_initiation_dp 8,16,8,8,130
+
+
+# In Fermi, the cache and shared memory can be configured to 16kb:48kb(default) or 48kb:16kb
+# <nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
+# ** Optional parameter - Required when mshr_type==Texture Fifo
+# Note: Hashing set index function (H) only applies to a set size of 32 or 64. 
+-gpgpu_cache:dl1  32:128:4,L:L:m:N:H,A:32:8,8
+-gpgpu_shmem_size 49152
+
+# The alternative configuration for fermi in case cudaFuncCachePreferL1 is selected
+#-gpgpu_cache:dl1  64:128:6,L:L:m:N:H,A:32:8,8
+#-gpgpu_shmem_size 16384
+
+# 64 sets, each 128 bytes 8-way for each memory sub partition. This gives 768KB L2 cache
+-gpgpu_cache:dl2 64:128:8,L:B:m:W:L,A:32:4,4:0,32
+-gpgpu_cache:dl2_texture_only 0 
+
+-gpgpu_cache:il1 4:128:4,L:R:f:N:L,A:2:32,4
+-gpgpu_tex_cache:l1 4:128:24,L:R:m:N:L,F:128:4,128:2
+-gpgpu_const_cache:l1 64:64:2,L:R:f:N:L,A:2:32,4
+
+# enable operand collector 
+-gpgpu_operand_collector_num_units_sp 6
+-gpgpu_operand_collector_num_units_sfu 8
+-gpgpu_operand_collector_num_in_ports_sp 2
+-gpgpu_operand_collector_num_out_ports_sp 2
+-gpgpu_num_reg_banks 16
+
+# shared memory bankconflict detection 
+-gpgpu_shmem_num_banks 32
+-gpgpu_shmem_limited_broadcast 0
+-gpgpu_shmem_warp_parts 1
+
+-gpgpu_max_insn_issue_per_warp 1
+
+# interconnection
+-network_mode 1 
+-inter_config_file config_fermi_islip.icnt
+
+# memory partition latency config 
+-rop_latency 120
+-dram_latency 100
+
+# dram model config
+-gpgpu_dram_scheduler 1
+# The DRAM return queue and the scheduler queue together should provide buffer
+# to sustain the memory level parallelism to tolerate DRAM latency 
+# To allow 100% DRAM utilization, there should be at least enough buffer to sustain
+# the minimum DRAM latency (100 core cycles).  I.e. 
+#   Total buffer space required = 100 x 924MHz / 700MHz = 132
+-gpgpu_frfcfs_dram_sched_queue_size 16
+-gpgpu_dram_return_queue_size 116
+
+# for Fermi, bus width is 384bits, this is 8 bytes (4 bytes at each DRAM chip) per memory partition
+-gpgpu_n_mem_per_ctrlr 2
+-gpgpu_dram_buswidth 4
+-gpgpu_dram_burst_length 8
+-dram_data_command_freq_ratio 4  # GDDR5 is QDR
+-gpgpu_mem_address_mask 1
+-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.BBBCCCCB.CCSSSSSS
+
+# GDDR5 timing from hynix H5GQ1H24AFR
+# to disable bank groups, set nbkgrp to 1 and tCCDL and tRTPL to 0
+-gpgpu_dram_timing_opt "nbk=16:CCD=2:RRD=6:RCD=12:RAS=28:RP=12:RC=40:
+                        CL=12:WL=4:CDLR=5:WR=12:nbkgrp=4:CCDL=3:RTPL=2"
+
+# Fermi has two schedulers per core
+-gpgpu_num_sched_per_core 2
+# Two Level Scheduler with active and pending pools
+#-gpgpu_scheduler two_level_active:6:0:1
+# Loose round robin scheduler
+#-gpgpu_scheduler lrr
+# Greedy then oldest scheduler
+-gpgpu_scheduler gto
+
+# stat collection
+-gpgpu_memlatency_stat 14 
+-gpgpu_runtime_stat 500
+-enable_ptx_file_line_stats 1
+-visualizer_enabled 0
+
+# power model configs
+-power_simulation_enabled 1
+-gpuwattch_xml_file gpuwattch_gtx480.xml
+
+# tracing functionality
+#-trace_enabled 1
+#-trace_components WARP_SCHEDULER,SCOREBOARD
+#-trace_sampling_core 0