From 78dedb64d2161d362e68ccffd1f26f5da8f7103c Mon Sep 17 00:00:00 2001
From: negargoli
Date: Tue, 27 Nov 2018 17:59:31 -0800
Subject: [PATCH] make Debug PTX by adding relevant store after each instruction

---
 src/cuda-sim/cuda-sim.cc   | 140 ++++++++++++++++++++++++++++++++++---
 src/cuda-sim/ptx_ir.cc     |  19 +++++
 src/cuda-sim/ptx_ir.h      |  21 +++++-
 src/cuda-sim/ptx_parser.cc | 108 +++++++++++++++++++++++++++-
 src/cuda-sim/ptx_parser.h  |   9 +++
 5 files changed, 286 insertions(+), 11 deletions(-)

diff --git a/src/cuda-sim/cuda-sim.cc b/src/cuda-sim/cuda-sim.cc
index 3773f6fdb..ccaaf8eac 100644
--- a/src/cuda-sim/cuda-sim.cc
+++ b/src/cuda-sim/cuda-sim.cc
@@ -68,6 +68,8 @@ char *opcode_latency_int, *opcode_latency_fp, *opcode_latency_dp,*opcode_latency
 char *opcode_initiation_int, *opcode_initiation_fp, *opcode_initiation_dp,*opcode_initiation_sfu,*opcode_initiation_tensor;
 char *cdp_latency_str;
 unsigned cdp_latency[5];
+const char *type_store; // PTX type suffix (e.g. ".s32 ") selected in ptx_instruction::pre_decode for the emitted debug store
+
 
 void ptx_opcocde_latency_options (option_parser_t opp) {
    option_parser_register(opp, "-ptx_opcode_latency_int", OPT_CSTR, &opcode_latency_int,
@@ -301,7 +303,6 @@ void function_info::ptx_assemble()
    m_n = n;
    printf(" done.\n");
    fflush(stdout);
-
    //disable pdom analysis here and do it at runtime
 #if 0
    printf("GPGPU-Sim PTX: finding reconvergence points for \'%s\'...\n", m_name.c_str() );
@@ -998,17 +999,138 @@ void ptx_instruction::pre_decode()
       if ( o.is_reg() && !o.is_non_arch_reg() ) {
          out[0] = o.reg_num();
          arch_reg.dst[0] = o.arch_reg_num();
+         switch (get_type()) { // pick the type suffix appended to "st.global" below
+         case S8_TYPE: type_store=".s8 "; break;
+         case S16_TYPE: type_store=".s16 "; break;
+         case S32_TYPE: type_store=".s32 "; break;
+         case S64_TYPE: type_store=".s64 "; break;
+         case U16_TYPE: type_store=".u16 "; break;
+         case U8_TYPE: type_store=".u8 "; break;
+         case U32_TYPE: type_store=".u32 "; break;
+         case U64_TYPE: type_store=".u64 "; break;
+         case F16_TYPE: type_store=".f16 "; break;
+         case F32_TYPE: type_store=".f32 "; break;
+         case F64_TYPE: type_store=".f64 "; break;
+         case B8_TYPE: type_store=".b8 "; break;
+         case B16_TYPE: type_store=".b16 "; break;
+         case B32_TYPE: type_store=".b32 "; break;
+         case B64_TYPE: type_store=".b64 "; break;
+         case PRED_TYPE:type_store=".pred "; break;
+         default:
+            type_store= "non-scalar type"; // NOTE(review): this text is written verbatim after "st.global" below
+            break;
+
+         }
+         if(inst_counter>argument_counter) // gate on the counters maintained by function_info::do_pdom and the parser
+         {
+            fprintf(ptxdebug," st.global");
+            fprintf(ptxdebug,"%s [%%rd%d], %s;\n",type_store,(store_counter+3),o.name().c_str()); // "%%" emits the literal '%' of the PTX register name
+            fprintf(ptxdebug," add.u64 %%rd%d, %%rd%d, 4;\n",(store_counter+3),(store_counter+3)); // NOTE(review): pointer always advances by 4 regardless of the stored type
+         }
       } else if ( o.is_vector() ) {
          is_vectorin = 1;
+         switch (get_type()) { // same type-suffix selection as the scalar-register branch above
+         case S8_TYPE: type_store=".s8 "; break;
+         case S16_TYPE: type_store=".s16 "; break;
+         case S32_TYPE: type_store=".s32 "; break;
+         case S64_TYPE: type_store=".s64 "; break;
+         case U16_TYPE: type_store=".u16 "; break;
+         case U8_TYPE: type_store=".u8 "; break;
+         case U32_TYPE: type_store=".u32 "; break;
+         case U64_TYPE: type_store=".u64 "; break;
+         case F16_TYPE: type_store=".f16 "; break;
+         case F32_TYPE: type_store=".f32 "; break;
+         case F64_TYPE: type_store=".f64 "; break;
+         case B8_TYPE: type_store=".b8 "; break;
+         case B16_TYPE: type_store=".b16 "; break;
+         case B32_TYPE: type_store=".b32 "; break;
+         case B64_TYPE: type_store=".b64 "; break;
+         case PRED_TYPE:type_store=".pred "; break;
+         default:
+            type_store= "non-scalar type";
+            break;
+         }
          unsigned num_elem = o.get_vect_nelem();
-         if( num_elem >= 1 ) out[0] = o.reg1_num();
-         if( num_elem >= 2 ) out[1] = o.reg2_num();
-         if( num_elem >= 3 ) out[2] = o.reg3_num();
-         if( num_elem >= 4 ) out[3] = o.reg4_num();
-         if( num_elem >= 5 ) out[4] = o.reg5_num();
-         if( num_elem >= 6 ) out[5] = o.reg6_num();
-         if( num_elem >= 7 ) out[6] = o.reg7_num();
-         if( num_elem >= 8 ) out[7] = o.reg8_num();
+         if( num_elem >= 1 ) // each vector element: record the output register, then emit one store + pointer bump
+         {
+            out[0] = o.reg1_num();
+            if(inst_counter>argument_counter)
+            {
+               fprintf(ptxdebug," st.global");
+               fprintf(ptxdebug,"%s [%%rd%d], %s;\n",type_store,(store_counter+3),o.vec_name1().c_str());
+               fprintf(ptxdebug," add.u64 %%rd%d, %%rd%d, 4;\n",(store_counter+3),(store_counter+3));
+            }
+         }
+         if( num_elem >= 2 )
+         {
+            out[1] = o.reg2_num();
+            if(inst_counter>argument_counter)
+            {
+               fprintf(ptxdebug," st.global");
+               fprintf(ptxdebug,"%s [%%rd%d], %s;\n",type_store,(store_counter+3),o.vec_name2().c_str());
+               fprintf(ptxdebug," add.u64 %%rd%d, %%rd%d, 4;\n",(store_counter+3),(store_counter+3));
+            }
+         }
+         if( num_elem >= 3 )
+         {
+            out[2] = o.reg3_num();
+            if(inst_counter>argument_counter)
+            {
+               fprintf(ptxdebug," st.global");
+               fprintf(ptxdebug,"%s [%%rd%d], %s;\n",type_store,(store_counter+3),o.vec_name3().c_str());
+               fprintf(ptxdebug," add.u64 %%rd%d, %%rd%d, 4;\n",(store_counter+3),(store_counter+3));
+            }
+         }
+         if( num_elem >= 4 )
+         {
+            out[3] = o.reg4_num();
+            if(inst_counter>argument_counter)
+            {
+               fprintf(ptxdebug," st.global");
+               fprintf(ptxdebug,"%s [%%rd%d], %s;\n",type_store,(store_counter+3),o.vec_name4().c_str());
+               fprintf(ptxdebug," add.u64 %%rd%d, %%rd%d, 4;\n",(store_counter+3),(store_counter+3));
+            }
+         }
+         if( num_elem >= 5 )
+         {
+            out[4] = o.reg5_num();
+            if(inst_counter>argument_counter)
+            {
+               fprintf(ptxdebug," st.global");
+               fprintf(ptxdebug,"%s [%%rd%d], %s;\n",type_store,(store_counter+3),o.vec_name5().c_str());
+               fprintf(ptxdebug," add.u64 %%rd%d, %%rd%d, 4;\n",(store_counter+3),(store_counter+3));
+            }
+         }
+         if( num_elem >= 6 )
+         {
+            out[5] = o.reg6_num();
+            if(inst_counter>argument_counter)
+            {
+               fprintf(ptxdebug," st.global");
+               fprintf(ptxdebug,"%s [%%rd%d], %s;\n",type_store,(store_counter+3),o.vec_name6().c_str());
+               fprintf(ptxdebug," add.u64 %%rd%d, %%rd%d, 4;\n",(store_counter+3),(store_counter+3));
+            }
+         }
+         if( num_elem >= 7 )
+         {
+            out[6] = o.reg7_num();
+            if(inst_counter>argument_counter)
+            {
+               fprintf(ptxdebug," st.global");
+               fprintf(ptxdebug,"%s [%%rd%d], %s;\n",type_store,(store_counter+3),o.vec_name7().c_str());
+               fprintf(ptxdebug," add.u64 %%rd%d, %%rd%d, 4;\n",(store_counter+3),(store_counter+3));
+            }
+         }
+         if( num_elem >= 8 )
+         {
+            out[7] = o.reg8_num();
+            if(inst_counter>argument_counter)
+            {
+               fprintf(ptxdebug," st.global");
+               fprintf(ptxdebug,"%s [%%rd%d], %s;\n",type_store,(store_counter+3),o.vec_name8().c_str());
+               fprintf(ptxdebug," add.u64 %%rd%d, %%rd%d, 4;\n",(store_counter+3),(store_counter+3));
+            }
+         }
          for (int i = 0; i < num_elem; i++)
             arch_reg.dst[i] = o.arch_reg_num(i);
       }
diff --git a/src/cuda-sim/ptx_ir.cc b/src/cuda-sim/ptx_ir.cc
index 73db24d9e..5266240c6 100644
--- a/src/cuda-sim/ptx_ir.cc
+++ b/src/cuda-sim/ptx_ir.cc
@@ -614,6 +614,25 @@ void function_info::do_pdom()
    printf("GPGPU-Sim PTX: pre-decoding instructions for \'%s\'...\n", m_name.c_str() );
    for ( unsigned ii=0; ii < m_n; ii += m_instr_mem[ii]->inst_size() ) { // handle branch instructions
       ptx_instruction *pI = m_instr_mem[ii];
+      if((inst_counter-1)< argument_counter) // count instructions until inst_counter-1 reaches argument_counter
+      {
+         inst_counter++;
+      }
+      if((inst_counter-1) == argument_counter) // emit the debug-buffer address setup; the inst_counter++ at the end makes this run once per reset of the counters
+      {
+         fprintf(ptxdebug," ld.param.u64 %%rd%d, [%s_param_%d];\n",store_counter,m_name.c_str(),argument_counter); // load the extra .u64 parameter that add_variables appends to the signature
+         fprintf(ptxdebug," cvta.to.global.u64 %%rd%d, %%rd%d;\n",(store_counter+1),(store_counter));
+         fprintf(ptxdebug," mov.u32 %%r%d,%%tid.x;\n",(u32_counter)); // PTX opcode is "mov"; special registers are written with a '%' prefix
+         fprintf(ptxdebug," mov.u32 %%r%d,%%ctaid.x;\n",(u32_counter+1));
+         fprintf(ptxdebug," mov.u32 %%r%d,%%ntid.x;\n",(u32_counter+2));
+         fprintf(ptxdebug," mad.lo.s32 %%r%d, %%r%d, %%r%d, %%r%d;\n",(u32_counter+3),(u32_counter+2),(u32_counter+1),(u32_counter)); // ntid.x * ctaid.x + tid.x
+         fprintf(ptxdebug," mul.wide.s32 %%rd%d,%%r%d,4;\n",(store_counter+2),(u32_counter+3)); // scale the thread index by 4 bytes
+         fprintf(ptxdebug," add.s64 %%rd%d,%%rd%d,%%rd%d;\n",(store_counter+3),(store_counter+2),(store_counter+1)); // %rd<store_counter+3> is the pointer used by the stores emitted in pre_decode
+         inst_counter++;
+      }
+      fprintf(ptxdebug," %s \n", pI->get_source()); // echo the original instruction text
+      if(!strcmp(pI->get_source(),"ret;")){ // close the emitted kernel body after "ret;"
+         fprintf(ptxdebug," }\n "); }
       pI->pre_decode();
    }
    printf("GPGPU-Sim PTX: ... done pre-decoding instructions for \'%s\'.\n", m_name.c_str() );
diff --git a/src/cuda-sim/ptx_ir.h b/src/cuda-sim/ptx_ir.h
index 1af85de69..eba685abe 100644
--- a/src/cuda-sim/ptx_ir.h
+++ b/src/cuda-sim/ptx_ir.h
@@ -710,7 +710,26 @@ class operand_info {
       assert( m_type == vector_t);
       return m_value.m_vector_symbolic[3]->name();
    }
-
+   const std::string &vec_name5() const // name of the 5th vector element's symbol; operand must be a vector
+   {
+      assert( m_type == vector_t);
+      return m_value.m_vector_symbolic[4]->name();
+   }
+   const std::string &vec_name6() const // name of the 6th vector element's symbol
+   {
+      assert( m_type == vector_t);
+      return m_value.m_vector_symbolic[5]->name();
+   }
+   const std::string &vec_name7() const // name of the 7th vector element's symbol
+   {
+      assert( m_type == vector_t);
+      return m_value.m_vector_symbolic[6]->name();
+   }
+   const std::string &vec_name8() const // name of the 8th vector element's symbol
+   {
+      assert( m_type == vector_t);
+      return m_value.m_vector_symbolic[7]->name();
+   }
    bool is_reg() const
    {
       if ( m_type == reg_t ) {
diff --git a/src/cuda-sim/ptx_parser.cc b/src/cuda-sim/ptx_parser.cc
index 25758dd85..8bed41ba6 100644
--- a/src/cuda-sim/ptx_parser.cc
+++ b/src/cuda-sim/ptx_parser.cc
@@ -43,8 +43,24 @@ void set_ptx_warp_size(const struct core_config * warp_size)
 
 static bool g_debug_ir_generation=false;
 const char *g_filename;
+const char *PTX_DEBUG_filename;
 unsigned g_max_regs_per_thread = 0;
 
+FILE *ptxdebug; // output stream for the generated debug PTX (opened in read_parser_environment_variables)
+int argument_counter; // incremented once per parameter in add_function_arg, reset in start_function
+int identifier_counter;
+int counter; // incremented in add_identifier, reset in add_scalar_type_spec
+int variable_counter; // number of add_variables calls since the last start_function/end_function
+int inst_counter;
+int store_counter; // first index of the extra %rd registers reserved for the debug stores
+int u32_counter; // first index of the extra %r registers reserved for the thread-index computation
+const char* function_name;
+const char* param_type;
+
+
+
+
+
 // the program intermediate representation...
 static symbol_table *g_global_allfiles_symbol_table = NULL;
 static symbol_table *g_global_symbol_table = NULL;
@@ -107,6 +123,8 @@ void read_parser_environment_variables()
       if ( debug_execution >= 30 )
          g_debug_ir_generation=true;
    }
+   ptxdebug = fopen("PTX_AddStore_Debug.ptx", "w"); // NOTE(review): result is not checked; every later fprintf(ptxdebug, ...) relies on it being non-NULL
+   fprintf(ptxdebug,"\n // Debug PTX, contain store after each instruction\n .version 6.0 \n .target sm_70 \n .address_size 64 \n"); // PTX comments start with "//"
 }
 
 void init_directive_state()
@@ -176,6 +194,12 @@ symbol_table *init_parser( const char *ptx_filename )
    ptx_parse();
    fclose(ptx_in);
    return g_global_symbol_table;
+   // std::string debug_filename = "PTX_DEBUG_";
+   // debug_filename += g_filename;
+   // PTX_DEBUG_filename = debug_filename.c_str();
+   // ptxdebug = fopen(PTX_DEBUG_filename, "w");
+   // fprintf(ptxdebug,"\n \\\\ Debug PTX, contain store after each instruction\n .version 6.0 \n .target sm_70 \n .address_size 64 \n");
+
 }
 
 static int g_entry_point;
@@ -188,6 +212,13 @@ void start_function( int entry_point )
    g_entry_point = entry_point;
    g_func_info = NULL;
    g_entry_func_param_index=0;
+   argument_counter=0;
+   //PTX_DEBUG
+   inst_counter=0;
+   identifier_counter=0;
+   variable_counter=0;
+   u32_counter=0;
+   store_counter=0;
 }
 
 char *g_add_identifier_cached__identifier = NULL;
@@ -211,6 +242,8 @@ void add_function_name( const char *name )
       g_func_info->remove_args();
    }
    g_global_symbol_table->add_function( g_func_info, g_filename, ptx_lineno );
+   fprintf(ptxdebug,".visible .entry %s ( \n",name); // open the kernel signature; parameters are appended by add_function_arg
+   function_name= name;
 }
 
 //Jin: handle instruction group for cdp
@@ -235,7 +268,8 @@ void add_directive()
 void end_function()
 {
    PTX_PARSE_DPRINTF("end_function");
-
+
+   variable_counter=0;
    init_directive_state();
    init_instruction_state();
    g_max_regs_per_thread = mymax( g_max_regs_per_thread, (g_current_symbol_table->next_reg_num()-1));
@@ -326,7 +360,55 @@ void add_instruction()
 
 void add_variables()
 {
+
+   if(variable_counter == 0 ){ // first declaration of the function: append the extra .u64 parameter and open the body
+      fprintf(ptxdebug," .param .u64 %s_param_%d\n", function_name, argument_counter);
+      fprintf(ptxdebug,")\n { \n");
+   }
+
+   variable_counter++;
    PTX_PARSE_DPRINTF("add_variables");
+
+   if(g_ptx_token_decode[g_scalar_type_spec] == "B64_TYPE") // NOTE(review): assumes g_ptx_token_decode yields std::string; with const char* values this == would compare pointers - confirm
+   {
+      store_counter=counter; // reserve 4 extra 64-bit registers starting at the current count
+      counter=counter+4;
+      fprintf(ptxdebug," .reg .b64 %%rd<%d> \n",counter);
+   }
+   else if (g_ptx_token_decode[g_scalar_type_spec] == "B32_TYPE")
+   {
+      u32_counter=counter; // reserve 4 extra 32-bit registers starting at the current count
+      counter=counter+4;
+      fprintf(ptxdebug," .reg .b32 %%r<%d> \n",counter);
+   }
+   else if (g_ptx_token_decode[g_scalar_type_spec] == "B16_TYPE")
+      fprintf(ptxdebug," .reg .b16 %%rs<%d> \n",counter);
+   else if (g_ptx_token_decode[g_scalar_type_spec] == "B8_TYPE")
+      fprintf(ptxdebug," .reg .b8 %%rc<%d> \n",counter);
+   else if (g_ptx_token_decode[g_scalar_type_spec] == "S64_TYPE")
+      fprintf(ptxdebug," .reg .b64 %%rd<%d> \n",counter);
+   else if (g_ptx_token_decode[g_scalar_type_spec] == "S32_TYPE")
+      fprintf(ptxdebug," .reg .b32 %%r<%d> \n",counter);
+   else if (g_ptx_token_decode[g_scalar_type_spec] == "S16_TYPE")
+      fprintf(ptxdebug," .reg .b16 %%rs<%d> \n",counter);
+   else if (g_ptx_token_decode[g_scalar_type_spec] == "S8_TYPE")
+      fprintf(ptxdebug," .reg .b8 %%rc<%d> \n",counter);
+   else if (g_ptx_token_decode[g_scalar_type_spec] == "U64_TYPE")
+      fprintf(ptxdebug," .reg .u64 %%rd<%d> \n",counter);
+   else if (g_ptx_token_decode[g_scalar_type_spec] == "U32_TYPE")
+      fprintf(ptxdebug," .reg .u32 %%r<%d> \n",counter);
+   else if (g_ptx_token_decode[g_scalar_type_spec] == "U16_TYPE")
+      fprintf(ptxdebug," .reg .u16 %%rs<%d> \n",counter);
+   else if (g_ptx_token_decode[g_scalar_type_spec] == "U8_TYPE")
+      fprintf(ptxdebug," .reg .u8 %%rc<%d> \n",counter);
+   else if (g_ptx_token_decode[g_scalar_type_spec] == "F64_TYPE")
+      fprintf(ptxdebug," .reg .f64 %%f<%d> \n",counter);
+   else if (g_ptx_token_decode[g_scalar_type_spec] == "F32_TYPE")
+      fprintf(ptxdebug," .reg .f32 %%f<%d> \n",counter);
+   else if (g_ptx_token_decode[g_scalar_type_spec] == "F16_TYPE")
+      fprintf(ptxdebug," .reg .f16 %%f<%d> \n",counter);
+   else if (g_ptx_token_decode[g_scalar_type_spec] == "PRED_TYPE")
+      fprintf(ptxdebug," .reg .pred %%p<%d> \n",counter);
    if ( !g_operands.empty() ) {
       assert( g_last_symbol != NULL );
       g_last_symbol->add_initializer(g_operands);
@@ -391,7 +473,9 @@ void add_identifier( const char *identifier, int array_dim, unsigned array_ident
       g_add_identifier_cached__array_ident = array_ident;
       return;
    }
+   counter++;
    PTX_PARSE_DPRINTF("add_identifier \"%s\" (%u)", identifier, g_ident_add_uid);
+   unsigned store=g_ident_add_uid+1; // NOTE(review): not referenced anywhere in the visible hunk
    g_ident_add_uid++;
    type_info *type = g_var_type;
    type_info_key ti = type->get_key();
@@ -593,6 +677,25 @@ void add_function_arg()
       unsigned alignment = (g_alignment_spec==-1) ? g_size : g_alignment_spec;
       assert(alignment==1||alignment==2||alignment==4||alignment==8||alignment==16);//known valid alignment values
       g_func_info->add_config_param( g_size, alignment);
+      if(g_ptx_token_decode[g_scalar_type_spec]=="S8_TYPE") param_type =".s8 "; // choose the .param type suffix; param_type keeps its previous value when nothing matches
+      if(g_ptx_token_decode[g_scalar_type_spec]=="U64_TYPE") param_type =".u64 "; // NOTE(review): U64_TYPE is tested again further down
+      if(g_ptx_token_decode[g_scalar_type_spec]=="S16_TYPE") param_type =".s16 ";
+      if(g_ptx_token_decode[g_scalar_type_spec]=="S32_TYPE") param_type =".s32 ";
+      if(g_ptx_token_decode[g_scalar_type_spec]=="S64_TYPE") param_type =".s64 ";
+      if(g_ptx_token_decode[g_scalar_type_spec]=="U16_TYPE") param_type =".u16 ";
+      if(g_ptx_token_decode[g_scalar_type_spec]=="U8_TYPE") param_type =".u8 ";
+      if(g_ptx_token_decode[g_scalar_type_spec]=="U32_TYPE") param_type =".u32 ";
+      if(g_ptx_token_decode[g_scalar_type_spec]=="U64_TYPE") param_type =".u64 ";
+      if(g_ptx_token_decode[g_scalar_type_spec]=="F16_TYPE") param_type =".f16 ";
+      if(g_ptx_token_decode[g_scalar_type_spec]=="F32_TYPE") param_type =".f32 ";
+      if(g_ptx_token_decode[g_scalar_type_spec]=="F64_TYPE") param_type =".f64 ";
+      if(g_ptx_token_decode[g_scalar_type_spec]=="B8_TYPE") param_type =".b8 ";
+      if(g_ptx_token_decode[g_scalar_type_spec]=="B16_TYPE") param_type =".b16 ";
+      if(g_ptx_token_decode[g_scalar_type_spec]=="B32_TYPE") param_type =".b32 ";
+      if(g_ptx_token_decode[g_scalar_type_spec]=="B64_TYPE") param_type =".b64 ";
+      if(g_ptx_token_decode[g_scalar_type_spec]=="PRED_TYPE") param_type =".pred ";
+      fprintf(ptxdebug," .param %s %s,\n",param_type, g_last_symbol->name().c_str()); // trailing comma: add_variables later appends one more .param
+      argument_counter++;
    }
 }
 
@@ -680,6 +783,8 @@ void add_scalar_type_spec( int type_spec )
                   "only cvt, set, slct, tex and dp4a can have more than one type specifier.");
    }
    g_scalar_type_spec = type_spec;
+   identifier_counter++;
+   counter=0;
 }
 
 void add_label( const char *identifier )
@@ -977,6 +1082,7 @@ void add_address_operand( const char *identifier, int offset )
       parse_error( msg.c_str() );
    }
    g_operands.push_back( operand_info(s,offset) );
+   PTX_PARSE_DPRINTF("%s and the offset is : %d" , identifier, offset);
 }
 
 void add_address_operand2( int offset )
diff --git a/src/cuda-sim/ptx_parser.h b/src/cuda-sim/ptx_parser.h
index 7b6e3a25e..7a6db1f6b 100644
--- a/src/cuda-sim/ptx_parser.h
+++ b/src/cuda-sim/ptx_parser.h
@@ -32,6 +32,15 @@
 extern const char *g_filename;
 extern int g_error_detected;
 
+extern FILE *ptxdebug;
+extern int argument_counter;
+extern int inst_counter;
+extern int identifier_counter;
+extern int counter;
+extern int store_counter;
+extern int u32_counter;
+
+
 #ifdef __cplusplus
 class symbol_table* init_parser(const char*);
 const class ptx_instruction *ptx_instruction_lookup( const char *filename, unsigned linenumber );