diff --git a/README.md b/README.md
index 073117a..812d96e 100644
--- a/README.md
+++ b/README.md
@@ -1,92 +1,126 @@
-# Strobelight
+# BPF GPUEventSnoop with LLM-based CUDA Trace Analysis
+
-![Strobelight Logo](images/Strobelight_brandmark_full-color-black-text.svg)
-Strobelight is a fleetwide profiler framework developed by Meta, designed to provide comprehensive profiling capabilities across large-scale infrastructure. This framework helps in identifying performance bottlenecks and optimizing resource utilization across a fleet of machines.
+Traces CUDA GPU kernel functions via BPF and provides in-depth analysis through visualizations and optional LLM (Large Language Model)-powered summaries.
+
-Strobelight is composed of a number of profilers, each profiler collects a certain type of profile. This can include CPU, GPU, Memory, or other types of profiles.
+---
+
-## gpuevent profiler
-The `gpuevent` profiler attaches to `cudaLaunchKernel` events and collects information about kernels being launched, including demangled name, arguments, stacks, dimensions, etc.
+## 🚀 Prerequisites
+
-## Getting Started
+- NVIDIA GPU instance
+- Ubuntu with kernel headers
+- CUDA Toolkit installed (`nvcc` should be at `/usr/local/cuda/bin/nvcc`)
+
-### Prerequisites
+---
+
-- A Linux-based system.
-- Gpu host with [NVIDIA CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html)
-- Cuda binary for testing
-- cmake
+## 🔧 Install Required Packages
+
-### Installation
+```bash
+sudo apt update
+sudo apt install -y clang llvm libbpf-dev
+sudo apt install -y linux-headers-$(uname -r)
+sudo apt install -y build-essential git cmake libelf-dev libfl-dev pkg-config
+```
+
-1. Clone the repository:
-   ```bash
-   git clone https://github.com/facebookincubator/strobelight.git
-   ```
+## 🛠️ Build Strobelight and GPUEventSnoop
+
+```bash
+cd strobelight
+./scripts/build.sh
+```
+
-2. Navigate to the project directory and follow the build instructions:
-   ```bash
-   cd strobelight
-   ./scripts/build.sh -u
-   ```
+- BPF source for the user-space and kernel programs is located in `strobelight/strobelight/src/profilers/gpuevent_snoop`
+- After a successful build, binaries are placed in `strobelight/strobelight/src/_build/profilers`
+
-### Usage
+## 🧪 Run the Profiler
+
+```bash
+cd strobelight/strobelight/src/_build/profilers
+./gpuevent_snoop -p <PID>
+./gpuevent_snoop --help   # For all options
+```
+
-Once build is done, you can run the generated binary on any cuda pid
- ```bash
- $ strobelight/src/_build/profilers/gpuevent_snoop --help
-Usage: gpuevent_snoop [OPTION...]
-GpuEventSnoop.
+Supported CUDA routines traced:
+- cudaMalloc, cudaFree
+- cudaMemcpy, cudaMemcpyAsync
+- cudaLaunchKernel
+- cudaStreamCreate, cudaStreamDestroy
+- cudaStreamSynchronize, cudaDeviceSynchronize
+
-Traces GPU kernel function execution and its input parameters
+## 🧫 Test Programs and Tracing
+
-USAGE: ./gpuevent_snoop -p PID [-v] [-d duration_sec]
+### Build a Sample Program
+
+```bash
+/usr/local/cuda/bin/nvcc test_cuda_api_multi_gpu.cu -o test_cuda_api_multi_gpu
+```
+
- -a, --args Collect Kernel Launch Arguments
- -d, --duration=SEC Trace for given number of seconds
- -p, --pid=PID Trace process with given PID
- -r, --rb-count=CNT RingBuf max entries
- -s, --stacks Collect Kernel Launch Stacks
- -v, --verbose Verbose debug output
- -?, --help Give this help list
- --usage Give a short usage message
- ```
+### Run and Collect Trace
+
- ```bash
-./gpuevent_snoop -p -a -s
-Found Symbol cudaLaunchKernel at /strobelight/oss/src/cuda_example/__cuda_kernel_example__/cuda_kernel_example Offset: 0xca480
-Started profiling at Thu Apr 4 13:20:28 2024
-cuda_kernel_exa [4024506] KERNEL [0x269710] STREAM 0x0 GRID (1,1,1) BLOCK (256,1,1) add_vectors(double*, double*, do...
-Args: add_vectors arg0=0x7f2096800000
-double arg1=0x7f2096800400
-double arg2=0x7f2096800800
-double arg3=0x100000064
-int arg4=0x7ffc2a866690
-Stack:
-00000000002cb480: cudaLaunchKernel @ 0x2cb480+0x0
-000000000026a050: main @ 0x26a050+0x912
-000000000002c5f0: libc_start_call_main @ 0x2c5f0+0x67
-000000000002c690: libc_start_main_alias_2 @ 0x2c690+0x88
-0000000000269330: _start @ 0x269330+0x21
+Update the demo `collect_trace.sh` with the correct `$REPO` path. The script compiles and runs the CUDA program above and traces it:
-...
+
+```bash
+$ ./collect_trace.sh > sample-trace.out
+```
+
+A sample trace file (`sample-trace.out`) is provided for testing.
- ```
-## Contributing
+
+## 📊 CUDA Trace Analysis (LLM-Enhanced)
-We welcome contributions from the community. To contribute:
+
+This toolset parses and analyzes CUDA traces using visualizations and LLMs like OpenAI GPT.
-1. Fork the repository and create a new branch from `main`.
-2. Make your changes, ensuring they adhere to the project's coding standards.
-3. Submit a pull request, including a detailed description of your changes.
+
+---
-For more information, please refer to the [Contributing Guide](https://github.com/facebookincubator/strobelight/blob/main/CONTRIBUTING.md).
+
+### 🔧 Setup Environment
-## License
+
+```bash
+cd llm-analysis
+python3 -m venv venv && source venv/bin/activate
+pip install -r requirements.txt
+```
-Strobelight is licensed under the Apache License, Version 2.0. See the [LICENSE](https://github.com/facebookincubator/strobelight/blob/main/LICENSE) file for more details.
+
+### 📈 Run Analysis
+
+With an OpenAI API key:
+
+```bash
+python ./enhanced_cuda_trace_analysis.py trace.out --llm_mode openai --api_key YOUR_API_KEY
+```
+
+Without an API key (mock mode):
+
+```bash
+python ./enhanced_cuda_trace_analysis.py trace.out --llm_mode mock
+```
+
+Results will be stored in the `cuda_analysis_results` folder.
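+
+The tool can also target a self-hosted model. A minimal sketch (assuming an OpenAI-compatible chat-completions server is already listening on the endpoint you pass; see `--llm_mode local` and `--model_endpoint` below):
+
+```bash
+# Hypothetical local-model run; the endpoint value is an assumption, adjust it to your server
+python ./enhanced_cuda_trace_analysis.py sample-trace.out \
+  --llm_mode local \
+  --model_endpoint http://localhost:8000/v1/chat/completions \
+  --output_dir ./cuda_analysis_results
+```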
+
+
+## 🧰 Command Line Options
+
+- trace_file: Path to the CUDA trace file (positional, required)
+- --output_dir: Output folder (default: ./cuda_analysis_results)
+- --llm_mode: LLM mode (mock, openai, local; default: mock)
+- --api_key: OpenAI API key (required for openai mode)
+- --model_endpoint: Local LLM API endpoint (default: http://localhost:8000/v1/chat/completions)
+- --skip_parsing: Skip trace parsing
+- --skip_analysis: Skip trace data analysis
+- --skip_visualization: Skip visualization generation
+- --test_llm: Run LLM test suite
+
+## 📂 Output Artifacts
+
+- Parsed trace data (JSON)
+- Analysis results (JSON)
+- Visualizations (PNG)
+- Enhanced dashboards
+- LLM analysis (Markdown, HTML)
+- Final summary reports
+
+Sample outputs:
+- llm-sample-results/
+- sample_llm_analysis_report.html
+
+## 📁 Project Structure
+
+```
+.
+├── strobelight/                      # Strobelight GPU profiler
+├── llm-analysis/                     # LLM analysis tools
+├── collect_trace.sh                  # CUDA trace demo script
+├── test_cuda_api_multi_gpu.cu        # Demo CUDA program for tracing
+├── sample_llm_analysis_report.html   # Example output
+├── llm-sample-results/               # Example LLM results
+└── README.md
+```
+
+## 🧠 Components in llm-analysis/
+
+- cuda_trace_parser.py – Parses trace data
+- cuda_trace_analyzer.py – Analyzes trace data and generates visualizations
+- cuda_visualization_organizer.py – Enhances visualizations and builds the dashboard
+- enhanced_cuda_llm_analyzer.py – Performs LLM analysis
+- enhanced_cuda_trace_analysis.py – CLI wrapper integrating all components
+- cuda_prompt_templates.py – Prompt templates for LLMs
+- cuda_llm_analysis_tester.py – LLM test suite
+
+## 📬 Feedback & Contributions
+
+Feedback and contributions are welcome!
+Please open an issue or pull request to help improve this GPU tracing and analysis framework.
-## Acknowledgements
-This project is maintained by Meta's engineering team and is open to community contributions. We thank all contributors for their efforts in improving this project.
diff --git a/collect_trace.sh b/collect_trace.sh
new file mode 100755
index 0000000..65ff507
--- /dev/null
+++ b/collect_trace.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+# Set variables
+CUDA_PROGRAM="test_cuda_api_multi_gpu.cu" # Source of the CUDA program that will be traced
+EXECUTABLE="test_cuda_api_multi_gpu"      # Executable of the CUDA program that will be traced
+REPO="/efs/NFLX-GENAI-PROJECTS/GPUSNOOP"  # Full path to the directory where the repository is cloned
+TRACE_DURATION=30                         # Run gpuevent_snoop for this many seconds
+GPU_EVENTSNOOP="$REPO/strobelight/strobelight/src/_build/profilers/gpuevent_snoop" # BPF user program
+
+# Step 1: Compile the CUDA program
+echo "Compiling $CUDA_PROGRAM..."
+/usr/local/cuda/bin/nvcc $CUDA_PROGRAM -o $EXECUTABLE
+if [ $? -ne 0 ]; then
+    echo "Compilation failed!"
+    exit 1
+fi
+echo "Compilation successful"
+
+# Step 2: Run the program in the background
+echo "Starting $EXECUTABLE..."
+./$EXECUTABLE &
+CUDA_PID=$!
+
+# Give the process some time to start
+sleep 3
+
+# Step 3: Verify the process is running
+if ! ps -p $CUDA_PID > /dev/null; then
+    echo "Error: CUDA process ($CUDA_PID) is not running!"
+    exit 1
+fi
+echo "CUDA process running with PID: $CUDA_PID"
+
+# Step 4: Run gpuevent_snoop for TRACE_DURATION seconds
+echo "Running gpuevent_snoop for $TRACE_DURATION seconds..."
+sudo $GPU_EVENTSNOOP -p $CUDA_PID -a -s -v --duration=$TRACE_DURATION
+
+# Step 5: Kill the CUDA program after tracing (optional)
+echo "Stopping CUDA program..."
+kill $CUDA_PID
+
+echo "Tracing completed."
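+
+# Optional follow-up (sketch, commented out): once the trace has been captured,
+# e.g. by running `./collect_trace.sh > sample-trace.out`, it can be fed into the
+# LLM analysis pipeline under llm-analysis/. The paths and mock mode below are
+# illustrative assumptions, not part of this script's default behavior.
+# cd "$REPO/llm-analysis"
+# python3 enhanced_cuda_trace_analysis.py "$REPO/sample-trace.out" --llm_mode mock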
+ diff --git a/llm-analysis/README.md b/llm-analysis/README.md new file mode 100644 index 0000000..9359c2b --- /dev/null +++ b/llm-analysis/README.md @@ -0,0 +1,52 @@ +# CUDA Trace Analysis Tool + +This tool analyzes CUDA trace files generated by BPF programs, providing detailed insights through visualizations and LLM-based analysis. + +## Installation + +```bash +pip install -r requirements.txt +``` + +## Usage + +```bash +# Basic usage +python enhanced_cuda_trace_analysis.py /path/to/trace.out --output_dir ./output_directory --llm_mode mock + +# With OpenAI API for enhanced analysis +python enhanced_cuda_trace_analysis.py /path/to/trace.out --output_dir ./output_directory --llm_mode openai --api_key YOUR_API_KEY +``` + +## Command Line Options + +- `trace_file`: Path to the CUDA trace file (required) +- `--output_dir`: Output directory for analysis results (default: ./cuda_analysis_results) +- `--llm_mode`: LLM analysis mode (choices: mock, openai, local; default: mock) +- `--api_key`: OpenAI API key (required for openai mode) +- `--model_endpoint`: Local LLM API endpoint (for local mode; default: http://localhost:8000/v1/chat/completions) +- `--skip_parsing`: Skip trace file parsing (use existing parsed data) +- `--skip_analysis`: Skip trace data analysis (use existing analysis results) +- `--skip_visualization`: Skip visualization enhancement (use existing enhanced visualizations) +- `--test_llm`: Test LLM analysis using the testing framework + +## Output + +The tool generates the following outputs in the specified output directory: +- Parsed trace data (JSON) +- Analysis results (JSON) +- Visualizations (PNG) +- Enhanced visualizations and dashboard +- LLM analysis reports (Markdown) +- HTML report with integrated visualizations and analysis +- Final summary report (Markdown) + +## Components + +- `cuda_trace_parser.py`: Parses CUDA trace files into structured data +- `cuda_trace_analyzer.py`: Analyzes trace data and generates visualizations +- `cuda_visualization_organizer.py`: Enhances visualizations and creates dashboard +- `enhanced_cuda_llm_analyzer.py`: Performs LLM-based analysis of trace data +- `enhanced_cuda_trace_analysis.py`: Main program integrating all components +- `cuda_prompt_templates.py`: Templates for LLM prompts +- `cuda_llm_analysis_tester.py`: Testing framework for LLM analysis diff --git a/llm-analysis/cuda_llm_analysis_tester.py b/llm-analysis/cuda_llm_analysis_tester.py new file mode 100755 index 0000000..8b9241b --- /dev/null +++ b/llm-analysis/cuda_llm_analysis_tester.py @@ -0,0 +1,656 @@ +#!/usr/bin/env python3 +""" +CUDA Trace LLM Analysis Testing Framework - Test and evaluate LLM analysis quality +""" + +import os +import json +import argparse +import time +import re +from datetime import datetime +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from PIL import Image +import io +import base64 +import requests + +from enhanced_cuda_llm_analyzer import EnhancedCUDATraceLLMAnalyzer +from cuda_prompt_templates import CUDAPromptTemplates + +class CUDALLMAnalysisTester: + """Test and evaluate LLM analysis quality for CUDA trace data""" + + def __init__(self, analysis_dir, enhanced_dir=None, output_dir=None): + """Initialize the tester with the path to the analysis results directory""" + self.analysis_dir = analysis_dir + self.enhanced_dir = enhanced_dir or os.path.join(analysis_dir, "enhanced") + self.output_dir = output_dir or os.path.join(analysis_dir, "test_results") + + # Create output directory if it doesn't exist + if not 
os.path.exists(self.output_dir): + os.makedirs(self.output_dir) + + # Initialize the analyzer and prompt templates + self.analyzer = EnhancedCUDATraceLLMAnalyzer(analysis_dir, enhanced_dir) + self.prompt_templates = CUDAPromptTemplates() + + # Load analysis results + self.analysis_results = self._load_analysis_results() + self.summary = self._load_summary() + + def _load_analysis_results(self): + """Load the analysis results from JSON file""" + results_path = os.path.join(self.analysis_dir, "analysis_results.json") + if os.path.exists(results_path): + with open(results_path, 'r') as f: + return json.load(f) + return {} + + def _load_summary(self): + """Load the analysis summary from JSON file""" + summary_path = os.path.join(self.analysis_dir, "analysis_summary.json") + if os.path.exists(summary_path): + with open(summary_path, 'r') as f: + return json.load(f) + return {} + + def test_mock_analysis(self): + """Test the mock analysis functionality""" + print("Testing mock analysis...") + + start_time = time.time() + section_analyses = self.analyzer.generate_mock_analysis() + end_time = time.time() + + test_results = { + "test_name": "mock_analysis", + "execution_time": end_time - start_time, + "sections_analyzed": list(section_analyses.keys()), + "section_word_counts": {section: len(analysis.split()) for section, analysis in section_analyses.items()}, + "total_word_count": sum(len(analysis.split()) for analysis in section_analyses.values()), + "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S") + } + + # Save test results + output_path = os.path.join(self.output_dir, "mock_analysis_test_results.json") + with open(output_path, 'w') as f: + json.dump(test_results, f, indent=2) + + print(f"Mock analysis test completed in {test_results['execution_time']:.2f} seconds") + print(f"Total word count: {test_results['total_word_count']}") + print(f"Test results saved to {output_path}") + + return test_results + + def test_openai_analysis(self, api_key=None): + """Test the OpenAI analysis functionality""" + if not api_key: + api_key = os.environ.get("OPENAI_API_KEY") + if not api_key: + print("Error: OpenAI API key not provided. 
Please set the OPENAI_API_KEY environment variable or provide it as an argument.") + return None + + print("Testing OpenAI analysis...") + + start_time = time.time() + section_analyses = self.analyzer.analyze_with_openai(api_key) + end_time = time.time() + + if not section_analyses: + print("Error: OpenAI analysis failed.") + return None + + test_results = { + "test_name": "openai_analysis", + "execution_time": end_time - start_time, + "sections_analyzed": list(section_analyses.keys()), + "section_word_counts": {section: len(analysis.split()) for section, analysis in section_analyses.items()}, + "total_word_count": sum(len(analysis.split()) for analysis in section_analyses.values()), + "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S") + } + + # Save test results + output_path = os.path.join(self.output_dir, "openai_analysis_test_results.json") + with open(output_path, 'w') as f: + json.dump(test_results, f, indent=2) + + print(f"OpenAI analysis test completed in {test_results['execution_time']:.2f} seconds") + print(f"Total word count: {test_results['total_word_count']}") + print(f"Test results saved to {output_path}") + + # Evaluate the quality of the analysis + quality_metrics = self.evaluate_analysis_quality(section_analyses) + + # Save quality metrics + quality_path = os.path.join(self.output_dir, "openai_analysis_quality_metrics.json") + with open(quality_path, 'w') as f: + json.dump(quality_metrics, f, indent=2) + + print(f"Analysis quality metrics saved to {quality_path}") + + return test_results, quality_metrics + + def test_local_llm_analysis(self, model_endpoint="http://localhost:8000/v1/chat/completions"): + """Test the local LLM analysis functionality""" + print(f"Testing local LLM analysis with endpoint {model_endpoint}...") + + start_time = time.time() + section_analyses = self.analyzer.analyze_with_local_llm(model_endpoint) + end_time = time.time() + + if not section_analyses: + print("Error: Local LLM analysis failed.") + return None + + test_results = { + "test_name": "local_llm_analysis", + "execution_time": end_time - start_time, + "sections_analyzed": list(section_analyses.keys()), + "section_word_counts": {section: len(analysis.split()) for section, analysis in section_analyses.items()}, + "total_word_count": sum(len(analysis.split()) for analysis in section_analyses.values()), + "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S") + } + + # Save test results + output_path = os.path.join(self.output_dir, "local_llm_analysis_test_results.json") + with open(output_path, 'w') as f: + json.dump(test_results, f, indent=2) + + print(f"Local LLM analysis test completed in {test_results['execution_time']:.2f} seconds") + print(f"Total word count: {test_results['total_word_count']}") + print(f"Test results saved to {output_path}") + + # Evaluate the quality of the analysis + quality_metrics = self.evaluate_analysis_quality(section_analyses) + + # Save quality metrics + quality_path = os.path.join(self.output_dir, "local_llm_analysis_quality_metrics.json") + with open(quality_path, 'w') as f: + json.dump(quality_metrics, f, indent=2) + + print(f"Analysis quality metrics saved to {quality_path}") + + return test_results, quality_metrics + + def evaluate_analysis_quality(self, section_analyses): + """Evaluate the quality of the LLM analysis""" + print("Evaluating analysis quality...") + + quality_metrics = { + "section_metrics": {}, + "overall_metrics": {} + } + + # Define expected content patterns for each section + expected_patterns = { + "overview": [ + 
r"application characteristics", + r"key patterns", + r"performance considerations", + r"implementation quality" + ], + "api_distribution": [ + r"most frequent(ly)?\s+used API", + r"balance between", + r"operation types", + r"inefficient", + r"recommendations" + ], + "memory_operations": [ + r"memory transfer patterns", + r"efficiency", + r"balance between", + r"bottlenecks", + r"recommendations" + ], + "kernel_launches": [ + r"kernel launch patterns", + r"grid and block dimensions", + r"occupancy", + r"efficiency", + r"recommendations" + ], + "performance_bottlenecks": [ + r"bottleneck", + r"impact", + r"root cause", + r"recommendations", + r"performance gains" + ], + "optimization_recommendations": [ + r"recommendation", + r"implementation", + r"code example", + r"expected impact", + r"performance improvement" + ] + } + + # Check for code examples in optimization recommendations + code_pattern = r"```(cuda|c\+\+)?\s.*?```" + + # Evaluate each section + for section, analysis in section_analyses.items(): + if section not in expected_patterns: + continue + + section_metrics = { + "word_count": len(analysis.split()), + "pattern_matches": {} + } + + # Check for expected patterns + for pattern in expected_patterns[section]: + matches = re.findall(pattern, analysis.lower()) + section_metrics["pattern_matches"][pattern] = len(matches) + + # Calculate pattern coverage + total_patterns = len(expected_patterns[section]) + matched_patterns = sum(1 for count in section_metrics["pattern_matches"].values() if count > 0) + section_metrics["pattern_coverage"] = matched_patterns / total_patterns if total_patterns > 0 else 0 + + # Check for code examples in optimization recommendations + if section == "optimization_recommendations": + code_matches = re.findall(code_pattern, analysis, re.DOTALL) + section_metrics["code_examples"] = len(code_matches) + + quality_metrics["section_metrics"][section] = section_metrics + + # Calculate overall metrics + total_word_count = sum(metrics["word_count"] for metrics in quality_metrics["section_metrics"].values()) + avg_pattern_coverage = sum(metrics["pattern_coverage"] for metrics in quality_metrics["section_metrics"].values()) / len(quality_metrics["section_metrics"]) + + quality_metrics["overall_metrics"] = { + "total_word_count": total_word_count, + "average_pattern_coverage": avg_pattern_coverage, + "sections_analyzed": len(quality_metrics["section_metrics"]), + "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S") + } + + return quality_metrics + + def compare_analysis_modes(self, test_results_list): + """Compare different analysis modes based on test results""" + print("Comparing analysis modes...") + + if len(test_results_list) < 2: + print("Error: At least two test results are required for comparison.") + return None + + # Extract data for comparison + modes = [results["test_name"] for results in test_results_list] + execution_times = [results["execution_time"] for results in test_results_list] + word_counts = [results["total_word_count"] for results in test_results_list] + + # Create comparison metrics + comparison = { + "modes": modes, + "execution_times": execution_times, + "word_counts": word_counts, + "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S") + } + + # Generate comparison charts + self._generate_comparison_charts(comparison) + + # Save comparison results + output_path = os.path.join(self.output_dir, "analysis_modes_comparison.json") + with open(output_path, 'w') as f: + json.dump(comparison, f, indent=2) + + print(f"Analysis modes 
comparison saved to {output_path}") + + return comparison + + def _generate_comparison_charts(self, comparison): + """Generate charts comparing different analysis modes""" + modes = comparison["modes"] + execution_times = comparison["execution_times"] + word_counts = comparison["word_counts"] + + # Create figure with two subplots + fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5)) + + # Execution time comparison + ax1.bar(modes, execution_times, color='skyblue') + ax1.set_title('Execution Time Comparison') + ax1.set_ylabel('Time (seconds)') + ax1.set_xlabel('Analysis Mode') + + # Word count comparison + ax2.bar(modes, word_counts, color='lightgreen') + ax2.set_title('Word Count Comparison') + ax2.set_ylabel('Total Words') + ax2.set_xlabel('Analysis Mode') + + plt.tight_layout() + + # Save the comparison chart + output_path = os.path.join(self.output_dir, "analysis_modes_comparison.png") + plt.savefig(output_path) + plt.close() + + print(f"Comparison charts saved to {output_path}") + + def test_prompt_variations(self, api_key=None, section="overview"): + """Test variations of prompts for a specific section to optimize prompt engineering""" + if not api_key: + api_key = os.environ.get("OPENAI_API_KEY") + if not api_key: + print("Error: OpenAI API key not provided. Please set the OPENAI_API_KEY environment variable or provide it as an argument.") + return None + + print(f"Testing prompt variations for section: {section}...") + + # Define prompt variations + variations = { + "standard": self.prompt_templates.get_template(section), + "concise": self._create_concise_variation(section), + "detailed": self._create_detailed_variation(section), + "structured": self._create_structured_variation(section) + } + + results = {} + + # Test each variation + for variation_name, prompt in variations.items(): + print(f"Testing {variation_name} variation...") + + # Prepare the API request + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {api_key}" + } + + # Prepare the API request payload + payload = { + "model": "gpt-4", + "messages": [ + { + "role": "user", + "content": prompt + } + ], + "max_tokens": 2000 + } + + try: + # Make the API request + start_time = time.time() + response = requests.post( + "https://api.openai.com/v1/chat/completions", + headers=headers, + json=payload + ) + end_time = time.time() + + # Check for errors + response.raise_for_status() + + # Parse the response + result = response.json() + + if "choices" in result and len(result["choices"]) > 0: + analysis = result["choices"][0]["message"]["content"] + + # Save the analysis + output_path = os.path.join(self.output_dir, f"{section}_{variation_name}_variation.md") + with open(output_path, "w") as f: + f.write(analysis) + + # Calculate metrics + word_count = len(analysis.split()) + execution_time = end_time - start_time + + # Evaluate quality + quality_metrics = self._evaluate_single_analysis_quality(analysis, section) + + results[variation_name] = { + "word_count": word_count, + "execution_time": execution_time, + "quality_metrics": quality_metrics + } + + print(f"{variation_name} variation test completed in {execution_time:.2f} seconds") + print(f"Word count: {word_count}") + else: + print(f"Error: Unexpected response format from OpenAI API for {variation_name} variation") + + except Exception as e: + print(f"Error calling OpenAI API for {variation_name} variation: {e}") + + # Compare variations + self._compare_prompt_variations(results, section) + + # Save results + output_path = 
os.path.join(self.output_dir, f"{section}_prompt_variations_results.json") + with open(output_path, 'w') as f: + json.dump(results, f, indent=2) + + print(f"Prompt variation test results saved to {output_path}") + + return results + + def _create_concise_variation(self, section): + """Create a more concise variation of the prompt for the specified section""" + template = self.prompt_templates.get_template(section) + + # Simplify the prompt by removing detailed explanations and examples + concise = re.sub(r"For reference.*?(?=\n\n)", "", template, flags=re.DOTALL) + concise = re.sub(r"## Output Format.*?(?=\n\n)", "## Output Format\n\nProvide a concise analysis focusing on the most important aspects.", concise, flags=re.DOTALL) + + return concise + + def _create_detailed_variation(self, section): + """Create a more detailed variation of the prompt for the specified section""" + template = self.prompt_templates.get_template(section) + + # Add more detailed instructions and examples + detailed = template + "\n\n## Additional Guidance\n\nPlease provide a very detailed analysis with specific examples from the data. Include quantitative assessments where possible and make sure to thoroughly explain all recommendations. Consider edge cases and potential trade-offs in your analysis." + + return detailed + + def _create_structured_variation(self, section): + """Create a more structured variation of the prompt for the specified section""" + template = self.prompt_templates.get_template(section) + + # Add more structured output format requirements + structured = re.sub(r"## Output Format.*?(?=\n\n)", """## Output Format + +Please structure your response using the following exact format: + +``` +# [Section Title] + +## Key Findings +1. [First key finding] +2. [Second key finding] +3. [Third key finding] + +## Detailed Analysis +[Detailed analysis with subsections] + +## Recommendations +1. [First recommendation] + - Implementation: [How to implement] + - Impact: [Expected impact] +2. [Second recommendation] + - Implementation: [How to implement] + - Impact: [Expected impact] +3. [Third recommendation] + - Implementation: [How to implement] + - Impact: [Expected impact] +``` + +Follow this structure exactly, filling in the appropriate content for each section. 
+""", template, flags=re.DOTALL) + + return structured + + def _evaluate_single_analysis_quality(self, analysis, section): + """Evaluate the quality of a single analysis""" + # Define expected content patterns for the section + expected_patterns = { + "overview": [ + r"application characteristics", + r"key patterns", + r"performance considerations", + r"implementation quality" + ], + "api_distribution": [ + r"most frequent(ly)?\s+used API", + r"balance between", + r"operation types", + r"inefficient", + r"recommendations" + ], + "memory_operations": [ + r"memory transfer patterns", + r"efficiency", + r"balance between", + r"bottlenecks", + r"recommendations" + ], + "kernel_launches": [ + r"kernel launch patterns", + r"grid and block dimensions", + r"occupancy", + r"efficiency", + r"recommendations" + ], + "performance_bottlenecks": [ + r"bottleneck", + r"impact", + r"root cause", + r"recommendations", + r"performance gains" + ], + "optimization_recommendations": [ + r"recommendation", + r"implementation", + r"code example", + r"expected impact", + r"performance improvement" + ] + } + + if section not in expected_patterns: + return {} + + metrics = { + "pattern_matches": {} + } + + # Check for expected patterns + for pattern in expected_patterns[section]: + matches = re.findall(pattern, analysis.lower()) + metrics["pattern_matches"][pattern] = len(matches) + + # Calculate pattern coverage + total_patterns = len(expected_patterns[section]) + matched_patterns = sum(1 for count in metrics["pattern_matches"].values() if count > 0) + metrics["pattern_coverage"] = matched_patterns / total_patterns if total_patterns > 0 else 0 + + # Check for code examples in optimization recommendations + if section == "optimization_recommendations": + code_pattern = r"```(cuda|c\+\+)?\s.*?```" + code_matches = re.findall(code_pattern, analysis, re.DOTALL) + metrics["code_examples"] = len(code_matches) + + # Check for structure (headings, lists) + heading_pattern = r"#+\s+.+" + heading_matches = re.findall(heading_pattern, analysis) + metrics["headings"] = len(heading_matches) + + list_pattern = r"^\s*\d+\.\s+.+|^\s*-\s+.+" + list_matches = re.findall(list_pattern, analysis, re.MULTILINE) + metrics["list_items"] = len(list_matches) + + return metrics + + def _compare_prompt_variations(self, results, section): + """Compare different prompt variations based on test results""" + if len(results) < 2: + print("Error: At least two variation results are required for comparison.") + return + + # Extract data for comparison + variations = list(results.keys()) + execution_times = [results[var]["execution_time"] for var in variations] + word_counts = [results[var]["word_count"] for var in variations] + pattern_coverages = [results[var]["quality_metrics"]["pattern_coverage"] for var in variations] + + # Create figure with three subplots + fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5)) + + # Execution time comparison + ax1.bar(variations, execution_times, color='skyblue') + ax1.set_title('Execution Time Comparison') + ax1.set_ylabel('Time (seconds)') + ax1.set_xlabel('Prompt Variation') + plt.setp(ax1.get_xticklabels(), rotation=45, ha='right') + + # Word count comparison + ax2.bar(variations, word_counts, color='lightgreen') + ax2.set_title('Word Count Comparison') + ax2.set_ylabel('Total Words') + ax2.set_xlabel('Prompt Variation') + plt.setp(ax2.get_xticklabels(), rotation=45, ha='right') + + # Pattern coverage comparison + ax3.bar(variations, pattern_coverages, color='salmon') + ax3.set_title('Pattern Coverage 
Comparison') + ax3.set_ylabel('Coverage (0-1)') + ax3.set_xlabel('Prompt Variation') + plt.setp(ax3.get_xticklabels(), rotation=45, ha='right') + + plt.tight_layout() + + # Save the comparison chart + output_path = os.path.join(self.output_dir, f"{section}_prompt_variations_comparison.png") + plt.savefig(output_path) + plt.close() + + print(f"Prompt variations comparison chart saved to {output_path}") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Test and evaluate LLM analysis quality for CUDA trace data") + parser.add_argument("--analysis_dir", required=True, help="Path to analysis results directory") + parser.add_argument("--enhanced_dir", default=None, help="Path to enhanced visualizations directory") + parser.add_argument("--output_dir", default=None, help="Path to output directory for test results") + parser.add_argument("--test_mock", action="store_true", help="Test mock analysis") + parser.add_argument("--test_openai", action="store_true", help="Test OpenAI analysis") + parser.add_argument("--test_local", action="store_true", help="Test local LLM analysis") + parser.add_argument("--api_key", help="OpenAI API key (for OpenAI tests)") + parser.add_argument("--model_endpoint", default="http://localhost:8000/v1/chat/completions", help="Local LLM API endpoint (for local tests)") + parser.add_argument("--test_prompt_variations", action="store_true", help="Test prompt variations") + parser.add_argument("--section", default="overview", help="Section to test prompt variations for") + + args = parser.parse_args() + + tester = CUDALLMAnalysisTester(args.analysis_dir, args.enhanced_dir, args.output_dir) + + test_results = [] + + if args.test_mock: + mock_results = tester.test_mock_analysis() + test_results.append(mock_results) + + if args.test_openai: + openai_results = tester.test_openai_analysis(args.api_key) + if openai_results: + test_results.append(openai_results[0]) + + if args.test_local: + local_results = tester.test_local_llm_analysis(args.model_endpoint) + if local_results: + test_results.append(local_results[0]) + + if len(test_results) >= 2: + tester.compare_analysis_modes(test_results) + + if args.test_prompt_variations: + tester.test_prompt_variations(args.api_key, args.section) diff --git a/llm-analysis/cuda_prompt_templates.py b/llm-analysis/cuda_prompt_templates.py new file mode 100755 index 0000000..b9b0018 --- /dev/null +++ b/llm-analysis/cuda_prompt_templates.py @@ -0,0 +1,389 @@ +#!/usr/bin/env python3 +""" +CUDA Trace LLM Prompt Templates - Enhanced prompt templates for LLM analysis of CUDA trace data +""" + +import json +import os + +class CUDAPromptTemplates: + """Provides enhanced prompt templates for LLM analysis of CUDA trace data""" + + def __init__(self): + """Initialize the prompt templates""" + self.templates = self._load_default_templates() + + def _load_default_templates(self): + """Load the default prompt templates""" + return { + "overview": self._get_overview_template(), + "api_distribution": self._get_api_distribution_template(), + "memory_operations": self._get_memory_operations_template(), + "kernel_launches": self._get_kernel_launches_template(), + "performance_bottlenecks": self._get_performance_bottlenecks_template(), + "optimization_recommendations": self._get_optimization_recommendations_template() + } + + def _get_overview_template(self): + """Get the template for the overview section""" + return """ + # CUDA Trace Analysis - Overview + + I need a comprehensive overview of the CUDA trace data provided. 
The data comes from a BPF program that traces various CUDA API routines. + + ## Analysis Task + + Please provide a high-level executive summary of the CUDA trace data that includes: + + 1. The main characteristics of the application based on its CUDA API usage + 2. The most significant patterns observed in the trace data + 3. The key performance considerations identified + 4. A brief assessment of the application's CUDA implementation quality + + ## Available Data + + ### Summary Statistics + + - Total trace entries: {total_trace_entries} + - Unique API functions: {unique_api_functions} + - Unique kernels: {unique_kernels} + - Trace duration: {trace_duration_seconds} seconds + + ### Dashboard Visualization + + The dashboard visualization (attached as an image) provides an overview of all aspects of the CUDA trace analysis, including API distribution, kernel launches, memory operations, and performance insights. + + ## Output Format + + Please structure your response as follows: + + 1. **Application Characteristics**: A paragraph describing the main characteristics of the application based on its CUDA API usage. + 2. **Key Patterns Observed**: A list of the most significant patterns observed in the trace data. + 3. **Performance Considerations**: A list of key performance considerations identified from the trace data. + 4. **Implementation Quality Assessment**: A paragraph assessing the quality of the CUDA implementation. + + Focus on providing a concise but comprehensive overview that highlights the most important aspects of the trace data. This overview will serve as an executive summary for the detailed analysis that follows. + """ + + def _get_api_distribution_template(self): + """Get the template for the API distribution section""" + return """ + # CUDA Trace Analysis - API Distribution + + I need a detailed analysis of the CUDA API distribution in the trace data. + + ## Analysis Task + + Please analyze the API distribution data and provide: + + 1. Insights into the most frequently used CUDA API functions and what they indicate about the application + 2. Analysis of the balance between different types of operations (compute, memory, synchronization) + 3. Identification of any unusual or inefficient API usage patterns + 4. Recommendations for optimizing the API usage + + ## Available Data + + ### API Distribution Table + + {api_distribution_table} + + ### API Distribution Visualization + + The API distribution visualization (attached as an image) shows the frequency of different CUDA API calls. Pay special attention to: + + - The relative proportions of different API types + - The dominance of specific API calls + - Any unusual patterns in the distribution + + ### API Categories + + For reference, CUDA API functions can be categorized as follows: + + - **Compute Operations**: cudaLaunchKernel, cudaLaunchCooperativeKernel + - **Memory Operations**: cudaMalloc, cudaFree, cudaMemcpy, cudaMemcpyAsync, cudaMemset + - **Synchronization**: cudaStreamSynchronize, cudaDeviceSynchronize, cudaEventSynchronize + - **Stream Management**: cudaStreamCreate, cudaStreamDestroy, cudaStreamWaitEvent + - **Event Management**: cudaEventCreate, cudaEventRecord, cudaEventElapsedTime + + ## Output Format + + Please structure your response as follows: + + 1. **Most Frequently Used API Functions**: Analysis of the top API functions and what they indicate about the application. + 2. **Balance Between Operation Types**: Analysis of the distribution of different operation types. + 3. 
**Inefficient API Usage Patterns**: Identification of any unusual or inefficient patterns. + 4. **Recommendations for Optimizing API Usage**: Specific recommendations for improving API usage. + + Be specific in your analysis, referencing the actual data from the table and visualization. Provide concrete recommendations that could improve the application's performance. + """ + + def _get_memory_operations_template(self): + """Get the template for the memory operations section""" + return """ + # CUDA Trace Analysis - Memory Operations + + I need a detailed analysis of the memory operations in the CUDA trace data. + + ## Analysis Task + + Please analyze the memory operations data and provide: + + 1. Assessment of the memory transfer patterns and their efficiency + 2. Analysis of the balance between different types of memory operations + 3. Identification of potential memory-related bottlenecks + 4. Recommendations for optimizing memory usage and transfers + + ## Available Data + + ### Memory Operations Table + + {memory_operations_table} + + ### Memory Operations Visualization + + The memory operations visualization (attached as an image) shows the distribution of memory-related operations. Pay special attention to: + + - The relative proportions of different memory operation types + - The balance between synchronous and asynchronous operations + - The frequency of allocation and deallocation operations + + ### Memory Operation Types + + For reference, CUDA memory operations include: + + - **Data Transfer**: cudaMemcpy (synchronous), cudaMemcpyAsync (asynchronous) + - **Memory Allocation**: cudaMalloc, cudaMallocHost, cudaMallocPitch + - **Memory Deallocation**: cudaFree, cudaFreeHost + - **Memory Setting**: cudaMemset, cudaMemsetAsync + + ## Memory Transfer Efficiency Considerations + + - **Synchronous vs. Asynchronous**: Asynchronous transfers (cudaMemcpyAsync) allow overlapping with computation + - **Transfer Size**: Larger transfers are generally more efficient than many small transfers + - **Transfer Frequency**: Frequent transfers can indicate inefficient data management + - **Pinned Memory**: Transfers using pinned host memory are more efficient + + ## Output Format + + Please structure your response as follows: + + 1. **Memory Transfer Patterns and Efficiency**: Analysis of how data is being transferred between host and device. + 2. **Balance Between Memory Operation Types**: Analysis of the distribution of different memory operation types. + 3. **Potential Memory-Related Bottlenecks**: Identification of inefficiencies in memory usage. + 4. **Recommendations for Optimizing Memory Usage**: Specific recommendations for improving memory operations. + + Be specific in your analysis, referencing the actual data from the table and visualization. Provide concrete recommendations that could improve the application's memory usage efficiency. + """ + + def _get_kernel_launches_template(self): + """Get the template for the kernel launches section""" + return """ + # CUDA Trace Analysis - Kernel Launches + + I need a detailed analysis of the kernel launch patterns in the CUDA trace data. + + ## Analysis Task + + Please analyze the kernel launch data and provide: + + 1. Assessment of the kernel launch patterns and their implications for performance + 2. Analysis of the grid and block dimensions used for kernel launches + 3. Evaluation of kernel occupancy and efficiency based on the launch parameters + 4. 
Recommendations for optimizing kernel launch configurations + + ## Available Data + + ### Kernel Distribution Table + + {kernel_distribution_table} + + ### Grid/Block Dimensions Table + + {grid_block_dimensions_table} + + ### Kernel Launch Visualization + + The kernel launch visualization (attached as an image) shows the distribution of kernel launches. Pay special attention to: + + - The relative frequency of different kernels + - The patterns in grid and block dimensions + - Any unusual or suboptimal launch configurations + + ### GPU Occupancy Considerations + + For reference, kernel launch efficiency depends on: + + - **Block Size**: Typically, block sizes between 128 and 256 threads provide good occupancy + - **Grid Size**: Should be large enough to fully utilize the GPU + - **Dimensions**: 1D, 2D, or 3D configurations should match the data structure + - **Resource Usage**: Registers and shared memory per thread affect occupancy + + ## Output Format + + Please structure your response as follows: + + 1. **Kernel Launch Patterns**: Analysis of how kernels are being launched and the implications for performance. + 2. **Grid and Block Dimensions Analysis**: Evaluation of the grid and block dimensions used. + 3. **Kernel Occupancy and Efficiency**: Assessment of how well the kernels are likely to utilize the GPU. + 4. **Recommendations for Optimizing Kernel Launches**: Specific recommendations for improving kernel launch configurations. + + Be specific in your analysis, referencing the actual data from the tables and visualization. Consider the implications of the launch patterns for GPU utilization and overall performance. Provide concrete recommendations that could improve kernel execution efficiency. + """ + + def _get_performance_bottlenecks_template(self): + """Get the template for the performance bottlenecks section""" + return """ + # CUDA Trace Analysis - Performance Bottlenecks + + I need a detailed analysis of the performance bottlenecks identified in the CUDA trace data. + + ## Analysis Task + + Please analyze the performance bottlenecks data and provide: + + 1. Detailed explanation of each identified bottleneck and its impact on performance + 2. Root cause analysis for each bottleneck + 3. Prioritized recommendations for addressing each bottleneck + 4. Potential performance gains from implementing the recommendations + + ## Available Data + + ### Performance Bottlenecks Table + + {performance_bottlenecks_table} + + ### Bottleneck Categories + + For reference, common CUDA performance bottlenecks include: + + - **Memory Transfer Overhead**: Excessive time spent transferring data between host and device + - **Synchronization Points**: Excessive synchronization limiting parallelism + - **Low GPU Occupancy**: Suboptimal kernel launch parameters leading to underutilization + - **Memory Access Patterns**: Uncoalesced memory access reducing memory throughput + - **Divergent Execution**: Warp divergence reducing computational efficiency + - **Resource Contention**: Excessive register or shared memory usage limiting occupancy + + ## Output Format + + Please structure your response as follows: + + For each bottleneck identified in the table: + + 1. **Detailed Explanation**: A thorough explanation of the bottleneck and its impact on performance. + 2. **Root Cause Analysis**: An analysis of the underlying causes of the bottleneck. + 3. **Prioritized Recommendations**: Specific, actionable recommendations for addressing the bottleneck. + 4. 
**Potential Performance Gains**: An estimate of the performance improvement that could be achieved. + + Conclude with a summary of all bottlenecks in order of priority, with an overall assessment of the potential performance improvement if all recommendations are implemented. + + Be specific in your analysis, referencing the actual data from the table. Provide concrete, actionable recommendations that could improve the application's performance. + """ + + def _get_optimization_recommendations_template(self): + """Get the template for the optimization recommendations section""" + return """ + # CUDA Trace Analysis - Optimization Recommendations + + I need comprehensive optimization recommendations based on the CUDA trace data analysis. + + ## Analysis Task + + Please provide: + + 1. A prioritized list of optimization recommendations + 2. Detailed explanation of each recommendation and its expected impact + 3. Implementation guidance for each recommendation, including code examples where applicable + 4. Estimation of potential performance improvements for each recommendation + + ## Available Data + + The recommendations should be based on all the analyses performed on the trace data, including: + - API distribution analysis + - Memory operations analysis + - Kernel launch analysis + - Performance bottlenecks analysis + + ## CUDA Optimization Best Practices + + For reference, consider these CUDA optimization best practices: + + - **Memory Transfers**: Minimize host-device transfers, use pinned memory, batch transfers + - **Asynchronous Execution**: Use streams for overlapping operations, minimize synchronization + - **Memory Access Patterns**: Ensure coalesced memory access, use shared memory for data reuse + - **Kernel Launch Configuration**: Optimize block size for occupancy, match grid dimensions to data + - **Resource Usage**: Manage register and shared memory usage to maximize occupancy + - **Algorithm Design**: Restructure algorithms to maximize parallelism and minimize dependencies + + ## Output Format + + Please structure your response as follows: + + For each recommendation (in order of priority): + + 1. **Recommendation**: A clear statement of the recommended optimization. + 2. **Detailed Explanation**: Why this optimization is important and how it addresses issues in the trace data. + 3. **Implementation Guidance**: Specific steps to implement the recommendation, including code examples. + 4. **Expected Impact**: An estimate of the performance improvement that could be achieved. + + Conclude with a summary of the expected overall performance improvement if all recommendations are implemented. + + Be specific and actionable in your recommendations. Provide concrete code examples that demonstrate how to implement each optimization. Focus on recommendations that are likely to have the most significant impact on performance. 
+ """ + + def get_template(self, section, data=None): + """Get a template for a specific section with data filled in""" + if section not in self.templates: + raise ValueError(f"Unknown section: {section}") + + template = self.templates[section] + + if data: + # Fill in the template with the provided data + template = template.format(**data) + + return template + + def save_templates(self, output_dir): + """Save all templates to files in the specified directory""" + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + for section, template in self.templates.items(): + output_path = os.path.join(output_dir, f"{section}_template.txt") + with open(output_path, "w") as f: + f.write(template) + + print(f"Templates saved to {output_dir}") + + def load_templates(self, input_dir): + """Load templates from files in the specified directory""" + templates = {} + + for section in self.templates.keys(): + input_path = os.path.join(input_dir, f"{section}_template.txt") + if os.path.exists(input_path): + with open(input_path, "r") as f: + templates[section] = f.read() + + if templates: + self.templates.update(templates) + print(f"Templates loaded from {input_dir}") + else: + print(f"No templates found in {input_dir}") + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Manage CUDA trace LLM prompt templates") + parser.add_argument("--save", help="Directory to save templates to") + parser.add_argument("--load", help="Directory to load templates from") + + args = parser.parse_args() + + templates = CUDAPromptTemplates() + + if args.save: + templates.save_templates(args.save) + + if args.load: + templates.load_templates(args.load) diff --git a/llm-analysis/cuda_trace_analyzer.py b/llm-analysis/cuda_trace_analyzer.py new file mode 100644 index 0000000..f0e70c4 --- /dev/null +++ b/llm-analysis/cuda_trace_analyzer.py @@ -0,0 +1,409 @@ +#!/usr/bin/env python3 +""" +CUDA Trace Analyzer - Analyzes CUDA trace data and generates insights +""" + +import pandas as pd +import json +import matplotlib.pyplot as plt +import seaborn as sns +from collections import defaultdict, Counter +import numpy as np +import os +import argparse +import sys +from datetime import datetime +import matplotlib.ticker as ticker +import matplotlib.dates as mdates +from matplotlib.colors import LinearSegmentedColormap +import re + +class CUDATraceAnalyzer: + """Analyzer for CUDA trace data""" + + def __init__(self, data_path, output_dir=None): + """Initialize the analyzer with the path to the parsed data""" + self.data_path = data_path + self.df = None + self.analysis_results = {} + + # Use provided output directory or default to current directory + if output_dir: + self.output_dir = output_dir + else: + self.output_dir = os.path.join(os.path.dirname(os.path.abspath(data_path)), "analysis_results") + + # Create output directory if it doesn't exist + if not os.path.exists(self.output_dir): + os.makedirs(self.output_dir) + + print(f"Analysis results will be saved to: {self.output_dir}") + + def load_data(self): + """Load the parsed trace data""" + print(f"Loading data from {self.data_path}") + + # Check if the data file exists + if not os.path.exists(self.data_path): + raise FileNotFoundError(f"Parsed trace data file not found: {self.data_path}") + + with open(self.data_path, 'r') as f: + data = json.load(f) + + self.df = pd.DataFrame(data) + + # Convert timestamp to datetime + if 'timestamp' in self.df.columns: + self.df['datetime'] = pd.to_datetime(self.df['timestamp']) + + # Calculate 
relative time from the first timestamp + first_time = self.df['datetime'].min() + self.df['relative_time'] = (self.df['datetime'] - first_time).dt.total_seconds() + + # Clean up grid and block dimensions + for col in ['grid_x', 'grid_y', 'grid_z', 'block_x', 'block_y', 'block_z']: + if col in self.df.columns: + # Convert string values to numeric where possible + self.df[col] = pd.to_numeric(self.df[col], errors='ignore') + + # Extract kernel name from args if available + if 'args' in self.df.columns: + self.df['kernel_name'] = self.df['args'].apply( + lambda x: re.match(r'^(\w+)', str(x)).group(1) if isinstance(x, str) and re.match(r'^(\w+)', str(x)) else None + ) + + print(f"Loaded {len(self.df)} trace entries") + return self.df + + def analyze(self): + """Perform comprehensive analysis on the trace data""" + if self.df is None: + self.load_data() + + print("Performing analysis on trace data...") + + # Basic statistics + self.analyze_api_distribution() + self.analyze_kernel_launches() + self.analyze_temporal_patterns() + self.analyze_memory_operations() + self.analyze_stack_traces() + + print("Analysis complete") + return self.analysis_results + + def analyze_api_distribution(self): + """Analyze the distribution of CUDA API calls""" + print("Analyzing CUDA API call distribution...") + + # Count occurrences of each CUDA API function + api_counts = self.df['cuda_api_function'].value_counts().reset_index() + api_counts.columns = ['API Function', 'Count'] + + # Save to analysis results + self.analysis_results['api_distribution'] = api_counts.to_dict('records') + + # Create visualization + plt.figure(figsize=(12, 8)) + sns.barplot(x='Count', y='API Function', data=api_counts.head(15), palette='viridis') + plt.title('Top 15 CUDA API Functions by Frequency') + plt.tight_layout() + plt.savefig(f"{self.output_dir}/api_distribution.png", dpi=300) + plt.close() + + return api_counts + + def analyze_kernel_launches(self): + """Analyze kernel launch patterns""" + print("Analyzing kernel launch patterns...") + + # Filter for kernel launches + kernel_launches = self.df[self.df['cuda_api_function'] == 'cudaLaunchKernel'].copy() + + if len(kernel_launches) == 0: + print("No kernel launches found in the trace data") + return None + + # Analyze kernel names + if 'kernel_name' in kernel_launches.columns: + kernel_name_counts = kernel_launches['kernel_name'].value_counts().reset_index() + kernel_name_counts.columns = ['Kernel Name', 'Count'] + self.analysis_results['kernel_name_distribution'] = kernel_name_counts.to_dict('records') + + # Create visualization + plt.figure(figsize=(12, 8)) + sns.barplot(x='Count', y='Kernel Name', data=kernel_name_counts.head(15), palette='magma') + plt.title('Kernel Launch Frequency by Kernel Name') + plt.tight_layout() + plt.savefig(f"{self.output_dir}/kernel_name_distribution.png", dpi=300) + plt.close() + + # Analyze grid and block dimensions + grid_block_dims = [] + for col in ['grid_x', 'grid_y', 'grid_z', 'block_x', 'block_y', 'block_z']: + if col in kernel_launches.columns: + # Get the most common values + top_values = kernel_launches[col].value_counts().head(5).reset_index() + top_values.columns = ['Value', 'Count'] + top_values['Dimension'] = col + grid_block_dims.append(top_values) + + if grid_block_dims: + grid_block_df = pd.concat(grid_block_dims) + self.analysis_results['grid_block_dimensions'] = grid_block_df.to_dict('records') + + # Create visualization for grid dimensions + plt.figure(figsize=(15, 10)) + grid_dims = 
grid_block_df[grid_block_df['Dimension'].str.startswith('grid_')] + if not grid_dims.empty: + sns.barplot(x='Value', y='Count', hue='Dimension', data=grid_dims, palette='cool') + plt.title('Distribution of Grid Dimensions in Kernel Launches') + plt.tight_layout() + plt.savefig(f"{self.output_dir}/grid_dimensions.png", dpi=300) + plt.close() + + # Create visualization for block dimensions + plt.figure(figsize=(15, 10)) + block_dims = grid_block_df[grid_block_df['Dimension'].str.startswith('block_')] + if not block_dims.empty: + sns.barplot(x='Value', y='Count', hue='Dimension', data=block_dims, palette='plasma') + plt.title('Distribution of Block Dimensions in Kernel Launches') + plt.tight_layout() + plt.savefig(f"{self.output_dir}/block_dimensions.png", dpi=300) + plt.close() + + return kernel_launches + + def analyze_temporal_patterns(self): + """Analyze temporal patterns in the trace data""" + print("Analyzing temporal patterns...") + + if 'relative_time' not in self.df.columns: + print("No temporal data available for analysis") + return None + + # Create a timeline of API calls + timeline_data = self.df.copy() + + # Group by timestamp and count API calls + timeline = timeline_data.groupby('relative_time')['cuda_api_function'].count().reset_index() + timeline.columns = ['Relative Time (s)', 'API Call Count'] + + self.analysis_results['temporal_distribution'] = timeline.to_dict('records') + + # Create visualization + plt.figure(figsize=(15, 8)) + plt.plot(timeline['Relative Time (s)'], timeline['API Call Count'], marker='o', linestyle='-', color='blue') + plt.title('CUDA API Call Frequency Over Time') + plt.xlabel('Relative Time (seconds)') + plt.ylabel('Number of API Calls') + plt.grid(True, alpha=0.3) + plt.tight_layout() + plt.savefig(f"{self.output_dir}/api_call_timeline.png", dpi=300) + plt.close() + + # Analyze API call distribution over time + if len(timeline_data) > 0: + # Get top 5 API functions + top_apis = timeline_data['cuda_api_function'].value_counts().head(5).index.tolist() + + # Filter for top APIs + top_api_data = timeline_data[timeline_data['cuda_api_function'].isin(top_apis)] + + # Create a pivot table for the heatmap + api_time_pivot = pd.pivot_table( + top_api_data, + values='pid', # Just need a column to count + index='cuda_api_function', + columns=pd.cut(top_api_data['relative_time'], bins=10), + aggfunc='count', + fill_value=0 + ) + + # Create visualization + plt.figure(figsize=(15, 8)) + sns.heatmap(api_time_pivot, cmap='YlGnBu', annot=True, fmt='g') + plt.title('Distribution of Top 5 CUDA API Calls Over Time') + plt.xlabel('Relative Time Bins') + plt.ylabel('CUDA API Function') + plt.tight_layout() + plt.savefig(f"{self.output_dir}/api_time_heatmap.png", dpi=300) + plt.close() + + return timeline + + def analyze_memory_operations(self): + """Analyze memory-related operations""" + print("Analyzing memory operations...") + + # Filter for memory-related API calls + memory_ops = self.df[ + self.df['cuda_api_function'].str.contains('cudaMalloc|cudaFree|cudaMemcpy|cudaMemcpyAsync', na=False) + ].copy() + + if len(memory_ops) == 0: + print("No memory operations found in the trace data") + return None + + # Count by operation type + memory_op_counts = memory_ops['cuda_api_function'].value_counts().reset_index() + memory_op_counts.columns = ['Memory Operation', 'Count'] + + self.analysis_results['memory_operations'] = memory_op_counts.to_dict('records') + + # Create visualization + plt.figure(figsize=(12, 8)) + sns.barplot(x='Count', y='Memory Operation', 
data=memory_op_counts, palette='Oranges_r') + plt.title('Distribution of CUDA Memory Operations') + plt.tight_layout() + plt.savefig(f"{self.output_dir}/memory_operations.png", dpi=300) + plt.close() + + # Analyze memory operations over time if temporal data is available + if 'relative_time' in memory_ops.columns: + plt.figure(figsize=(15, 8)) + + # Group by time and operation type + memory_timeline = memory_ops.groupby(['relative_time', 'cuda_api_function']).size().reset_index() + memory_timeline.columns = ['Relative Time (s)', 'Memory Operation', 'Count'] + + # Plot + for op in memory_op_counts['Memory Operation'].unique(): + op_data = memory_timeline[memory_timeline['Memory Operation'] == op] + plt.plot(op_data['Relative Time (s)'], op_data['Count'], marker='o', linestyle='-', label=op) + + plt.title('Memory Operations Over Time') + plt.xlabel('Relative Time (seconds)') + plt.ylabel('Operation Count') + plt.legend() + plt.grid(True, alpha=0.3) + plt.tight_layout() + plt.savefig(f"{self.output_dir}/memory_operations_timeline.png", dpi=300) + plt.close() + + return memory_op_counts + + def analyze_stack_traces(self): + """Analyze stack trace patterns""" + print("Analyzing stack trace patterns...") + + # Extract call sites from stack traces + if 'stack_trace' not in self.df.columns: + print("No stack trace data available for analysis") + return None + + # Get the second frame in each stack trace (the caller of the CUDA API) + call_sites = [] + for stack in self.df['stack_trace']: + if isinstance(stack, list) and len(stack) > 1: + call_sites.append(stack[1]) + else: + call_sites.append('Unknown') + + self.df['call_site'] = call_sites + + # Count occurrences of each call site + call_site_counts = self.df['call_site'].value_counts().reset_index() + call_site_counts.columns = ['Call Site', 'Count'] + + self.analysis_results['call_site_distribution'] = call_site_counts.to_dict('records') + + # Create visualization + plt.figure(figsize=(12, 8)) + sns.barplot(x='Count', y='Call Site', data=call_site_counts.head(15), palette='rocket') + plt.title('Top 15 Call Sites for CUDA API Functions') + plt.tight_layout() + plt.savefig(f"{self.output_dir}/call_site_distribution.png", dpi=300) + plt.close() + + # Analyze relationship between call sites and API functions + call_site_api = self.df.groupby(['call_site', 'cuda_api_function']).size().reset_index() + call_site_api.columns = ['Call Site', 'CUDA API Function', 'Count'] + call_site_api = call_site_api.sort_values('Count', ascending=False).head(20) + + self.analysis_results['call_site_api_relationship'] = call_site_api.to_dict('records') + + # Create visualization + plt.figure(figsize=(15, 10)) + pivot_table = call_site_api.pivot_table( + values='Count', + index='Call Site', + columns='CUDA API Function', + fill_value=0 + ) + sns.heatmap(pivot_table, cmap='viridis', annot=True, fmt='g') + plt.title('Relationship Between Call Sites and CUDA API Functions') + plt.tight_layout() + plt.savefig(f"{self.output_dir}/call_site_api_heatmap.png", dpi=300) + plt.close() + + return call_site_counts + + def generate_summary(self): + """Generate a summary of the analysis results""" + print("Generating analysis summary...") + + summary = { + "total_trace_entries": len(self.df), + "unique_api_functions": self.df['cuda_api_function'].nunique(), + "top_api_functions": self.df['cuda_api_function'].value_counts().head(5).to_dict(), + } + + if 'kernel_name' in self.df.columns: + kernel_names = self.df['kernel_name'].dropna().unique() + summary["unique_kernels"] = 
len(kernel_names) + summary["kernel_names"] = kernel_names.tolist() + + if 'relative_time' in self.df.columns: + summary["trace_duration_seconds"] = self.df['relative_time'].max() + + self.analysis_results['summary'] = summary + + # Save summary to file + with open(f"{self.output_dir}/analysis_summary.json", 'w') as f: + json.dump(summary, f, indent=2) + + return summary + + def save_results(self): + """Save all analysis results to a JSON file""" + print("Saving analysis results...") + + # Generate summary if not already done + if 'summary' not in self.analysis_results: + self.generate_summary() + + # Save to file + with open(f"{self.output_dir}/analysis_results.json", 'w') as f: + json.dump(self.analysis_results, f, indent=2, default=str) + + print(f"Analysis results saved to {self.output_dir}/analysis_results.json") + + # Create a list of all generated files + generated_files = [ + f"{self.output_dir}/analysis_results.json", + f"{self.output_dir}/analysis_summary.json" + ] + + for file in os.listdir(self.output_dir): + if file.endswith('.png'): + generated_files.append(f"{self.output_dir}/{file}") + + return generated_files + +if __name__ == "__main__": + # Parse command line arguments + parser = argparse.ArgumentParser(description="CUDA Trace Analyzer") + parser.add_argument("data_path", help="Path to the parsed trace data JSON file") + parser.add_argument("--output_dir", help="Output directory for analysis results") + + args = parser.parse_args() + + # Initialize and run the analyzer + analyzer = CUDATraceAnalyzer(args.data_path, args.output_dir) + analyzer.load_data() + analyzer.analyze() + analyzer.generate_summary() + analyzer.save_results() + + print("Analysis complete. Results saved to output directory.") diff --git a/llm-analysis/cuda_trace_parser.py b/llm-analysis/cuda_trace_parser.py new file mode 100644 index 0000000..16cd2cf --- /dev/null +++ b/llm-analysis/cuda_trace_parser.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python3 +""" +CUDA Trace Parser - Parses CUDA trace files generated by BPF programs +""" + +import re +import pandas as pd +import json +from datetime import datetime +import matplotlib.pyplot as plt +import seaborn as sns +from collections import defaultdict, Counter +import numpy as np +import os +import argparse +import sys + +class CUDATraceParser: + """Parser for CUDA trace files generated by BPF programs""" + + def __init__(self, trace_file_path): + """Initialize the parser with the path to the trace file""" + self.trace_file_path = trace_file_path + self.entries = [] + self.df = None + self.start_time = None + + def parse(self): + """Parse the trace file and extract structured data""" + print(f"Parsing trace file: {self.trace_file_path}") + + with open(self.trace_file_path, 'r') as f: + content = f.read() + + # Extract the start time + start_time_match = re.search(r'Started profiling at (.+)', content) + if start_time_match: + self.start_time = datetime.strptime(start_time_match.group(1), '%a %b %d %H:%M:%S %Y') + + # Split the content by the separator + entries_raw = content.split('-' * 80) + + # Skip the header information + entries_raw = [entry for entry in entries_raw if '[TIMESTAMP]' in entry] + + for entry in entries_raw: + parsed_entry = self._parse_entry(entry) + if parsed_entry: + self.entries.append(parsed_entry) + + print(f"Parsed {len(self.entries)} trace entries") + + # Convert to DataFrame for easier analysis + self.df = pd.DataFrame(self.entries) + + # Convert timestamp to datetime and calculate relative time + if 'timestamp' in self.df.columns: + 
self.df['datetime'] = pd.to_datetime(self.df['timestamp']) + if self.start_time: + self.df['relative_time'] = (self.df['datetime'] - pd.to_datetime(self.start_time)).dt.total_seconds() + else: + # If no start time, use the first timestamp as reference + first_time = self.df['datetime'].min() + self.df['relative_time'] = (self.df['datetime'] - first_time).dt.total_seconds() + + return self.df + + def _parse_entry(self, entry): + """Parse a single trace entry""" + if not entry.strip(): + return None + + result = {} + + # Extract timestamp + timestamp_match = re.search(r'\[TIMESTAMP\] (.+)', entry) + if timestamp_match: + result['timestamp'] = timestamp_match.group(1) + + # Extract process info + process_match = re.search(r'\[PROCESS\] (.+) \[(\d+)\] CUDA API EventType (\d+)', entry) + if process_match: + result['process_name'] = process_match.group(1) + result['pid'] = int(process_match.group(2)) + result['event_type'] = int(process_match.group(3)) + + # Extract CUDA API call type + cuda_call_match = re.search(r'\[(CUDA_[A-Z_]+)\]', entry) + if cuda_call_match: + result['cuda_call'] = cuda_call_match.group(1) + + # Extract grid and block dimensions for kernel launches + if 'cuda_call' in result and result['cuda_call'] == 'CUDA_LAUNCH_KERNEL': + grid_match = re.search(r'Grid: \(([^)]+)\)', entry) + block_match = re.search(r'Block: \(([^)]+)\)', entry) + + if grid_match: + grid_dims = grid_match.group(1).split(',') + result['grid_x'] = int(grid_dims[0]) if grid_dims[0].strip().isdigit() else grid_dims[0].strip() + result['grid_y'] = int(grid_dims[1]) if grid_dims[1].strip().isdigit() else grid_dims[1].strip() + result['grid_z'] = int(grid_dims[2]) if grid_dims[2].strip().isdigit() else grid_dims[2].strip() + + if block_match: + block_dims = block_match.group(1).split(',') + result['block_x'] = int(block_dims[0]) if block_dims[0].strip().isdigit() else block_dims[0].strip() + result['block_y'] = int(block_dims[1]) if block_dims[1].strip().isdigit() else block_dims[1].strip() + result['block_z'] = int(block_dims[2]) if block_dims[2].strip().isdigit() else block_dims[2].strip() + + # Extract arguments + args_match = re.search(r'\[ARGS\] (.+)', entry) + if args_match and args_match.group(1).strip(): + result['args'] = args_match.group(1).strip() + + # Try to extract kernel name from args + kernel_name_match = re.match(r'^(\w+)', result['args']) + if kernel_name_match: + result['kernel_name'] = kernel_name_match.group(1) + + # Extract stack trace + stack_trace_match = re.search(r'\[STACK_TRACE\]\s+([\s\S]+?)(?=\Z|^\[)', entry, re.MULTILINE) + if stack_trace_match: + stack_trace = stack_trace_match.group(1).strip().split('\n') + result['stack_trace'] = [frame.strip() for frame in stack_trace if frame.strip()] + + # Extract the CUDA API function from the stack trace + if result['stack_trace']: + result['cuda_api_function'] = result['stack_trace'][0].strip() + + return result + + def save_to_json(self, output_path): + """Save the parsed data to a JSON file""" + if not self.entries: + print("No entries to save. 
Run parse() first.") + return + + # Convert DataFrame to JSON + with open(output_path, 'w') as f: + # Convert to records format for better readability + json.dump(self.df.to_dict(orient='records'), f, indent=2, default=str) + + print(f"Saved parsed data to {output_path}") + +if __name__ == "__main__": + # Parse command line arguments + parser = argparse.ArgumentParser(description="CUDA Trace Parser") + parser.add_argument("trace_file", help="Path to the CUDA trace file") + parser.add_argument("--output", default="parsed_trace.json", help="Output JSON file path") + + args = parser.parse_args() + + # Use the provided trace file path + trace_parser = CUDATraceParser(args.trace_file) + df = trace_parser.parse() + trace_parser.save_to_json(args.output) + print(f"Parsing complete. Data saved to {args.output}") diff --git a/llm-analysis/cuda_visualization_organizer.py b/llm-analysis/cuda_visualization_organizer.py new file mode 100644 index 0000000..b65b371 --- /dev/null +++ b/llm-analysis/cuda_visualization_organizer.py @@ -0,0 +1,584 @@ +#!/usr/bin/env python3 +""" +CUDA Trace Visualization Organizer - Enhances and organizes visualizations from CUDA trace analysis +""" + +import os +import matplotlib.pyplot as plt +import matplotlib.gridspec as gridspec +import numpy as np +from PIL import Image +import json +import pandas as pd +import seaborn as sns +from matplotlib.colors import LinearSegmentedColormap +import argparse +import sys +from datetime import datetime # Added missing import for datetime + +class CUDAVisualizationOrganizer: + """Organizes and enhances visualizations from CUDA trace analysis""" + + def __init__(self, analysis_dir, output_dir=None): + """Initialize the organizer with the path to the analysis results directory""" + # Convert to absolute paths to avoid path resolution issues + self.analysis_dir = os.path.abspath(analysis_dir) + + # Use provided output directory or default to a subdirectory of analysis_dir + if output_dir: + self.output_dir = os.path.abspath(output_dir) + else: + self.output_dir = os.path.join(self.analysis_dir, "enhanced") + + # Create output directory if it doesn't exist + if not os.path.exists(self.output_dir): + os.makedirs(self.output_dir) + + print(f"Visualization enhancements will be saved to: {self.output_dir}") + + # Load analysis results + self.analysis_results = self._load_analysis_results() + self.summary = self._load_summary() + + def _load_analysis_results(self): + """Load the analysis results from JSON file""" + results_path = os.path.join(self.analysis_dir, "analysis_results.json") + if os.path.exists(results_path): + with open(results_path, 'r') as f: + return json.load(f) + print(f"Warning: Analysis results not found at {results_path}") + return {} + + def _load_summary(self): + """Load the analysis summary from JSON file""" + summary_path = os.path.join(self.analysis_dir, "analysis_summary.json") + if os.path.exists(summary_path): + with open(summary_path, 'r') as f: + return json.load(f) + print(f"Warning: Analysis summary not found at {summary_path}") + return {} + + def create_dashboard(self): + """Create a comprehensive dashboard of visualizations""" + print("Creating visualization dashboard...") + + # Get list of all PNG files in the analysis directory + png_files = [f for f in os.listdir(self.analysis_dir) if f.endswith('.png')] + + if not png_files: + print("No visualization files found in the analysis directory") + return None + + # Create a figure with subplots for each visualization + fig = plt.figure(figsize=(20, 25)) + 
fig.suptitle("CUDA Trace Analysis Dashboard", fontsize=24, y=0.98) + + # Add summary text at the top + summary_text = self._generate_summary_text() + fig.text(0.5, 0.95, summary_text, ha='center', va='top', fontsize=14, + bbox=dict(boxstyle="round,pad=0.5", facecolor='lightgray', alpha=0.5)) + + # Create a grid layout + gs = gridspec.GridSpec(4, 2, figure=fig, hspace=0.4, wspace=0.3) + + # Organize visualizations by category + api_viz = [f for f in png_files if 'api_distribution' in f] + kernel_viz = [f for f in png_files if 'kernel' in f] + memory_viz = [f for f in png_files if 'memory' in f] + timeline_viz = [f for f in png_files if 'timeline' in f or 'time' in f] + stack_viz = [f for f in png_files if 'stack' in f or 'call_site' in f] + + # Add visualizations to the dashboard + self._add_visualization(fig, gs[0, 0], api_viz, "API Distribution") + self._add_visualization(fig, gs[0, 1], kernel_viz, "Kernel Analysis") + self._add_visualization(fig, gs[1, 0], memory_viz, "Memory Operations") + self._add_visualization(fig, gs[1, 1], timeline_viz, "Temporal Analysis") + self._add_visualization(fig, gs[2, 0], stack_viz, "Stack Trace Analysis") + + # Add custom visualizations + self._create_summary_chart(fig, gs[2, 1]) + self._create_api_proportion_chart(fig, gs[3, 0]) + self._create_performance_insights_chart(fig, gs[3, 1]) + + # Save the dashboard + dashboard_path = os.path.join(self.output_dir, "cuda_trace_dashboard.png") + plt.savefig(dashboard_path, dpi=300, bbox_inches='tight') + plt.close(fig) + + print(f"Dashboard saved to {dashboard_path}") + return dashboard_path + + def _add_visualization(self, fig, grid_pos, viz_files, title): + """Add a visualization to the dashboard""" + if not viz_files: + # Create empty subplot with message + ax = fig.add_subplot(grid_pos) + ax.text(0.5, 0.5, f"No {title} visualizations available", + ha='center', va='center', fontsize=14) + ax.axis('off') + return + + # Use the first visualization in the category + viz_path = os.path.join(self.analysis_dir, viz_files[0]) + + try: + # Load the image + img = plt.imread(viz_path) + + # Create subplot + ax = fig.add_subplot(grid_pos) + ax.imshow(img) + ax.set_title(title, fontsize=16) + ax.axis('off') + except Exception as e: + print(f"Error adding visualization {viz_path}: {e}") + ax = fig.add_subplot(grid_pos) + ax.text(0.5, 0.5, f"Error loading {title} visualization", + ha='center', va='center', fontsize=14) + ax.axis('off') + + def _generate_summary_text(self): + """Generate a summary text from the analysis results""" + if not self.summary: + return "No summary data available" + + total_entries = self.summary.get('total_trace_entries', 'Unknown') + unique_apis = self.summary.get('unique_api_functions', 'Unknown') + unique_kernels = self.summary.get('unique_kernels', 'Unknown') + duration = self.summary.get('trace_duration_seconds', 'Unknown') + + summary_text = ( + f"Summary: {total_entries} trace entries analyzed over {duration} seconds\n" + f"Unique API Functions: {unique_apis} | Unique Kernels: {unique_kernels}" + ) + + return summary_text + + def _create_summary_chart(self, fig, grid_pos): + """Create a summary chart showing the distribution of API types""" + ax = fig.add_subplot(grid_pos) + + if not self.analysis_results or 'api_distribution' not in self.analysis_results: + ax.text(0.5, 0.5, "No API distribution data available", + ha='center', va='center', fontsize=14) + ax.axis('off') + return + + # Categorize API functions + api_data = pd.DataFrame(self.analysis_results['api_distribution']) + + # Define 
categories + categories = { + 'Kernel Execution': ['cudaLaunchKernel', 'cudaLaunchCooperativeKernel'], + 'Memory Operations': ['cudaMalloc', 'cudaFree', 'cudaMemcpy', 'cudaMemcpyAsync'], + 'Synchronization': ['cudaStreamSynchronize', 'cudaDeviceSynchronize', 'cudaEventSynchronize'], + 'Stream Management': ['cudaStreamCreate', 'cudaStreamDestroy'], + 'Event Management': ['cudaEventRecord', 'cudaEventElapsedTime'], + 'Other': [] + } + + # Categorize each API function + api_categories = {} + for api in api_data['API Function']: + categorized = False + for category, apis in categories.items(): + if api in apis: + api_categories[api] = category + categorized = True + break + if not categorized: + api_categories[api] = 'Other' + + # Add category to dataframe + api_data['Category'] = api_data['API Function'].map(api_categories) + + # Aggregate by category + category_counts = api_data.groupby('Category')['Count'].sum().reset_index() + + # Create pie chart + colors = plt.cm.tab10(np.linspace(0, 1, len(category_counts))) + wedges, texts, autotexts = ax.pie( + category_counts['Count'], + labels=category_counts['Category'], + autopct='%1.1f%%', + startangle=90, + colors=colors + ) + + # Style the chart + plt.setp(autotexts, size=10, weight='bold') + ax.set_title('Distribution of CUDA API Calls by Category', fontsize=16) + + return ax + + def _create_api_proportion_chart(self, fig, grid_pos): + """Create a chart showing the proportion of different API calls""" + ax = fig.add_subplot(grid_pos) + + if not self.analysis_results or 'api_distribution' not in self.analysis_results: + ax.text(0.5, 0.5, "No API distribution data available", + ha='center', va='center', fontsize=14) + ax.axis('off') + return + + # Get API distribution data + api_data = pd.DataFrame(self.analysis_results['api_distribution']) + + # Calculate total count + total_count = api_data['Count'].sum() + + # Calculate percentage + api_data['Percentage'] = (api_data['Count'] / total_count) * 100 + + # Sort by percentage + api_data = api_data.sort_values('Percentage', ascending=False) + + # Take top 10 and group the rest as "Other" + if len(api_data) > 10: + top_10 = api_data.iloc[:10] + other = pd.DataFrame({ + 'API Function': ['Other'], + 'Count': [api_data.iloc[10:]['Count'].sum()], + 'Percentage': [api_data.iloc[10:]['Percentage'].sum()] + }) + api_data = pd.concat([top_10, other]) + + # Create horizontal bar chart + colors = plt.cm.viridis(np.linspace(0, 1, len(api_data))) + bars = ax.barh(api_data['API Function'], api_data['Percentage'], color=colors) + + # Add percentage labels + for bar in bars: + width = bar.get_width() + label_x_pos = width if width > 1 else width + 0.5 + ax.text(label_x_pos, bar.get_y() + bar.get_height()/2, f'{width:.1f}%', + va='center', fontsize=10) + + # Style the chart + ax.set_xlabel('Percentage of Total API Calls', fontsize=12) + ax.set_title('Proportion of CUDA API Calls', fontsize=16) + ax.grid(axis='x', alpha=0.3) + + return ax + + def _create_performance_insights_chart(self, fig, grid_pos): + """Create a chart with performance insights""" + ax = fig.add_subplot(grid_pos) + + # Check if we have memory operations data + if not self.analysis_results or 'memory_operations' not in self.analysis_results: + ax.text(0.5, 0.5, "No memory operations data available for performance insights", + ha='center', va='center', fontsize=14) + ax.axis('off') + return + + # Get memory operations data + memory_data = pd.DataFrame(self.analysis_results['memory_operations']) + + # Create a text box with performance insights + 
insights = [ + "Performance Insights:", + "------------------------" + ] + + # Add insights based on available data + if 'memory_operations' in self.analysis_results: + mem_ops = {op['Memory Operation']: op['Count'] for op in self.analysis_results['memory_operations']} + + # Check memory copy vs compute ratio + if 'cudaMemcpy' in mem_ops and 'cudaLaunchKernel' in self.summary.get('top_api_functions', {}): + memcpy_count = mem_ops.get('cudaMemcpy', 0) + mem_ops.get('cudaMemcpyAsync', 0) + kernel_count = self.summary.get('top_api_functions', {}).get('cudaLaunchKernel', 0) + + if kernel_count > 0: + ratio = memcpy_count / kernel_count + insights.append(f"β€’ Memory Copy to Kernel Launch Ratio: {ratio:.2f}") + + if ratio > 1: + insights.append(" ⚠️ High memory transfer overhead detected") + insights.append(" πŸ’‘ Consider using pinned memory or CUDA streams") + else: + insights.append(" βœ“ Good balance between memory transfers and computation") + + # Check synchronization frequency + sync_count = 0 + for op in ['cudaStreamSynchronize', 'cudaDeviceSynchronize', 'cudaEventSynchronize']: + if op in self.summary.get('top_api_functions', {}): + sync_count += self.summary.get('top_api_functions', {}).get(op, 0) + + if sync_count > 0: + insights.append(f"β€’ Synchronization Operations: {sync_count}") + if sync_count > 20: + insights.append(" ⚠️ High synchronization frequency detected") + insights.append(" πŸ’‘ Consider reducing synchronization points") + else: + insights.append(" βœ“ Reasonable synchronization frequency") + + # Add kernel launch insights + if 'kernel_name_distribution' in self.analysis_results: + kernel_data = pd.DataFrame(self.analysis_results['kernel_name_distribution']) + if not kernel_data.empty: + insights.append(f"β€’ Most frequently launched kernel: {kernel_data.iloc[0]['Kernel Name']}") + insights.append(f" Launch count: {kernel_data.iloc[0]['Count']}") + + # Add grid/block dimension insights + if 'grid_block_dimensions' in self.analysis_results: + grid_block = pd.DataFrame(self.analysis_results['grid_block_dimensions']) + if not grid_block.empty: + grid_x = grid_block[(grid_block['Dimension'] == 'grid_x') & (grid_block['Value'] != 0)] + block_x = grid_block[(grid_block['Dimension'] == 'block_x') & (grid_block['Value'] != 0)] + + if not grid_x.empty and not block_x.empty: + insights.append(f"β€’ Common grid/block configuration:") + insights.append(f" Grid: ({grid_x.iloc[0]['Value']}, y, z), Block: ({block_x.iloc[0]['Value']}, y, z)") + + # Add occupancy hint + if isinstance(block_x.iloc[0]['Value'], (int, float)) and block_x.iloc[0]['Value'] < 128: + insights.append(" ⚠️ Small block size may lead to low GPU occupancy") + insights.append(" πŸ’‘ Consider increasing threads per block (ideal: 128-256)") + + # Add timeline insights + if 'temporal_distribution' in self.analysis_results: + insights.append("β€’ Temporal execution pattern:") + insights.append(" βœ“ Consistent API call distribution over time") + + # Create the text box + props = dict(boxstyle='round', facecolor='lightblue', alpha=0.5) + ax.text(0.05, 0.95, '\n'.join(insights), transform=ax.transAxes, fontsize=12, + verticalalignment='top', bbox=props, linespacing=1.5) + + ax.set_title('Performance Insights', fontsize=16) + ax.axis('off') + + return ax + + def enhance_individual_visualizations(self): + """Enhance individual visualizations with better styling and annotations""" + print("Enhancing individual visualizations...") + + # Get list of all PNG files in the analysis directory + png_files = [f for f in 
os.listdir(self.analysis_dir) if f.endswith('.png')] + + enhanced_files = [] + + for png_file in png_files: + input_path = os.path.join(self.analysis_dir, png_file) + output_path = os.path.join(self.output_dir, f"enhanced_{png_file}") + + try: + # Load the image + img = Image.open(input_path) + + # Save with higher quality + img.save(output_path, dpi=(300, 300)) + + enhanced_files.append(output_path) + print(f"Enhanced {png_file}") + except Exception as e: + print(f"Error enhancing {png_file}: {e}") + + return enhanced_files + + def create_html_report(self): + """Create an HTML report with all visualizations and insights""" + print("Creating HTML report...") + + # Get list of all PNG files in the analysis directory + png_files = [f for f in os.listdir(self.analysis_dir) if f.endswith('.png')] + + # Create HTML content + html_content = f""" + + + + CUDA Trace Analysis Report + + + +
+            <h1>CUDA Trace Analysis Report</h1>
+
+            <h2>Summary</h2>
+            <p>Total trace entries: {self.summary.get('total_trace_entries', 'Unknown')}</p>
+            <p>Unique API functions: {self.summary.get('unique_api_functions', 'Unknown')}</p>
+            <p>Unique kernels: {self.summary.get('unique_kernels', 'Unknown')}</p>
+            <p>Trace duration: {self.summary.get('trace_duration_seconds', 'Unknown')} seconds</p>
+
+            <h2>Dashboard</h2>
+            <img src="cuda_trace_dashboard.png" alt="CUDA Trace Analysis Dashboard">
+
+            <h2>Individual Visualizations</h2>
+ """ + + # Add individual visualizations + for png_file in png_files: + html_content += f""" +
+                <h3>{png_file.replace('.png', '').replace('_', ' ').title()}</h3>
+                <img src="{png_file}" alt="{png_file}">
+ """ + + # Add enhanced visualizations + enhanced_files = [f for f in os.listdir(self.output_dir) if f.endswith('.png') and f != "cuda_trace_dashboard.png"] + if enhanced_files: + html_content += f""" +
+            <h2>Enhanced Visualizations</h2>
+ """ + + for enhanced_file in enhanced_files: + html_content += f""" +
+                <h3>{enhanced_file.replace('.png', '').replace('_', ' ').title()}</h3>
+                <img src="{enhanced_file}" alt="{enhanced_file}">
+ """ + + # Add API distribution table + if 'api_distribution' in self.analysis_results: + html_content += f""" +
+            <h2>API Distribution</h2>
+            <table>
+                <tr><th>API Function</th><th>Count</th></tr>
+            """
+
+            for api in self.analysis_results['api_distribution']:
+                html_content += f"""
+                <tr><td>{api['API Function']}</td><td>{api['Count']}</td></tr>
+                """
+
+            html_content += """
+            </table>
+ """ + + # Close HTML + html_content += """ +
+ + + """ + + # Save HTML report + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + report_dir = os.path.join(self.output_dir, "html_report") + if not os.path.exists(report_dir): + os.makedirs(report_dir) + + report_path = os.path.join(report_dir, f"cuda_trace_analysis_report_{timestamp}.html") + with open(report_path, 'w') as f: + f.write(html_content) + + print(f"HTML report saved to {report_path}") + return report_path + + def run(self): + """Run all visualization enhancement and organization tasks""" + print("Running visualization organizer...") + + # Create dashboard + dashboard_path = self.create_dashboard() + + # Enhance individual visualizations + enhanced_files = self.enhance_individual_visualizations() + + # Create HTML report + report_path = self.create_html_report() + + print("Visualization organization complete.") + return { + "dashboard": dashboard_path, + "enhanced_files": enhanced_files, + "report": report_path + } + +if __name__ == "__main__": + # Parse command line arguments + parser = argparse.ArgumentParser(description="CUDA Trace Visualization Organizer") + parser.add_argument("analysis_dir", help="Path to the analysis results directory") + parser.add_argument("--output_dir", help="Output directory for enhanced visualizations") + + args = parser.parse_args() + + # Initialize and run the organizer + organizer = CUDAVisualizationOrganizer(args.analysis_dir, args.output_dir) + results = organizer.run() + + print("Visualization organization complete.") diff --git a/llm-analysis/enhanced_cuda_llm_analyzer.py b/llm-analysis/enhanced_cuda_llm_analyzer.py new file mode 100644 index 0000000..ba5ade8 --- /dev/null +++ b/llm-analysis/enhanced_cuda_llm_analyzer.py @@ -0,0 +1,1647 @@ +#!/usr/bin/env python3 +""" +Enhanced CUDA Trace LLM Analyzer - Integrates LLM analysis with visualizations and tables +""" + +import os +import json +import base64 +import requests +from PIL import Image +import io +import argparse +import sys +import time +from datetime import datetime +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns +import numpy as np +from jinja2 import Template, Environment, FileSystemLoader +from openai import OpenAI + +class EnhancedCUDATraceLLMAnalyzer: + """Integrates LLM analysis with visualizations and tables for CUDA trace data""" + + def __init__(self, analysis_dir, enhanced_dir=None): + """Initialize the enhanced LLM analyzer with the path to the analysis results directory""" + self.analysis_dir = analysis_dir + self.enhanced_dir = enhanced_dir or os.path.join(analysis_dir, "enhanced") + self.output_dir = os.path.join(analysis_dir, "llm_analysis") + self.html_output_dir = os.path.join(analysis_dir, "html_report") + + # Create output directories if they don't exist + for directory in [self.output_dir, self.html_output_dir]: + if not os.path.exists(directory): + os.makedirs(directory) + + # Load analysis results + self.analysis_results = self._load_analysis_results() + self.summary = self._load_summary() + + # Initialize section analyses + self.section_analyses = {} + + def _load_analysis_results(self): + """Load the analysis results from JSON file""" + results_path = os.path.join(self.analysis_dir, "analysis_results.json") + if os.path.exists(results_path): + with open(results_path, 'r') as f: + return json.load(f) + return {} + + def _load_summary(self): + """Load the analysis summary from JSON file""" + summary_path = os.path.join(self.analysis_dir, "analysis_summary.json") + if os.path.exists(summary_path): + with open(summary_path, 
'r') as f: + return json.load(f) + return {} + + def _encode_image(self, image_path): + """Encode an image to base64 for LLM API""" + with open(image_path, "rb") as image_file: + return base64.b64encode(image_file.read()).decode('utf-8') + + def _generate_tables(self): + """Generate HTML and Markdown tables from analysis results""" + tables = {} + + # API Distribution Table + if 'api_distribution' in self.analysis_results: + api_data = pd.DataFrame(self.analysis_results['api_distribution']) + if not api_data.empty: + # Calculate percentage + total_calls = api_data['Count'].sum() + api_data['Percentage'] = (api_data['Count'] / total_calls * 100).round(2) + + # Format as HTML and Markdown + api_html = api_data.head(10).to_html(index=False, classes="data-table") + api_md = api_data.head(10).to_markdown(index=False) + + tables['api_distribution'] = { + 'html': api_html, + 'markdown': api_md, + 'data': api_data.head(10).to_dict('records') + } + + # Kernel Launch Table + if 'kernel_name_distribution' in self.analysis_results: + kernel_data = pd.DataFrame(self.analysis_results['kernel_name_distribution']) + if not kernel_data.empty: + # Calculate percentage + total_launches = kernel_data['Count'].sum() + kernel_data['Percentage'] = (kernel_data['Count'] / total_launches * 100).round(2) + + # Format as HTML and Markdown + kernel_html = kernel_data.head(10).to_html(index=False, classes="data-table") + kernel_md = kernel_data.head(10).to_markdown(index=False) + + tables['kernel_distribution'] = { + 'html': kernel_html, + 'markdown': kernel_md, + 'data': kernel_data.head(10).to_dict('records') + } + + # Memory Operations Table + if 'memory_operations' in self.analysis_results: + memory_data = pd.DataFrame(self.analysis_results['memory_operations']) + if not memory_data.empty: + # Calculate percentage + total_ops = memory_data['Count'].sum() + memory_data['Percentage'] = (memory_data['Count'] / total_ops * 100).round(2) + + # Format as HTML and Markdown + memory_html = memory_data.to_html(index=False, classes="data-table") + memory_md = memory_data.to_markdown(index=False) + + tables['memory_operations'] = { + 'html': memory_html, + 'markdown': memory_md, + 'data': memory_data.to_dict('records') + } + + # Grid/Block Dimensions Table + if 'grid_block_dimensions' in self.analysis_results: + grid_block_data = pd.DataFrame(self.analysis_results['grid_block_dimensions']) + if not grid_block_data.empty: + # Pivot the data for better presentation + pivot_data = grid_block_data.pivot_table( + index='Dimension', + columns='Value', + values='Count', + fill_value=0 + ).reset_index() + + # Format as HTML and Markdown + grid_block_html = pivot_data.to_html(classes="data-table") + grid_block_md = pivot_data.to_markdown() + + tables['grid_block_dimensions'] = { + 'html': grid_block_html, + 'markdown': grid_block_md, + 'data': grid_block_data.to_dict('records') + } + + # Performance Bottlenecks Table (derived from analysis) + bottlenecks = [] + + # Check for memory transfer bottlenecks + if 'memory_operations' in self.analysis_results: + memory_ops = pd.DataFrame(self.analysis_results['memory_operations']) + memcpy_ops = memory_ops[memory_ops['Memory Operation'].str.contains('cudaMemcpy', na=False)] + if not memcpy_ops.empty: + memcpy_count = memcpy_ops['Count'].sum() + + # Check if memcpy operations are a significant portion of all operations + if 'api_distribution' in self.analysis_results: + api_data = pd.DataFrame(self.analysis_results['api_distribution']) + total_ops = api_data['Count'].sum() + memcpy_percentage 
= (memcpy_count / total_ops) * 100 + + if memcpy_percentage > 20: # Threshold for bottleneck + bottlenecks.append({ + 'Bottleneck': 'Memory Transfer Overhead', + 'Metric': f'{memcpy_percentage:.2f}% of operations', + 'Severity': 'High' if memcpy_percentage > 30 else 'Medium', + 'Recommendation': 'Use pinned memory, batch transfers, or keep data on GPU longer' + }) + + # Check for synchronization bottlenecks + sync_ops = ['cudaStreamSynchronize', 'cudaDeviceSynchronize', 'cudaEventSynchronize'] + sync_count = 0 + + if 'api_distribution' in self.analysis_results: + api_data = pd.DataFrame(self.analysis_results['api_distribution']) + for op in sync_ops: + sync_op = api_data[api_data['API Function'] == op] + if not sync_op.empty: + sync_count += sync_op.iloc[0]['Count'] + + if sync_count > 0: + total_ops = api_data['Count'].sum() + sync_percentage = (sync_count / total_ops) * 100 + + if sync_percentage > 5: # Threshold for bottleneck + bottlenecks.append({ + 'Bottleneck': 'Excessive Synchronization', + 'Metric': f'{sync_count} sync operations ({sync_percentage:.2f}%)', + 'Severity': 'High' if sync_percentage > 10 else 'Medium', + 'Recommendation': 'Reduce synchronization points, use multiple streams for parallelism' + }) + + # Check for small kernel launches + if 'grid_block_dimensions' in self.analysis_results: + grid_block_data = pd.DataFrame(self.analysis_results['grid_block_dimensions']) + block_x = grid_block_data[(grid_block_data['Dimension'] == 'block_x') & (grid_block_data['Value'] != 0)] + + if not block_x.empty and isinstance(block_x.iloc[0]['Value'], (int, float)): + block_size = block_x.iloc[0]['Value'] + if block_size < 128: + bottlenecks.append({ + 'Bottleneck': 'Low GPU Occupancy', + 'Metric': f'Block size: {block_size} threads', + 'Severity': 'Medium', + 'Recommendation': 'Increase threads per block (ideal: 128-256)' + }) + + # Create bottlenecks table + if bottlenecks: + bottlenecks_df = pd.DataFrame(bottlenecks) + bottlenecks_html = bottlenecks_df.to_html(index=False, classes="data-table") + bottlenecks_md = bottlenecks_df.to_markdown(index=False) + + tables['performance_bottlenecks'] = { + 'html': bottlenecks_html, + 'markdown': bottlenecks_md, + 'data': bottlenecks + } + + return tables + + def _prepare_section_prompts(self): + """Prepare prompts for each section of the analysis""" + tables = self._generate_tables() + prompts = {} + + # Overview Section Prompt + overview_prompt = """ + # CUDA Trace Analysis - Overview + + I need a comprehensive overview of the CUDA trace data provided. The data comes from a BPF program that traces various CUDA API routines. + + ## Analysis Task + + Please provide a high-level executive summary of the CUDA trace data that includes: + + 1. The main characteristics of the application based on its CUDA API usage + 2. The most significant patterns observed in the trace data + 3. The key performance considerations identified + 4. 
A brief assessment of the application's CUDA implementation quality + + ## Available Data + + """ + + # Add summary information + if self.summary: + overview_prompt += "### Summary Statistics\n\n" + overview_prompt += f"- Total trace entries: {self.summary.get('total_trace_entries', 'Unknown')}\n" + overview_prompt += f"- Unique API functions: {self.summary.get('unique_api_functions', 'Unknown')}\n" + overview_prompt += f"- Unique kernels: {self.summary.get('unique_kernels', 'Unknown')}\n" + overview_prompt += f"- Trace duration: {self.summary.get('trace_duration_seconds', 'Unknown')} seconds\n\n" + + prompts['overview'] = overview_prompt + + # API Distribution Section Prompt + api_prompt = """ + # CUDA Trace Analysis - API Distribution + + I need a detailed analysis of the CUDA API distribution in the trace data. + + ## Analysis Task + + Please analyze the API distribution data and provide: + + 1. Insights into the most frequently used CUDA API functions and what they indicate about the application + 2. Analysis of the balance between different types of operations (compute, memory, synchronization) + 3. Identification of any unusual or inefficient API usage patterns + 4. Recommendations for optimizing the API usage + + ## Available Data + + """ + + # Add API distribution information + if 'api_distribution' in tables: + api_prompt += "### API Distribution Table\n\n" + api_prompt += tables['api_distribution']['markdown'] + "\n\n" + + # Reference to visualization + api_prompt += "### API Distribution Visualization\n\n" + api_prompt += "The API distribution visualization (Figure 1) shows the frequency of different CUDA API calls.\n\n" + + prompts['api_distribution'] = api_prompt + + # Memory Operations Section Prompt + memory_prompt = """ + # CUDA Trace Analysis - Memory Operations + + I need a detailed analysis of the memory operations in the CUDA trace data. + + ## Analysis Task + + Please analyze the memory operations data and provide: + + 1. Assessment of the memory transfer patterns and their efficiency + 2. Analysis of the balance between different types of memory operations + 3. Identification of potential memory-related bottlenecks + 4. Recommendations for optimizing memory usage and transfers + + ## Available Data + + """ + + # Add memory operations information + if 'memory_operations' in tables: + memory_prompt += "### Memory Operations Table\n\n" + memory_prompt += tables['memory_operations']['markdown'] + "\n\n" + + # Reference to visualization + memory_prompt += "### Memory Operations Visualization\n\n" + memory_prompt += "The memory operations visualization (Figure 2) shows the distribution of memory-related operations.\n\n" + + prompts['memory_operations'] = memory_prompt + + # Kernel Launch Section Prompt + kernel_prompt = """ + # CUDA Trace Analysis - Kernel Launches + + I need a detailed analysis of the kernel launch patterns in the CUDA trace data. + + ## Analysis Task + + Please analyze the kernel launch data and provide: + + 1. Assessment of the kernel launch patterns and their implications for performance + 2. Analysis of the grid and block dimensions used for kernel launches + 3. Evaluation of kernel occupancy and efficiency based on the launch parameters + 4. 
Recommendations for optimizing kernel launch configurations + + ## Available Data + + """ + + # Add kernel distribution information + if 'kernel_distribution' in tables: + kernel_prompt += "### Kernel Distribution Table\n\n" + kernel_prompt += tables['kernel_distribution']['markdown'] + "\n\n" + + # Add grid/block dimensions information + if 'grid_block_dimensions' in tables: + kernel_prompt += "### Grid/Block Dimensions Table\n\n" + kernel_prompt += tables['grid_block_dimensions']['markdown'] + "\n\n" + + # Reference to visualization + kernel_prompt += "### Kernel Launch Visualization\n\n" + kernel_prompt += "The kernel launch visualization (Figure 3) shows the distribution of kernel launches.\n\n" + + prompts['kernel_launches'] = kernel_prompt + + # Performance Bottlenecks Section Prompt + bottlenecks_prompt = """ + # CUDA Trace Analysis - Performance Bottlenecks + + I need a detailed analysis of the performance bottlenecks identified in the CUDA trace data. + + ## Analysis Task + + Please analyze the performance bottlenecks data and provide: + + 1. Detailed explanation of each identified bottleneck and its impact on performance + 2. Root cause analysis for each bottleneck + 3. Prioritized recommendations for addressing each bottleneck + 4. Potential performance gains from implementing the recommendations + + ## Available Data + + """ + + # Add bottlenecks information + if 'performance_bottlenecks' in tables: + bottlenecks_prompt += "### Performance Bottlenecks Table\n\n" + bottlenecks_prompt += tables['performance_bottlenecks']['markdown'] + "\n\n" + + # Reference to visualization + bottlenecks_prompt += "### Performance Timeline Visualization\n\n" + bottlenecks_prompt += "The performance timeline visualization (Figure 4) shows the API calls over time, which can help identify bottlenecks.\n\n" + + prompts['performance_bottlenecks'] = bottlenecks_prompt + + # Optimization Recommendations Section Prompt + optimization_prompt = """ + # CUDA Trace Analysis - Optimization Recommendations + + I need detailed optimization recommendations based on the CUDA trace data analysis. + + ## Analysis Task + + Please provide comprehensive optimization recommendations that include: + + 1. Specific code-level optimizations with examples where possible + 2. Architectural changes to improve performance + 3. Alternative approaches or CUDA features that could be leveraged + 4. Prioritization of recommendations based on expected impact + + ## Available Data + + """ + + # Add summary of findings from other sections + optimization_prompt += "### Summary of Findings\n\n" + optimization_prompt += "Based on the analysis of the trace data, please provide optimization recommendations that address the identified issues.\n\n" + + # Reference to dashboard + optimization_prompt += "### CUDA Trace Dashboard\n\n" + optimization_prompt += "The CUDA trace dashboard (Figure 5) provides an overview of all aspects of the trace data.\n\n" + + prompts['optimization_recommendations'] = optimization_prompt + + return prompts, tables + + def analyze_with_openai(self, api_key=None): + """Analyze the trace data using OpenAI's API with enhanced visualization integration""" + if not api_key: + api_key = os.environ.get("OPENAI_API_KEY") + if not api_key: + print("Error: OpenAI API key not provided. 
Please set the OPENAI_API_KEY environment variable or provide it as an argument.") + return None + + print("Analyzing trace data with OpenAI's API (enhanced visualization integration)...") + + # Initialize the OpenAI client + client = OpenAI(api_key=api_key) + + # Prepare section prompts and tables + section_prompts, tables = self._prepare_section_prompts() + + # Get list of visualization images + dashboard_path = os.path.join(self.enhanced_dir, "cuda_trace_dashboard.png") + api_dist_path = os.path.join(self.analysis_dir, "api_distribution.png") + kernel_path = os.path.join(self.analysis_dir, "kernel_name_distribution.png") + memory_path = os.path.join(self.analysis_dir, "memory_operations.png") + timeline_path = os.path.join(self.analysis_dir, "api_call_timeline.png") + + # Encode images + encoded_images = {} + for img_id, img_path in [ + ("dashboard", dashboard_path), + ("api_distribution", api_dist_path), + ("kernel_distribution", kernel_path), + ("memory_operations", memory_path), + ("timeline", timeline_path) + ]: + if os.path.exists(img_path): + try: + encoded_images[img_id] = self._encode_image(img_path) + except Exception as e: + print(f"Error encoding image {img_path}: {e}") + + # Analyze each section + for section, prompt in section_prompts.items(): + print(f"Analyzing section: {section}") + + # Prepare the message content + content = [ + { + "type": "text", + "text": prompt + } + ] + + # Add relevant images for this section + if section == 'overview' and 'dashboard' in encoded_images: + content.append({ + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{encoded_images['dashboard']}" + } + }) + elif section == 'api_distribution' and 'api_distribution' in encoded_images: + content.append({ + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{encoded_images['api_distribution']}" + } + }) + elif section == 'memory_operations' and 'memory_operations' in encoded_images: + content.append({ + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{encoded_images['memory_operations']}" + } + }) + elif section == 'kernel_launches' and 'kernel_distribution' in encoded_images: + content.append({ + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{encoded_images['kernel_distribution']}" + } + }) + + try: + # Make the API request using the OpenAI client + response = client.chat.completions.create( + model="gpt-4o", + messages=[ + { + "role": "user", + "content": content + } + ], + max_tokens=4000 + ) + + # Parse the response + if response.choices and len(response.choices) > 0: + analysis = response.choices[0].message.content + + # Save the section analysis + self.section_analyses[section] = analysis + + # Save the analysis to a file + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + output_path = os.path.join(self.output_dir, f"llm_analysis_{section}_{timestamp}.md") + with open(output_path, "w") as f: + f.write(analysis) + + print(f"Analysis for section '{section}' saved to {output_path}") + else: + print(f"Error: Unexpected response format from OpenAI API for section '{section}'") + + except Exception as e: + print(f"Error calling OpenAI API for section '{section}': {e}") + + # Generate the combined analysis + self._generate_combined_analysis() + + # Generate the HTML report + self._generate_html_report(tables, encoded_images) + + return self.section_analyses + + def analyze_with_local_llm(self, model_endpoint="http://localhost:8000/v1/chat/completions"): + """Analyze the trace data using a local LLM API 
endpoint with enhanced visualization integration""" + print(f"Analyzing trace data with local LLM at {model_endpoint} (enhanced visualization integration)...") + + # Prepare section prompts and tables + section_prompts, tables = self._prepare_section_prompts() + + # Analyze each section + for section, prompt in section_prompts.items(): + print(f"Analyzing section: {section}") + + # Prepare the API request + headers = { + "Content-Type": "application/json" + } + + # Prepare the API request payload + payload = { + "messages": [ + { + "role": "user", + "content": prompt + } + ], + "max_tokens": 4000 + } + + try: + # Make the API request + response = requests.post( + model_endpoint, + headers=headers, + json=payload + ) + + # Check for errors + response.raise_for_status() + + # Parse the response + result = response.json() + + if "choices" in result and len(result["choices"]) > 0: + analysis = result["choices"][0]["message"]["content"] + + # Save the section analysis + self.section_analyses[section] = analysis + + # Save the analysis to a file + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + output_path = os.path.join(self.output_dir, f"llm_analysis_{section}_{timestamp}.md") + with open(output_path, "w") as f: + f.write(analysis) + + print(f"Analysis for section '{section}' saved to {output_path}") + else: + print(f"Error: Unexpected response format from local LLM API for section '{section}'") + + except Exception as e: + print(f"Error calling local LLM API for section '{section}': {e}") + + # Generate the combined analysis + self._generate_combined_analysis() + + # Generate the HTML report + self._generate_html_report(tables, {}) + + return self.section_analyses + + def generate_mock_analysis(self): + """Generate a mock analysis for testing purposes""" + print("Generating mock analysis...") + + # Prepare section prompts and tables + section_prompts, tables = self._prepare_section_prompts() + + # Generate mock analysis for each section + for section in section_prompts.keys(): + print(f"Generating mock analysis for section: {section}") + + # Generate a mock analysis based on the section + if section == 'overview': + analysis = self._generate_mock_overview() + elif section == 'api_distribution': + analysis = self._generate_mock_api_distribution() + elif section == 'memory_operations': + analysis = self._generate_mock_memory_operations() + elif section == 'kernel_launches': + analysis = self._generate_mock_kernel_launches() + elif section == 'performance_bottlenecks': + analysis = self._generate_mock_performance_bottlenecks() + elif section == 'optimization_recommendations': + analysis = self._generate_mock_optimization_recommendations() + else: + analysis = f"# Mock Analysis for {section}\n\nThis is a placeholder for the {section} analysis." 
+ + # Save the section analysis + self.section_analyses[section] = analysis + + # Save the analysis to a file + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + output_path = os.path.join(self.output_dir, f"llm_analysis_{section}_{timestamp}.md") + with open(output_path, "w") as f: + f.write(analysis) + + print(f"Mock analysis for section '{section}' saved to {output_path}") + + # Generate the combined analysis + self._generate_combined_analysis() + + # Generate the HTML report + self._generate_html_report(tables, {}) + + return self.section_analyses + + def _generate_mock_overview(self): + """Generate a mock overview analysis""" + return """# CUDA Trace Analysis Overview + +## Executive Summary + +Based on the CUDA trace data provided, this application appears to be a **compute-intensive application with significant memory transfer operations**. The trace reveals a pattern of kernel launches interspersed with memory operations, suggesting a batch processing workflow where data is transferred to the GPU, processed, and then results are transferred back to the host. + +## Key Characteristics + +1. **Balanced API Usage**: The application shows a relatively balanced distribution of CUDA API calls across memory management, kernel execution, and synchronization operations. + +2. **Regular Kernel Launch Pattern**: The kernel launch pattern suggests a structured, iterative computation approach, likely processing data in batches or chunks. + +3. **Memory Transfer Overhead**: A significant portion of the trace is dedicated to memory transfer operations, indicating potential for optimization in this area. + +4. **Synchronization Points**: The trace shows regular synchronization points, which may be limiting the potential for overlapping operations. + +## Performance Considerations + +- **Memory Transfer Bottleneck**: The high frequency of memory transfer operations suggests this could be a performance bottleneck. +- **Synchronization Overhead**: Regular synchronization points may be limiting parallelism and overall performance. +- **Kernel Efficiency**: Some kernels show suboptimal grid and block dimensions, potentially underutilizing the GPU. + +## Implementation Quality Assessment + +The CUDA implementation appears to be **functionally correct but with room for optimization**. The structured approach suggests a methodical implementation, but the frequency of memory transfers and synchronization points indicates that the application may not be fully leveraging CUDA's parallel processing capabilities. + +Overall, this appears to be a **moderate-quality CUDA implementation** that could benefit from targeted optimizations to reduce memory transfer overhead and increase parallelism. +""" + + def _generate_mock_api_distribution(self): + """Generate a mock API distribution analysis""" + return """# CUDA API Distribution Analysis + +## Key Insights + +The API distribution reveals a **compute-focused application with significant memory management overhead**. The most frequently used API functions provide valuable insights into the application's behavior and potential optimization opportunities. + +## Most Frequently Used API Functions + +1. **cudaLaunchKernel (32.5%)**: The high frequency of kernel launches indicates a compute-intensive application. This is expected for a CUDA application, as the primary purpose is to offload computation to the GPU. + +2. **cudaMemcpy (18.7%)**: The significant proportion of memory copy operations suggests substantial data transfer between host and device. 
This could be a performance bottleneck, especially if these transfers are not overlapped with computation. + +3. **cudaMalloc (12.3%)**: The frequency of memory allocation calls indicates that the application may be allocating memory frequently during execution rather than pre-allocating at initialization. + +4. **cudaStreamSynchronize (8.5%)**: The presence of stream synchronization calls suggests the application is using CUDA streams, but the relatively high frequency may indicate excessive synchronization. + +5. **cudaFree (7.9%)**: The proportion of memory deallocation calls matches closely with allocation calls, suggesting good memory management practices. + +## Operation Type Balance + +- **Compute Operations**: ~35% (primarily cudaLaunchKernel) +- **Memory Operations**: ~40% (cudaMemcpy, cudaMalloc, cudaFree) +- **Synchronization Operations**: ~15% (cudaStreamSynchronize, cudaDeviceSynchronize) +- **Other Operations**: ~10% (configuration, device management, etc.) + +This distribution shows a relatively balanced application with a slight emphasis on memory operations, which is common in many CUDA applications but may indicate optimization opportunities. + +## Inefficient API Usage Patterns + +1. **Frequent Memory Allocation/Deallocation**: The high frequency of cudaMalloc and cudaFree calls suggests memory is being allocated and deallocated frequently, which is inefficient. Consider pooling or pre-allocating memory. + +2. **Excessive Synchronization**: The proportion of synchronization calls is relatively high, potentially limiting parallelism and performance. + +3. **High Memory Transfer Volume**: The significant proportion of cudaMemcpy calls indicates substantial data movement, which is often a performance bottleneck in CUDA applications. + +## Optimization Recommendations + +1. **Implement Memory Pooling**: Replace frequent cudaMalloc/cudaFree calls with a memory pool to reuse allocated memory. + +2. **Reduce Synchronization Points**: Minimize cudaStreamSynchronize calls by restructuring the application to allow more asynchronous operation. + +3. **Batch Memory Transfers**: Combine smaller memory transfers into larger batches to reduce overhead. + +4. **Use Pinned Memory**: For frequent host-device transfers, use pinned memory to improve transfer speeds. + +5. **Implement Asynchronous Memory Copies**: Use cudaMemcpyAsync with streams to overlap memory transfers with computation. + +By addressing these API usage patterns, the application could potentially achieve significant performance improvements, particularly in reducing memory management overhead and increasing parallelism. +""" + + def _generate_mock_memory_operations(self): + """Generate a mock memory operations analysis""" + return """# Memory Operations Analysis + +## Memory Transfer Patterns + +The trace data reveals several distinct memory transfer patterns that significantly impact the application's performance: + +1. **Bulk Data Transfers**: Large, infrequent transfers (typically at the beginning and end of major processing phases) + - These account for approximately 45% of total memory transfer volume + - Average transfer size: ~128MB + +2. **Regular Small Transfers**: Frequent, small transfers throughout execution + - These account for approximately 35% of total memory transfer volume + - Average transfer size: ~256KB + - Occur at regular intervals, suggesting iterative processing + +3. 
**Sporadic Medium Transfers**: Occasional medium-sized transfers + - These account for approximately 20% of total memory transfer volume + - Average transfer size: ~4MB + - Irregular pattern, possibly related to dynamic data requirements + +## Memory Operation Efficiency + +The memory operation efficiency shows several areas of concern: + +1. **Direction Imbalance**: Host-to-device transfers (cudaMemcpyHostToDevice) account for 65% of all transfers, while device-to-host transfers (cudaMemcpyDeviceToHost) account for 35%. This imbalance suggests that the application may be transferring more data to the GPU than necessary. + +2. **Transfer Size Distribution**: The prevalence of small transfers (< 1MB) indicates potential inefficiency, as each transfer incurs overhead regardless of size. + +3. **Temporal Clustering**: Memory operations show clustering in time, with periods of intense memory activity followed by computation. This sequential pattern suggests limited overlap between memory transfers and computation. + +## Memory-Related Bottlenecks + +Based on the analysis, the following memory-related bottlenecks have been identified: + +1. **Small Transfer Overhead**: The frequent small transfers incur significant overhead, potentially limiting overall throughput. + - Impact: Estimated 15-20% performance penalty + - Root cause: Granular data handling instead of batched processing + +2. **Synchronous Memory Operations**: Most memory operations appear to be synchronous, blocking the CPU and preventing overlap with computation. + - Impact: Estimated 10-15% performance penalty + - Root cause: Not utilizing asynchronous memory operations and CUDA streams effectively + +3. **Repeated Transfers**: Some data appears to be transferred multiple times between host and device. + - Impact: Estimated 5-10% redundant transfer volume + - Root cause: Possibly poor data locality or caching strategy + +## Optimization Recommendations + +To improve memory operation efficiency, consider the following recommendations: + +1. **Batch Small Transfers**: Combine small, frequent transfers into larger batches to reduce overhead. + - Implementation: Buffer data on the host side and transfer in larger chunks + - Expected impact: 10-15% reduction in memory transfer time + +2. **Use Asynchronous Memory Operations**: Implement cudaMemcpyAsync with CUDA streams to overlap memory transfers with computation. + - Implementation: Restructure code to use multiple streams and asynchronous transfers + - Expected impact: 15-20% overall performance improvement + +3. **Implement Pinned Memory**: Use pinned (page-locked) host memory for frequent transfers to improve bandwidth. + - Implementation: Replace standard malloc with cudaHostAlloc for transfer buffers + - Expected impact: 20-30% improvement in memory transfer speed + +4. **Reduce Host-Device Transfers**: Keep data on the GPU for longer periods to minimize transfers. + - Implementation: Restructure algorithms to maximize data reuse on the GPU + - Expected impact: 10-15% reduction in total transfer volume + +5. **Consider Unified Memory**: For appropriate workloads, evaluate using CUDA Unified Memory to simplify memory management. + - Implementation: Replace explicit memory management with unified memory allocations + - Expected impact: Simplified code and potentially improved performance for certain access patterns + +By implementing these recommendations, the application could significantly reduce memory-related bottlenecks and improve overall performance. 
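For reference, a minimal sketch of recommendations 2 and 3 combined (illustrative only; `process_kernel`, the buffer size, and the single stream are placeholders rather than code from the traced application):

```cuda
// Sketch: pinned host memory + cudaMemcpyAsync on a stream, so the host thread
// only blocks at the final synchronize instead of at every transfer.
#include <cuda_runtime.h>
#include <cstdio>

__global__ void process_kernel(float *data, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] *= 2.0f;            // stand-in for real work
}

int main() {
    const int n = 1 << 20;
    const size_t bytes = n * sizeof(float);

    float *h_data = nullptr, *d_data = nullptr;
    cudaHostAlloc(&h_data, bytes, cudaHostAllocDefault);   // pinned (page-locked) host buffer
    cudaMalloc(&d_data, bytes);
    for (int i = 0; i < n; ++i) h_data[i] = 1.0f;

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Copies and the kernel are queued on the same stream and execute in order,
    // but none of these calls blocks the host.
    cudaMemcpyAsync(d_data, h_data, bytes, cudaMemcpyHostToDevice, stream);
    process_kernel<<<(n + 255) / 256, 256, 0, stream>>>(d_data, n);
    cudaMemcpyAsync(h_data, d_data, bytes, cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);

    printf("h_data[0] = %f\n", h_data[0]);

    cudaStreamDestroy(stream);
    cudaFree(d_data);
    cudaFreeHost(h_data);
    return 0;
}
```

With two or more streams and the input split into chunks, the same pattern lets the device-to-host copy of one chunk overlap the kernel of the next, which is where most of the projected 15-20% gain would come from.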
+""" + + def _generate_mock_kernel_launches(self): + """Generate a mock kernel launches analysis""" + return """# Kernel Launch Analysis + +## Kernel Launch Patterns + +The trace data reveals several distinct kernel launch patterns that characterize the application's execution profile: + +1. **Primary Computation Kernels**: A set of 3-4 frequently launched kernels that appear to form the core computational workflow + - These account for approximately 65% of all kernel launches + - Launched in a consistent sequence, suggesting a pipeline or iterative algorithm + - Example kernels: `matrixMultiply`, `vectorAdd`, `dataTransform` + +2. **Preprocessing/Postprocessing Kernels**: Less frequent kernels that appear to handle data preparation and result processing + - These account for approximately 20% of all kernel launches + - Typically launched before and after the main computation sequence + - Example kernels: `dataPreprocess`, `resultNormalize` + +3. **Utility Kernels**: Infrequently launched kernels for specialized operations + - These account for approximately 15% of all kernel launches + - Irregular launch pattern, suggesting on-demand usage + - Example kernels: `errorCheck`, `memoryInitialize` + +## Grid and Block Dimensions Analysis + +The grid and block dimensions used for kernel launches show several patterns: + +1. **Block Size Distribution**: + - Most common block size: 256 threads (128 Γ— 1 Γ— 2) + - Range: 64 to 512 threads per block + - Observation: Block sizes are generally power-of-two values, which is good practice + +2. **Grid Size Distribution**: + - Highly variable grid sizes, ranging from small (10-20 blocks) to very large (10,000+ blocks) + - Primary computation kernels tend to use larger grid sizes + - Utility kernels typically use smaller grid sizes + +3. **Dimension Utilization**: + - Primarily 2D grid configurations (x and y dimensions) + - Block configurations predominantly 1D or 2D + - Limited use of 3D configurations, suggesting the application is not processing volumetric data + +## Kernel Occupancy and Efficiency + +Based on the launch configurations, the following observations about kernel occupancy and efficiency can be made: + +1. **Occupancy Concerns**: + - Some kernels use block sizes of 64 or 128 threads, which may lead to suboptimal occupancy on modern GPUs + - The primary computation kernels generally use appropriate block sizes (256 threads) for good occupancy + +2. **Execution Efficiency**: + - The consistent use of power-of-two block sizes suggests awareness of warp-based execution + - Some grid configurations may lead to imbalanced workloads across SMs (Streaming Multiprocessors) + +3. **Resource Utilization**: + - Without detailed kernel code, it's difficult to assess register and shared memory usage + - The launch patterns suggest compute-bound rather than memory-bound kernels + +## Optimization Recommendations + +To improve kernel launch efficiency, consider the following recommendations: + +1. **Optimize Block Sizes**: + - Increase block sizes for kernels currently using 64 or 128 threads to 256 threads where possible + - Implementation: Adjust kernel launch parameters + - Expected impact: 10-15% improvement in occupancy for affected kernels + +2. **Dynamic Grid Sizing**: + - Implement dynamic grid sizing based on problem dimensions and available GPU resources + - Implementation: Calculate optimal grid dimensions at runtime based on device properties + - Expected impact: Better load balancing and potentially 5-10% performance improvement + +3. 
**Kernel Fusion**: + - Consider combining some of the sequential kernels in the main computation pipeline + - Implementation: Merge compatible kernels to reduce launch overhead and improve data locality + - Expected impact: Reduced kernel launch overhead and potentially 10-20% performance improvement for the affected sequence + +4. **Persistent Threads**: + - For iterative algorithms, evaluate using persistent threads to reduce kernel launch overhead + - Implementation: Restructure kernels to process multiple iterations within a single launch + - Expected impact: Reduced launch overhead, potentially 5-10% improvement for iterative sections + +5. **Explore 3D Block Configurations**: + - For appropriate algorithms, consider 3D block configurations to better match data access patterns + - Implementation: Restructure thread indexing to utilize 3D block dimensions + - Expected impact: Improved memory access patterns, potentially 5-15% performance improvement for spatial algorithms + +By implementing these recommendations, the application could achieve better GPU utilization and improved overall performance through more efficient kernel execution. +""" + + def _generate_mock_performance_bottlenecks(self): + """Generate a mock performance bottlenecks analysis""" + return """# Performance Bottlenecks Analysis + +## Identified Bottlenecks + +Based on the trace data analysis, the following performance bottlenecks have been identified, listed in order of severity: + +### 1. Memory Transfer Overhead (High Severity) + +**Description**: Excessive time spent transferring data between host and device memory. + +**Impact**: Approximately 35% of total execution time is spent on memory transfers, making this the most significant bottleneck. + +**Root Cause**: The application performs frequent, small memory transfers instead of batching them. Additionally, most transfers appear to be synchronous, blocking further execution until completion. + +**Recommendations**: +- Batch small transfers into larger chunks +- Use asynchronous memory transfers (cudaMemcpyAsync) +- Implement pinned memory for faster transfer speeds +- Keep data on the GPU for longer periods to reduce transfer frequency + +**Expected Performance Gain**: 15-25% reduction in overall execution time. + +### 2. Excessive Synchronization (Medium Severity) + +**Description**: Frequent synchronization points that limit parallelism and prevent overlapping operations. + +**Impact**: Approximately 12% of total execution time is spent waiting at synchronization points. + +**Root Cause**: The application uses cudaStreamSynchronize and cudaDeviceSynchronize calls frequently, often after each kernel launch or memory operation. + +**Recommendations**: +- Reduce synchronization frequency by grouping operations +- Use multiple CUDA streams to enable operation overlap +- Implement asynchronous execution patterns +- Only synchronize when results are actually needed + +**Expected Performance Gain**: 8-12% reduction in overall execution time. + +### 3. Suboptimal Kernel Launch Configuration (Medium Severity) + +**Description**: Some kernels are launched with grid and block dimensions that do not maximize GPU utilization. + +**Impact**: Affected kernels show approximately 30-40% lower throughput than optimal. + +**Root Cause**: Block sizes are often too small (64 or 128 threads) for modern GPUs, and grid dimensions don't always balance work evenly across streaming multiprocessors. 
+ +**Recommendations**: +- Increase block sizes to 256-512 threads where possible +- Implement dynamic grid sizing based on problem size and GPU capabilities +- Consider kernel fusion for frequently launched sequential kernels + +**Expected Performance Gain**: 5-10% reduction in overall execution time. + +### 4. Frequent Memory Allocation/Deallocation (Medium Severity) + +**Description**: Excessive time spent on cudaMalloc and cudaFree operations during execution. + +**Impact**: Approximately 8% of total execution time is spent on memory management. + +**Root Cause**: The application allocates and deallocates device memory frequently instead of reusing previously allocated buffers. + +**Recommendations**: +- Implement a memory pool to reuse allocated memory +- Pre-allocate memory at initialization when possible +- Use persistent allocations for iterative processing + +**Expected Performance Gain**: 3-8% reduction in overall execution time. + +### 5. Uncoalesced Memory Access Patterns (Low Severity) + +**Description**: Some kernels appear to have suboptimal memory access patterns, reducing memory throughput. + +**Impact**: Affected kernels show approximately 20-30% lower memory throughput than optimal. + +**Root Cause**: Based on the kernel launch patterns and memory operation distribution, it appears that some kernels may not be accessing memory in a coalesced manner. + +**Recommendations**: +- Restructure data layout to improve memory coalescing +- Adjust thread indexing to match memory access patterns +- Consider using shared memory for frequently accessed data + +**Expected Performance Gain**: 2-5% reduction in overall execution time. + +## Cumulative Impact + +If all bottlenecks were addressed, the application could potentially see a **30-45% reduction in overall execution time**. The most significant gains would come from addressing the memory transfer overhead and excessive synchronization issues. + +## Implementation Priority + +Based on the potential performance gains and implementation complexity, the recommended implementation priority is: + +1. Memory Transfer Optimization (highest impact, moderate complexity) +2. Synchronization Reduction (high impact, low complexity) +3. Kernel Launch Configuration Optimization (medium impact, low complexity) +4. Memory Pool Implementation (medium impact, medium complexity) +5. Memory Access Pattern Optimization (lower impact, high complexity) + +This prioritization maximizes the performance improvement while considering the implementation effort required. +""" + + def _generate_mock_optimization_recommendations(self): + """Generate a mock optimization recommendations analysis""" + return """# Optimization Recommendations + +## Executive Summary + +Based on the comprehensive analysis of the CUDA trace data, this application would benefit significantly from optimizations focused on memory management, kernel execution efficiency, and increased parallelism. The recommendations below are prioritized by expected impact and implementation feasibility. + +## High-Priority Optimizations + +### 1. Implement Asynchronous Memory Operations with CUDA Streams + +**Description**: Replace synchronous memory operations with asynchronous ones and use multiple CUDA streams to overlap memory transfers with computation. 
+
+**Implementation**:
+```cuda
+// Instead of:
+cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice);
+kernel<<<gridSize, blockSize>>>(d_data, ...);
+cudaMemcpy(h_result, d_result, size, cudaMemcpyDeviceToHost);
+
+// Use:
+cudaStream_t stream1, stream2;
+cudaStreamCreate(&stream1);
+cudaStreamCreate(&stream2);
+
+// Overlap transfers and computation
+cudaMemcpyAsync(d_data1, h_data1, size/2, cudaMemcpyHostToDevice, stream1);
+cudaMemcpyAsync(d_data2, h_data2, size/2, cudaMemcpyHostToDevice, stream2);
+kernel<<<gridSize, blockSize, 0, stream1>>>(d_data1, ...);
+kernel<<<gridSize, blockSize, 0, stream2>>>(d_data2, ...);
+cudaMemcpyAsync(h_result1, d_result1, size/2, cudaMemcpyDeviceToHost, stream1);
+cudaMemcpyAsync(h_result2, d_result2, size/2, cudaMemcpyDeviceToHost, stream2);
+
+// Only synchronize when necessary
+cudaStreamSynchronize(stream1);
+cudaStreamSynchronize(stream2);
+```
+
+**Expected Impact**: 15-25% overall performance improvement by overlapping memory transfers with computation.
+
+### 2. Implement Memory Pooling
+
+**Description**: Replace frequent cudaMalloc/cudaFree calls with a memory pool that reuses previously allocated memory.
+
+**Implementation**:
+```cuda
+// Simple memory pool implementation
+class CudaMemoryPool {
+private:
+    std::map<size_t, std::vector<void*>> free_memory;
+
+public:
+    void* allocate(size_t size) {
+        if (!free_memory[size].empty()) {
+            void* ptr = free_memory[size].back();
+            free_memory[size].pop_back();
+            return ptr;
+        }
+        void* ptr;
+        cudaMalloc(&ptr, size);
+        return ptr;
+    }
+
+    void deallocate(void* ptr, size_t size) {
+        free_memory[size].push_back(ptr);
+    }
+
+    void release() {
+        for (auto& pair : free_memory) {
+            for (void* ptr : pair.second) {
+                cudaFree(ptr);
+            }
+        }
+        free_memory.clear();
+    }
+};
+```
+
+**Expected Impact**: 5-10% performance improvement by reducing memory allocation overhead.
+
+### 3. Use Pinned Host Memory for Frequent Transfers
+
+**Description**: Allocate host memory as pinned (page-locked) for data that is frequently transferred between host and device.
+
+**Implementation**:
+```cuda
+// Instead of:
+float* h_data = (float*)malloc(size);
+
+// Use:
+float* h_data;
+cudaHostAlloc((void**)&h_data, size, cudaHostAllocDefault);
+
+// When done:
+cudaFreeHost(h_data);
+```
+
+**Expected Impact**: 20-30% faster memory transfers, resulting in 5-15% overall performance improvement.
+
+## Medium-Priority Optimizations
+
+### 4. Optimize Kernel Launch Configurations
+
+**Description**: Adjust grid and block dimensions to maximize GPU occupancy and efficiency.
+
+**Implementation**:
+```cuda
+// Instead of fixed dimensions:
+kernel<<<fixedGridSize, fixedBlockSize>>>(args...);
+
+// Calculate optimal dimensions:
+int blockSize = 256; // Typically 256 or 512 for compute-bound kernels
+int minGridSize;
+int optimalBlockSize;
+cudaOccupancyMaxPotentialBlockSize(&minGridSize, &optimalBlockSize, kernel, 0, 0);
+int gridSize = (n + optimalBlockSize - 1) / optimalBlockSize;
+
+kernel<<<gridSize, optimalBlockSize>>>(args...);
+```
+
+**Expected Impact**: 5-10% performance improvement for compute-bound kernels.
+
+### 5. Implement Kernel Fusion
+
+**Description**: Combine sequential kernels that operate on the same data to reduce kernel launch overhead and improve data locality.
+
+**Implementation**:
+```cuda
+// Instead of:
+kernelA<<<gridSize, blockSize>>>(d_data, ...);
+kernelB<<<gridSize, blockSize>>>(d_data, ...);
+
+// Create a fused kernel:
+__global__ void fusedKernel(float* data, ...) {
+    // Code from kernelA
+    ...
+
+    // Ensure all threads in the block complete kernelA operations
+    // (block-level barrier only; fusion is valid when kernelB needs data produced within the same block)
+    __syncthreads();
+
+    // Code from kernelB
+    ...
+}
+
+fusedKernel<<<gridSize, blockSize>>>(d_data, ...);
+```
+
+**Expected Impact**: 3-8% performance improvement by reducing kernel launch overhead and improving data locality.
+
+## Lower-Priority Optimizations
+
+### 6. Implement Persistent Threads for Iterative Algorithms
+
+**Description**: Use persistent threads to process multiple iterations within a single kernel launch for iterative algorithms.
+
+**Implementation**:
+```cuda
+__global__ void persistentKernel(float* data, int numIterations, ...) {
+    // Thread setup
+    ...
+
+    for (int iter = 0; iter < numIterations; iter++) {
+        // Process one iteration
+        ...
+
+        // Synchronize threads between iterations
+        __syncthreads();
+    }
+}
+```
+
+**Expected Impact**: 2-5% performance improvement for highly iterative algorithms.
+
+### 7. Optimize Memory Access Patterns
+
+**Description**: Restructure data layout and access patterns to improve memory coalescing and reduce bank conflicts.
+
+**Implementation**:
+```cuda
+// For 2D data, consider using pitched memory:
+cudaPitchedPtr d_pitchedData;
+cudaMalloc3D(&d_pitchedData, make_cudaExtent(width * sizeof(float), height, 1));
+
+// Access with stride awareness:
+__global__ void optimizedKernel(cudaPitchedPtr data, ...) {
+    char* row = (char*)data.ptr + blockIdx.y * data.pitch;
+    float* element = (float*)(row + threadIdx.x * sizeof(float));
+    // Now element points to the correct position with proper alignment
+}
+```
+
+**Expected Impact**: 3-7% performance improvement for memory-bound kernels.
+
+## Architectural Recommendations
+
+### 8. Consider Unified Memory for Complex Data Structures
+
+**Description**: For applications with complex data structures and irregular access patterns, consider using CUDA Unified Memory to simplify memory management.
+
+**Implementation**:
+```cuda
+// Instead of explicit memory management:
+MyStruct* h_data = new MyStruct[n];
+MyStruct* d_data;
+cudaMalloc(&d_data, n * sizeof(MyStruct));
+cudaMemcpy(d_data, h_data, n * sizeof(MyStruct), cudaMemcpyHostToDevice);
+
+// Use unified memory:
+MyStruct* data;
+cudaMallocManaged(&data, n * sizeof(MyStruct));
+// Now 'data' can be accessed from both host and device code
+```
+
+**Expected Impact**: Simplified code and potentially improved performance for certain access patterns, though may not be faster in all cases.
+
+### 9. Evaluate Multi-GPU Parallelism
+
+**Description**: For large-scale computations, consider distributing work across multiple GPUs.
+
+**Implementation**:
+```cuda
+int numGpus;
+cudaGetDeviceCount(&numGpus);
+
+for (int i = 0; i < numGpus; i++) {
+    cudaSetDevice(i);
+    // Allocate device memory for this GPU
+    cudaMalloc(&d_data[i], size / numGpus);
+    // Launch kernels on this GPU
+    kernel<<<gridSize, blockSize>>>(d_data[i], ...);
+}
+
+// Synchronize and gather results
+for (int i = 0; i < numGpus; i++) {
+    cudaSetDevice(i);
+    cudaDeviceSynchronize();
+    // Copy results back
+}
+```
+
+**Expected Impact**: Near-linear speedup with the number of GPUs for compute-bound applications with minimal inter-GPU communication.
+
+## Conclusion
+
+By implementing these optimizations, particularly the high-priority ones, this CUDA application could achieve a **30-45% overall performance improvement**. The most significant gains would come from better memory management and increased parallelism through asynchronous operations and CUDA streams.
+
+The recommendations are designed to be implemented incrementally, allowing for validation of performance improvements at each step.
Start with the high-priority optimizations for the best return on implementation effort. +""" + + def _generate_combined_analysis(self): + """Generate a combined analysis from all section analyses""" + if not self.section_analyses: + print("Error: No section analyses available to combine.") + return None + + print("Generating combined analysis...") + + # Create the combined analysis + combined_analysis = "# CUDA Trace Analysis - Combined Report\n\n" + + # Add a table of contents + combined_analysis += "## Table of Contents\n\n" + for section in self.section_analyses.keys(): + section_title = section.replace('_', ' ').title() + combined_analysis += f"- [{section_title}](#{section.lower().replace('_', '-')})\n" + combined_analysis += "\n" + + # Add each section + for section, analysis in self.section_analyses.items(): + section_title = section.replace('_', ' ').title() + combined_analysis += f"## {section_title}\n\n" + + # Remove the first heading from the section analysis (if it exists) + # to avoid duplicate headings + section_content = analysis + if section_content.startswith('#'): + section_content = '\n'.join(section_content.split('\n')[1:]) + + combined_analysis += section_content + "\n\n" + + # Add a timestamp + combined_analysis += f"\n\n---\n\nGenerated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n" + + # Save the combined analysis + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + output_path = os.path.join(self.output_dir, f"llm_analysis_combined_{timestamp}.md") + with open(output_path, "w") as f: + f.write(combined_analysis) + + print(f"Combined analysis saved to {output_path}") + + return combined_analysis + + def _generate_html_report(self, tables, encoded_images): + """Generate an HTML report with integrated visualizations and analysis""" + if not self.section_analyses: + print("Error: No section analyses available for HTML report.") + return None + + print("Generating HTML report...") + + # Create a basic HTML template + html_template = """ + + + + + + CUDA Trace Analysis Report + + + + +

CUDA Trace Analysis Report

+ + + + +
+

CUDA Trace Dashboard

+ {% if dashboard_image %} + CUDA Trace Dashboard + {% else %} +

Dashboard visualization not available.

+ {% endif %} +
+ + +
+

Overview

+
+
+ {{ overview_analysis|safe }} +
+
+ {% if dashboard_image %} + CUDA Trace Dashboard + {% endif %} +
+
+
+ + +
+

API Distribution

+
+
+ {{ api_distribution_analysis|safe }} + + {% if api_distribution_table %} + +
+ {{ api_distribution_table|safe }} +
+ {% endif %} +
+
+ {% if api_distribution_image %} + API Distribution + {% endif %} +
+
+
+ + +
+

Memory Operations

+
+
+ {{ memory_operations_analysis|safe }} + + {% if memory_operations_table %} + +
+ {{ memory_operations_table|safe }} +
+ {% endif %} +
+
+ {% if memory_operations_image %} + Memory Operations + {% endif %} +
+
+
+ + +
+

Kernel Launches

+
+
+ {{ kernel_launches_analysis|safe }} + + {% if kernel_distribution_table %} + +
+ {{ kernel_distribution_table|safe }} +
+ {% endif %} + + {% if grid_block_dimensions_table %} + +
+ {{ grid_block_dimensions_table|safe }} +
+ {% endif %} +
+
+ {% if kernel_distribution_image %} + Kernel Distribution + {% endif %} +
+
+
+ + +
+

Performance Bottlenecks

+
+
+ {{ performance_bottlenecks_analysis|safe }} + + {% if performance_bottlenecks_table %} + +
+ {{ performance_bottlenecks_table|safe }} +
+ {% endif %} +
+
+ {% if timeline_image %} + API Call Timeline + {% endif %} +
+
+
+ + +
+

Optimization Recommendations

+ {{ optimization_recommendations_analysis|safe }} +
+ +
+ Generated on: {{ timestamp }} +
+ + + """ + + # Prepare the template context + context = { + 'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'), + 'dashboard_image': encoded_images.get('dashboard', ''), + 'api_distribution_image': encoded_images.get('api_distribution', ''), + 'memory_operations_image': encoded_images.get('memory_operations', ''), + 'kernel_distribution_image': encoded_images.get('kernel_distribution', ''), + 'timeline_image': encoded_images.get('timeline', ''), + 'overview_analysis': self._markdown_to_html(self.section_analyses.get('overview', 'Analysis not available.')), + 'api_distribution_analysis': self._markdown_to_html(self.section_analyses.get('api_distribution', 'Analysis not available.')), + 'memory_operations_analysis': self._markdown_to_html(self.section_analyses.get('memory_operations', 'Analysis not available.')), + 'kernel_launches_analysis': self._markdown_to_html(self.section_analyses.get('kernel_launches', 'Analysis not available.')), + 'performance_bottlenecks_analysis': self._markdown_to_html(self.section_analyses.get('performance_bottlenecks', 'Analysis not available.')), + 'optimization_recommendations_analysis': self._markdown_to_html(self.section_analyses.get('optimization_recommendations', 'Analysis not available.')), + 'api_distribution_table': tables.get('api_distribution', {}).get('html', ''), + 'memory_operations_table': tables.get('memory_operations', {}).get('html', ''), + 'kernel_distribution_table': tables.get('kernel_distribution', {}).get('html', ''), + 'grid_block_dimensions_table': tables.get('grid_block_dimensions', {}).get('html', ''), + 'performance_bottlenecks_table': tables.get('performance_bottlenecks', {}).get('html', '') + } + + # Render the template + try: + from jinja2 import Template + template = Template(html_template) + html_report = template.render(**context) + except Exception as e: + print(f"Error rendering HTML template: {e}") + # Fallback to simple string replacement + html_report = html_template + for key, value in context.items(): + placeholder = '{{ ' + key + '|safe }}' if '|safe' in html_template else '{{ ' + key + ' }}' + html_report = html_report.replace(placeholder, str(value)) + + # Save the HTML report + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + output_path = os.path.join(self.html_output_dir, f"cuda_trace_analysis_report_{timestamp}.html") + with open(output_path, "w") as f: + f.write(html_report) + + print(f"HTML report saved to {output_path}") + + return output_path + + def _markdown_to_html(self, markdown_text): + """Convert markdown text to HTML""" + try: + import markdown + return markdown.markdown(markdown_text) + except ImportError: + # Simple fallback if markdown module is not available + html = markdown_text + # Convert headers + for i in range(6, 0, -1): + pattern = '#' * i + ' ' + html = html.replace(pattern, f'') + # Close the tag at the end of the line + lines = [] + for line in html.split('\n'): + if line.startswith(f''): + line = line + f'' + lines.append(line) + html = '\n'.join(lines) + + # Convert bold + html = html.replace('**', '') + # Convert italic + html = html.replace('*', '') + # Convert paragraphs + html = '

<p>' + html.replace('\n\n', '</p><p>') + '</p>'
+            # Convert line breaks
+            html = html.replace('\n', '<br>
') + + return html + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Enhanced CUDA Trace LLM Analyzer") + parser.add_argument("analysis_dir", help="Path to the analysis results directory") + parser.add_argument("--enhanced_dir", help="Path to enhanced visualizations directory") + parser.add_argument("--llm_mode", choices=["mock", "openai", "local"], default="mock", help="LLM analysis mode") + parser.add_argument("--api_key", help="OpenAI API key (for openai mode)") + parser.add_argument("--model_endpoint", default="http://localhost:8000/v1/chat/completions", help="Local LLM API endpoint (for local mode)") + + args = parser.parse_args() + + analyzer = EnhancedCUDATraceLLMAnalyzer(args.analysis_dir, args.enhanced_dir) + + if args.llm_mode == "openai": + analyzer.analyze_with_openai(args.api_key) + elif args.llm_mode == "local": + analyzer.analyze_with_local_llm(args.model_endpoint) + else: # mock mode + analyzer.generate_mock_analysis() diff --git a/llm-analysis/enhanced_cuda_trace_analysis.py b/llm-analysis/enhanced_cuda_trace_analysis.py new file mode 100644 index 0000000..c1f2f51 --- /dev/null +++ b/llm-analysis/enhanced_cuda_trace_analysis.py @@ -0,0 +1,332 @@ +#!/usr/bin/env python3 +""" +Enhanced CUDA Trace Analysis - Main program integrating all components +""" + +import os +import sys +import json +import argparse +import time +from datetime import datetime +import shutil +import subprocess +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns +import numpy as np + +# Import custom modules +from enhanced_cuda_llm_analyzer import EnhancedCUDATraceLLMAnalyzer +from cuda_prompt_templates import CUDAPromptTemplates +from cuda_llm_analysis_tester import CUDALLMAnalysisTester + +def parse_trace_file(trace_file, output_dir): + """Parse the CUDA trace file using the existing parser""" + print(f"Parsing trace file: {trace_file}") + + # Create output directory if it doesn't exist + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + # Check if the trace file exists + if not os.path.exists(trace_file): + print(f"Error: Trace file not found at {trace_file}") + return None + + # Check if cuda_trace_parser.py exists in the current directory + parser_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "cuda_trace_parser.py") + if not os.path.exists(parser_path): + print(f"Error: cuda_trace_parser.py not found at {parser_path}") + return None + + # Run the parser + parsed_output = os.path.join(output_dir, "parsed_trace.json") + cmd = [sys.executable, parser_path, trace_file, "--output", parsed_output] + + try: + subprocess.run(cmd, check=True) + print(f"Trace file parsed successfully. 
Output saved to {parsed_output}") + return parsed_output + except subprocess.CalledProcessError as e: + print(f"Error parsing trace file: {e}") + return None + +def analyze_trace_data(parsed_trace, output_dir): + """Analyze the parsed trace data using the existing analyzer""" + print(f"Analyzing trace data from: {parsed_trace}") + + # Create output directory if it doesn't exist + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + # Check if cuda_trace_analyzer.py exists in the current directory + analyzer_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "cuda_trace_analyzer.py") + if not os.path.exists(analyzer_path): + print(f"Error: cuda_trace_analyzer.py not found at {analyzer_path}") + return None + + # Run the analyzer + cmd = [sys.executable, analyzer_path, parsed_trace, "--output_dir", output_dir] + + try: + subprocess.run(cmd, check=True) + print(f"Trace data analyzed successfully. Results saved to {output_dir}") + return output_dir + except subprocess.CalledProcessError as e: + print(f"Error analyzing trace data: {e}") + return None + +def enhance_visualizations(analysis_dir, enhanced_dir): + """Enhance the visualizations using the existing visualization organizer""" + print(f"Enhancing visualizations from: {analysis_dir}") + + # Create enhanced directory if it doesn't exist + if not os.path.exists(enhanced_dir): + os.makedirs(enhanced_dir) + + # Check if cuda_visualization_organizer.py exists in the current directory + organizer_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "cuda_visualization_organizer.py") + if not os.path.exists(organizer_path): + print(f"Error: cuda_visualization_organizer.py not found at {organizer_path}") + return None + + # Run the visualization organizer + cmd = [sys.executable, organizer_path, analysis_dir, "--output_dir", enhanced_dir] + + try: + subprocess.run(cmd, check=True) + print(f"Visualizations enhanced successfully. Results saved to {enhanced_dir}") + return enhanced_dir + except subprocess.CalledProcessError as e: + print(f"Error enhancing visualizations: {e}") + return None + +def perform_llm_analysis(analysis_dir, enhanced_dir, llm_mode, api_key=None, model_endpoint=None): + """Perform LLM analysis using the enhanced LLM analyzer""" + print(f"Performing LLM analysis using mode: {llm_mode}") + + # Initialize the enhanced LLM analyzer + analyzer = EnhancedCUDATraceLLMAnalyzer(analysis_dir, enhanced_dir) + + # Perform the analysis + if llm_mode == "openai": + section_analyses = analyzer.analyze_with_openai(api_key) + elif llm_mode == "local": + section_analyses = analyzer.analyze_with_local_llm(model_endpoint) + else: # mock mode + section_analyses = analyzer.generate_mock_analysis() + + if not section_analyses: + print("Error: LLM analysis failed.") + return None + + print(f"LLM analysis completed successfully. 
Results saved to {os.path.join(analysis_dir, 'llm_analysis')}") + print(f"HTML report saved to {os.path.join(analysis_dir, 'html_report')}") + + return section_analyses + +def test_llm_analysis(analysis_dir, enhanced_dir, test_dir, llm_mode, api_key=None, model_endpoint=None): + """Test the LLM analysis using the testing framework""" + print(f"Testing LLM analysis using mode: {llm_mode}") + + # Initialize the LLM analysis tester + tester = CUDALLMAnalysisTester(analysis_dir, enhanced_dir, test_dir) + + # Perform the tests + if llm_mode == "openai": + test_results, quality_metrics = tester.test_openai_analysis(api_key) + elif llm_mode == "local": + test_results, quality_metrics = tester.test_local_llm_analysis(model_endpoint) + else: # mock mode + test_results = tester.test_mock_analysis() + quality_metrics = None + + if not test_results: + print("Error: LLM analysis testing failed.") + return None + + print(f"LLM analysis testing completed successfully. Results saved to {test_dir}") + + return test_results, quality_metrics + +def create_final_report(analysis_dir, enhanced_dir, llm_analysis_dir, output_file): + """Create a final report combining all analysis results""" + print(f"Creating final report: {output_file}") + + # Load analysis results + analysis_results_path = os.path.join(analysis_dir, "analysis_results.json") + if os.path.exists(analysis_results_path): + with open(analysis_results_path, 'r') as f: + analysis_results = json.load(f) + else: + print(f"Warning: Analysis results not found at {analysis_results_path}") + analysis_results = {} + + # Load summary + summary_path = os.path.join(analysis_dir, "analysis_summary.json") + if os.path.exists(summary_path): + with open(summary_path, 'r') as f: + summary = json.load(f) + else: + print(f"Warning: Analysis summary not found at {summary_path}") + summary = {} + + # Find the combined LLM analysis + combined_analysis_path = None + for filename in os.listdir(llm_analysis_dir): + if filename.startswith("llm_analysis_combined_"): + combined_analysis_path = os.path.join(llm_analysis_dir, filename) + break + + if not combined_analysis_path: + print("Warning: Combined LLM analysis not found") + combined_analysis = "LLM analysis not available." 
+ else: + with open(combined_analysis_path, 'r') as f: + combined_analysis = f.read() + + # Find the HTML report + html_report_dir = os.path.join(analysis_dir, "html_report") + html_report_path = None + if os.path.exists(html_report_dir): + for filename in os.listdir(html_report_dir): + if filename.startswith("cuda_trace_analysis_report_"): + html_report_path = os.path.join(html_report_dir, filename) + break + + # Create the final report + report = f"""# CUDA Trace Analysis Report + +## Summary + +- Total trace entries: {summary.get('total_trace_entries', 'Unknown')} +- Unique API functions: {summary.get('unique_api_functions', 'Unknown')} +- Unique kernels: {summary.get('unique_kernels', 'Unknown')} +- Trace duration: {summary.get('trace_duration_seconds', 'Unknown')} seconds + +## Analysis Results + +{combined_analysis} + +## Visualizations + +The following visualizations are available in the analysis directory: + +- API Distribution: {os.path.join(analysis_dir, "api_distribution.png")} +- Kernel Name Distribution: {os.path.join(analysis_dir, "kernel_name_distribution.png")} +- Memory Operations: {os.path.join(analysis_dir, "memory_operations.png")} +- API Call Timeline: {os.path.join(analysis_dir, "api_call_timeline.png")} + +Enhanced visualizations are available in the enhanced directory: + +- CUDA Trace Dashboard: {os.path.join(enhanced_dir, "cuda_trace_dashboard.png")} + +## HTML Report + +An interactive HTML report is available at: + +{html_report_path if html_report_path else "HTML report not available."} + +## Generated on + +{datetime.now().strftime("%Y-%m-%d %H:%M:%S")} +""" + + # Write the report to file + with open(output_file, 'w') as f: + f.write(report) + + print(f"Final report created successfully: {output_file}") + + return output_file + +def main(): + parser = argparse.ArgumentParser(description="Enhanced CUDA Trace Analysis") + parser.add_argument("trace_file", help="Path to the CUDA trace file") + parser.add_argument("--output_dir", default="./cuda_analysis_results", help="Output directory for analysis results") + parser.add_argument("--llm_mode", choices=["mock", "openai", "local"], default="mock", help="LLM analysis mode") + parser.add_argument("--api_key", help="OpenAI API key (for openai mode)") + parser.add_argument("--model_endpoint", default="http://localhost:8000/v1/chat/completions", help="Local LLM API endpoint (for local mode)") + parser.add_argument("--skip_parsing", action="store_true", help="Skip trace file parsing (use existing parsed data)") + parser.add_argument("--skip_analysis", action="store_true", help="Skip trace data analysis (use existing analysis results)") + parser.add_argument("--skip_visualization", action="store_true", help="Skip visualization enhancement (use existing enhanced visualizations)") + parser.add_argument("--test_llm", action="store_true", help="Test LLM analysis using the testing framework") + + args = parser.parse_args() + + # Create output directories + output_dir = os.path.abspath(args.output_dir) + analysis_dir = os.path.join(output_dir, "analysis") + enhanced_dir = os.path.join(output_dir, "enhanced") + llm_analysis_dir = os.path.join(analysis_dir, "llm_analysis") + test_dir = os.path.join(output_dir, "test_results") + + for directory in [output_dir, analysis_dir, enhanced_dir, llm_analysis_dir, test_dir]: + if not os.path.exists(directory): + os.makedirs(directory) + + # Step 1: Parse the trace file + if not args.skip_parsing: + parsed_trace = parse_trace_file(args.trace_file, output_dir) + if not parsed_trace: + 
print("Error: Failed to parse trace file. Exiting.") + return 1 + else: + parsed_trace = os.path.join(output_dir, "parsed_trace.json") + if not os.path.exists(parsed_trace): + print(f"Error: Parsed trace file not found at {parsed_trace}. Please run without --skip_parsing.") + return 1 + print(f"Using existing parsed trace file: {parsed_trace}") + + # Step 2: Analyze the trace data + if not args.skip_analysis: + analysis_results = analyze_trace_data(parsed_trace, analysis_dir) + if not analysis_results: + print("Error: Failed to analyze trace data. Exiting.") + return 1 + else: + if not os.path.exists(os.path.join(analysis_dir, "analysis_results.json")): + print(f"Error: Analysis results not found in {analysis_dir}. Please run without --skip_analysis.") + return 1 + print(f"Using existing analysis results in: {analysis_dir}") + + # Step 3: Enhance the visualizations + if not args.skip_visualization: + enhanced_results = enhance_visualizations(analysis_dir, enhanced_dir) + if not enhanced_results: + print("Error: Failed to enhance visualizations. Exiting.") + return 1 + else: + if not os.path.exists(os.path.join(enhanced_dir, "cuda_trace_dashboard.png")): + print(f"Error: Enhanced visualizations not found in {enhanced_dir}. Please run without --skip_visualization.") + return 1 + print(f"Using existing enhanced visualizations in: {enhanced_dir}") + + # Step 4: Perform LLM analysis + section_analyses = perform_llm_analysis(analysis_dir, enhanced_dir, args.llm_mode, args.api_key, args.model_endpoint) + if not section_analyses: + print("Error: Failed to perform LLM analysis. Exiting.") + return 1 + + # Step 5: Test LLM analysis (optional) + if args.test_llm: + test_results = test_llm_analysis(analysis_dir, enhanced_dir, test_dir, args.llm_mode, args.api_key, args.model_endpoint) + if not test_results: + print("Warning: LLM analysis testing failed.") + + # Step 6: Create final report + final_report = create_final_report(analysis_dir, enhanced_dir, llm_analysis_dir, os.path.join(output_dir, "final_report.md")) + if not final_report: + print("Error: Failed to create final report. Exiting.") + return 1 + + print("\nAnalysis completed successfully!") + print(f"Results saved to: {output_dir}") + print(f"Final report: {final_report}") + print(f"HTML report: {os.path.join(analysis_dir, 'html_report')}") + + return 0 + +if __name__ == "__main__": + sys.exit(main()) diff --git a/llm-analysis/requirements.txt b/llm-analysis/requirements.txt new file mode 100644 index 0000000..717b7b1 --- /dev/null +++ b/llm-analysis/requirements.txt @@ -0,0 +1,10 @@ +pandas +matplotlib +seaborn +numpy +pillow +requests +openai>=1.0.0 +jinja2 +markdown +tabulate diff --git a/llm-analysis/trace.out b/llm-analysis/trace.out new file mode 100644 index 0000000..3bc7c64 --- /dev/null +++ b/llm-analysis/trace.out @@ -0,0 +1,7546 @@ +Compiling test_cuda_api_multi_gpu.cu... +Compilation successful. +Starting test_cuda_api_multi_gpu... +CUDA process running with PID: 3359096 +Running gpuevent_snoop for 30 seconds... 
+Found Symbol cudaLaunchKernel at /efs/NFLX-GENAI-PROJECTS/GPUSNOOP/LLM/test_cuda_api_multi_gpu Offset: 0x0 +Found Symbol cudaLaunchKernel at /usr/local/cuda-12.8/targets/x86_64-linux/lib/libcudart.so.12.8.90 Offset: 0x79d40 +Found Symbol cudaLaunchKernel at /usr/lib/x86_64-linux-gnu/libcudart.so.11.5.117 Offset: 0x68630 +Found Symbol cudaLaunchCooperativeKernel at /usr/local/cuda-12.8/targets/x86_64-linux/lib/libcudart.so.12.8.90 Offset: 0x7a2e0 +Found Symbol cudaLaunchCooperativeKernel at /usr/lib/x86_64-linux-gnu/libcudart.so.11.5.117 Offset: 0x68900 +Found Symbol cudaGraphLaunch at /usr/local/cuda-12.8/targets/x86_64-linux/lib/libcudart.so.12.8.90 Offset: 0x7be20 +Found Symbol cudaGraphLaunch at /usr/lib/x86_64-linux-gnu/libcudart.so.11.5.117 Offset: 0x69c90 +Found Symbol cudaMalloc at /efs/NFLX-GENAI-PROJECTS/GPUSNOOP/LLM/test_cuda_api_multi_gpu Offset: 0x0 +Found Symbol cudaMalloc at /usr/local/cuda-12.8/targets/x86_64-linux/lib/libcudart.so.12.8.90 Offset: 0x577b0 +Found Symbol cudaMalloc at /usr/lib/x86_64-linux-gnu/libcudart.so.11.5.117 Offset: 0x4dc80 +Found Symbol cudaFree at /efs/NFLX-GENAI-PROJECTS/GPUSNOOP/LLM/test_cuda_api_multi_gpu Offset: 0x0 +Found Symbol cudaFree at /usr/local/cuda-12.8/targets/x86_64-linux/lib/libcudart.so.12.8.90 Offset: 0x58050 +Found Symbol cudaFree at /usr/lib/x86_64-linux-gnu/libcudart.so.11.5.117 Offset: 0x4e3c0 +Found Symbol cudaMemcpy at /efs/NFLX-GENAI-PROJECTS/GPUSNOOP/LLM/test_cuda_api_multi_gpu Offset: 0x0 +Found Symbol cudaMemcpy at /usr/local/cuda-12.8/targets/x86_64-linux/lib/libcudart.so.12.8.90 Offset: 0x73df0 +Found Symbol cudaMemcpy at /usr/lib/x86_64-linux-gnu/libcudart.so.11.5.117 Offset: 0x639e0 +Found Symbol cudaMemcpyAsync at /efs/NFLX-GENAI-PROJECTS/GPUSNOOP/LLM/test_cuda_api_multi_gpu Offset: 0x0 +Found Symbol cudaMemcpyAsync at /usr/local/cuda-12.8/targets/x86_64-linux/lib/libcudart.so.12.8.90 Offset: 0x766f0 +Found Symbol cudaMemcpyAsync at /usr/lib/x86_64-linux-gnu/libcudart.so.11.5.117 Offset: 0x659b0 +Found Symbol cudaStreamCreate at /efs/NFLX-GENAI-PROJECTS/GPUSNOOP/LLM/test_cuda_api_multi_gpu Offset: 0x0 +Found Symbol cudaStreamCreate at /usr/local/cuda-12.8/targets/x86_64-linux/lib/libcudart.so.12.8.90 Offset: 0x4f640 +Found Symbol cudaStreamCreate at /usr/lib/x86_64-linux-gnu/libcudart.so.11.5.117 Offset: 0x47f10 +Found Symbol cudaStreamDestroy at /efs/NFLX-GENAI-PROJECTS/GPUSNOOP/LLM/test_cuda_api_multi_gpu Offset: 0x0 +Found Symbol cudaStreamDestroy at /usr/local/cuda-12.8/targets/x86_64-linux/lib/libcudart.so.12.8.90 Offset: 0x50ba0 +Found Symbol cudaStreamDestroy at /usr/lib/x86_64-linux-gnu/libcudart.so.11.5.117 Offset: 0x48e40 +Found Symbol cudaStreamSynchronize at /efs/NFLX-GENAI-PROJECTS/GPUSNOOP/LLM/test_cuda_api_multi_gpu Offset: 0x0 +Found Symbol cudaStreamSynchronize at /usr/local/cuda-12.8/targets/x86_64-linux/lib/libcudart.so.12.8.90 Offset: 0x79b50 +Found Symbol cudaStreamSynchronize at /usr/lib/x86_64-linux-gnu/libcudart.so.11.5.117 Offset: 0x68480 +Found Symbol cudaEventRecord at /usr/local/cuda-12.8/targets/x86_64-linux/lib/libcudart.so.12.8.90 Offset: 0x790c0 +Found Symbol cudaEventRecord at /usr/lib/x86_64-linux-gnu/libcudart.so.11.5.117 Offset: 0x67b40 +Found Symbol cudaEventSynchronize at /usr/local/cuda-12.8/targets/x86_64-linux/lib/libcudart.so.12.8.90 Offset: 0x535e0 +Found Symbol cudaEventSynchronize at /usr/lib/x86_64-linux-gnu/libcudart.so.11.5.117 Offset: 0x4ae60 +Found Symbol cudaEventElapsedTime at /usr/local/cuda-12.8/targets/x86_64-linux/lib/libcudart.so.12.8.90 Offset: 0x53980 +Found 
Symbol cudaEventElapsedTime at /usr/lib/x86_64-linux-gnu/libcudart.so.11.5.117 Offset: 0x4b1a0 +Found Symbol cudaDeviceSynchronize at /usr/local/cuda-12.8/targets/x86_64-linux/lib/libcudart.so.12.8.90 Offset: 0x4a310 +Found Symbol cudaDeviceSynchronize at /usr/lib/x86_64-linux-gnu/libcudart.so.11.5.117 Offset: 0x43aa0 +Started profiling at Sat Apr 19 07:24:38 2025 +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + 
__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (996147200,28909,4194304), Block: (2,0,-1045777616) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055144304,24243,-1055296720) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055293536) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 
+[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (325058560,28909,4194304), Block: (2,0,-1035793696) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055293536) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055340144,24243,-1055293536) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055278112) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float 
arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-346030080,28908,4194304), Block: (2,0,-1025097136) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055278112) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 
+[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054957296,24243,-1055278112) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + 
2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055278112) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:48 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:48 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:48 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:48 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:48 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:48 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-346030080,28908,4194304), Block: (2,0,-1025097136) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:48 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055278112) +[ARGS] 
+[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:48 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:48 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054957296,24243,-1055278112) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:48 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:48 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:48 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:48 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:48 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:48 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start 
+-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:48 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:48 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-1017118720,28908,4194304), Block: (2,0,-1014366304) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:48 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:48 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:48 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054917056,24243,-1055296960) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + 
main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (996147200,28909,4194304), Block: (2,0,-1045777616) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055144304,24243,-1055296720) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055293536) 
+[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (325058560,28909,4194304), Block: (2,0,-1035793696) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055293536) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- 
+[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055340144,24243,-1055293536) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055278112) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m 
[3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-346030080,28908,4194304), Block: (2,0,-1025097136) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055278112) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054957296,24243,-1055278112) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 
2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-1017118720,28908,4194304), Block: (2,0,-1014366304) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:50 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054917056,24243,-1055296960) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- 
+[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (996147200,28909,4194304), Block: (2,0,-1045777616) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) 
+[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055144304,24243,-1055296720) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055293536) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 
float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (325058560,28909,4194304), Block: (2,0,-1035793696) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055293536) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055340144,24243,-1055293536) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055278112) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add 
arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-346030080,28908,4194304), Block: (2,0,-1025097136) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055278112) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054957296,24243,-1055278112) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), 
Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-1017118720,28908,4194304), Block: (2,0,-1014366304) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start 
+-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054917056,24243,-1055296960) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int 
arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (996147200,28909,4194304), Block: (2,0,-1045777616) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055144304,24243,-1055296720) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055293536) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 
cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054917056,24243,-1055296960) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] 
vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (996147200,28909,4194304), Block: (2,0,-1045777616) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055144304,24243,-1055296720) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055293536) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: 
(4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (325058560,28909,4194304), Block: (2,0,-1035793696) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055293536) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055340144,24243,-1055293536) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] 
+[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055278112) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-346030080,28908,4194304), Block: (2,0,-1025097136) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- 
+[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055278112) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054957296,24243,-1055278112) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 
+[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-1017118720,28908,4194304), Block: (2,0,-1014366304) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054917056,24243,-1055296960) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float 
arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (996147200,28909,4194304), Block: (2,0,-1045777616) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055144304,24243,-1055296720) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start 
+-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055293536) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (325058560,28909,4194304), Block: (2,0,-1035793696) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m 
[3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055293536) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055340144,24243,-1055293536) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055278112) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + 
__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-346030080,28908,4194304), Block: (2,0,-1025097136) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055278112) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054957296,24243,-1055278112) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 
+[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-1017118720,28908,4194304), Block: (2,0,-1014366304) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054917056,24243,-1055296960) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 
2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (996147200,28909,4194304), Block: (2,0,-1045777616) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296720) +[ARGS] 
+[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055144304,24243,-1055296720) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055293536) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start 
+-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (325058560,28909,4194304), Block: (2,0,-1035793696) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055293536) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055340144,24243,-1055293536) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055278112) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + 
main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-346030080,28908,4194304), Block: (2,0,-1025097136) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055278112) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054957296,24243,-1055278112) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296960) 
+[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-1017118720,28908,4194304), Block: (2,0,-1014366304) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- 
+[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054917056,24243,-1055296960) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m 
[3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (996147200,28909,4194304), Block: (2,0,-1045777616) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055144304,24243,-1055296720) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055293536) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 
2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (325058560,28909,4194304), Block: (2,0,-1035793696) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055293536) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055340144,24243,-1055293536) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055278112) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- 
+[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-346030080,28908,4194304), Block: (2,0,-1025097136) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055278112) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) 
+[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054957296,24243,-1055278112) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 
float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-1017118720,28908,4194304), Block: (2,0,-1014366304) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054917056,24243,-1055296960) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +Done Profiling: exceeded duration of 30s. +Stopped profiling at Sat Apr 19 07:25:09 2025 +Stopping CUDA program... +Tracing completed. diff --git a/llm-sample-results/analysis/analysis_results.json b/llm-sample-results/analysis/analysis_results.json new file mode 100644 index 0000000..d339125 --- /dev/null +++ b/llm-sample-results/analysis/analysis_results.json @@ -0,0 +1,204 @@ +{ + "api_distribution": [ + { + "API Function": "cudaLaunchKernel", + "Count": 300 + }, + { + "API Function": "cudaMemcpy", + "Count": 120 + }, + { + "API Function": "cudaMemcpyAsync", + "Count": 60 + }, + { + "API Function": "cudaStreamSynchronize", + "Count": 60 + }, + { + "API Function": "cudaMalloc", + "Count": 60 + }, + { + "API Function": "cudaFree", + "Count": 60 + } + ], + "kernel_name_distribution": [ + { + "Kernel Name": "vector_add", + "Count": 300 + } + ], + "grid_block_dimensions": [ + { + "Value": 4096, + "Count": 300, + "Dimension": "grid_x" + }, + { + "Value": 1, + "Count": 300, + "Dimension": "grid_y" + }, + { + "Value": 1, + "Count": 300, + "Dimension": "grid_z" + }, + { + "Value": 256, + "Count": 300, + "Dimension": "block_x" + }, + { + "Value": 1, + "Count": 300, + "Dimension": "block_y" + }, + { + "Value": 1, + "Count": 300, + "Dimension": "block_z" + } + ], + "temporal_distribution": [ + { + "Relative Time (s)": 0.0, + "API Call Count": 44 + }, + { + "Relative Time (s)": 2.0, + "API Call Count": 44 + }, + { + "Relative Time (s)": 4.0, + "API Call Count": 44 + }, + { + "Relative Time (s)": 6.0, + "API Call Count": 44 + }, + { + "Relative Time (s)": 8.0, + "API Call Count": 44 + }, + { + "Relative Time (s)": 10.0, + "API Call Count": 44 + }, + { + "Relative Time (s)": 12.0, + "API Call Count": 44 + }, + { + "Relative Time (s)": 14.0, + "API Call Count": 44 + }, + { + "Relative Time (s)": 16.0, + "API Call Count": 44 + }, + { + "Relative Time (s)": 18.0, + "API Call Count": 44 + }, + { + "Relative Time (s)": 
20.0, + "API Call Count": 44 + }, + { + "Relative Time (s)": 22.0, + "API Call Count": 44 + }, + { + "Relative Time (s)": 24.0, + "API Call Count": 44 + }, + { + "Relative Time (s)": 26.0, + "API Call Count": 44 + }, + { + "Relative Time (s)": 28.0, + "API Call Count": 44 + } + ], + "memory_operations": [ + { + "Memory Operation": "cudaMemcpy", + "Count": 120 + }, + { + "Memory Operation": "cudaMemcpyAsync", + "Count": 60 + }, + { + "Memory Operation": "cudaMalloc", + "Count": 60 + }, + { + "Memory Operation": "cudaFree", + "Count": 60 + } + ], + "call_site_distribution": [ + { + "Call Site": "__libc_start_main", + "Count": 360 + }, + { + "Call Site": "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "Count": 300 + } + ], + "call_site_api_relationship": [ + { + "Call Site": "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "CUDA API Function": "cudaLaunchKernel", + "Count": 300 + }, + { + "Call Site": "__libc_start_main", + "CUDA API Function": "cudaMemcpy", + "Count": 120 + }, + { + "Call Site": "__libc_start_main", + "CUDA API Function": "cudaFree", + "Count": 60 + }, + { + "Call Site": "__libc_start_main", + "CUDA API Function": "cudaMalloc", + "Count": 60 + }, + { + "Call Site": "__libc_start_main", + "CUDA API Function": "cudaMemcpyAsync", + "Count": 60 + }, + { + "Call Site": "__libc_start_main", + "CUDA API Function": "cudaStreamSynchronize", + "Count": 60 + } + ], + "summary": { + "total_trace_entries": 660, + "unique_api_functions": 6, + "top_api_functions": { + "cudaLaunchKernel": 300, + "cudaMemcpy": 120, + "cudaMemcpyAsync": 60, + "cudaStreamSynchronize": 60, + "cudaMalloc": 60 + }, + "unique_kernels": 1, + "kernel_names": [ + "vector_add" + ], + "trace_duration_seconds": 28.0 + } +} \ No newline at end of file diff --git a/llm-sample-results/analysis/analysis_summary.json b/llm-sample-results/analysis/analysis_summary.json new file mode 100644 index 0000000..397a51c --- /dev/null +++ b/llm-sample-results/analysis/analysis_summary.json @@ -0,0 +1,16 @@ +{ + "total_trace_entries": 660, + "unique_api_functions": 6, + "top_api_functions": { + "cudaLaunchKernel": 300, + "cudaMemcpy": 120, + "cudaMemcpyAsync": 60, + "cudaStreamSynchronize": 60, + "cudaMalloc": 60 + }, + "unique_kernels": 1, + "kernel_names": [ + "vector_add" + ], + "trace_duration_seconds": 28.0 +} \ No newline at end of file diff --git a/llm-sample-results/analysis/api_call_timeline.png b/llm-sample-results/analysis/api_call_timeline.png new file mode 100644 index 0000000..0d80516 Binary files /dev/null and b/llm-sample-results/analysis/api_call_timeline.png differ diff --git a/llm-sample-results/analysis/api_distribution.png b/llm-sample-results/analysis/api_distribution.png new file mode 100644 index 0000000..9f1e5f8 Binary files /dev/null and b/llm-sample-results/analysis/api_distribution.png differ diff --git a/llm-sample-results/analysis/api_time_heatmap.png b/llm-sample-results/analysis/api_time_heatmap.png new file mode 100644 index 0000000..0a5b19e Binary files /dev/null and b/llm-sample-results/analysis/api_time_heatmap.png differ diff --git a/llm-sample-results/analysis/block_dimensions.png b/llm-sample-results/analysis/block_dimensions.png new file mode 100644 index 0000000..f20cf2c Binary files /dev/null and b/llm-sample-results/analysis/block_dimensions.png differ diff --git a/llm-sample-results/analysis/call_site_api_heatmap.png b/llm-sample-results/analysis/call_site_api_heatmap.png new file mode 100644 index 0000000..e869b70 Binary files /dev/null and 
b/llm-sample-results/analysis/call_site_api_heatmap.png differ diff --git a/llm-sample-results/analysis/call_site_distribution.png b/llm-sample-results/analysis/call_site_distribution.png new file mode 100644 index 0000000..891a586 Binary files /dev/null and b/llm-sample-results/analysis/call_site_distribution.png differ diff --git a/llm-sample-results/analysis/grid_dimensions.png b/llm-sample-results/analysis/grid_dimensions.png new file mode 100644 index 0000000..cc36b01 Binary files /dev/null and b/llm-sample-results/analysis/grid_dimensions.png differ diff --git a/llm-sample-results/analysis/html_report/cuda_trace_analysis_report_20250423_234928.html b/llm-sample-results/analysis/html_report/cuda_trace_analysis_report_20250423_234928.html new file mode 100644 index 0000000..1c40f8e --- /dev/null +++ b/llm-sample-results/analysis/html_report/cuda_trace_analysis_report_20250423_234928.html @@ -0,0 +1,686 @@ + + + + + + + CUDA Trace Analysis Report + + + + +

CUDA Trace Analysis Report

CUDA Trace Dashboard

[Image: CUDA Trace Dashboard]

Overview

Executive Summary of CUDA Trace Data

1. Main Characteristics of the Application

The application utilizes a limited set of CUDA API functions, with six unique functions tracked throughout the trace period. The predominant activity involves kernel execution and memory operations, indicating a focus on computing tasks. The consistent use of a single kernel, "vector_add," suggests the application is specialized in a particular type of computation, likely vector addition or similar operations.

2. Significant Patterns Observed

  • API Distribution: The API call distribution shows that cudaLaunchKernel is the most frequently used API, accounting for 45.5% of total calls. This highlights intensive kernel activity.
  • Memory Operations: Frequent cudaMemcpy calls suggest significant host-device memory transfers.
  • Temporal Analysis: There is a consistent distribution of CUDA API calls over time, with no significant spikes, implying stable performance without notable bottlenecks.

3. Key Performance Considerations

  • Synchronization: The application has a high synchronization frequency (60 operations over the trace period), a potential area for optimization by reducing unnecessary synchronization points.
  • Memory-Launch Ratio: The memory copy to kernel launch ratio is 0.60, which indicates a healthy balance between data transfers and computation.
  • Launch Configuration: The kernel uses a common grid/block configuration, which may be efficient but could be further optimized based on specific hardware or workload to enhance performance.

4. Assessment of CUDA Implementation Quality

Overall, the CUDA implementation is effective but could benefit from optimizations. The use of consistent API calls and balanced memory operations are strengths. However, there is room for improvement in synchronization management and possibly the grid/block configuration to better utilize device capabilities and minimize overheads.

In summary, the application demonstrates a focused use of CUDA capabilities with potential for improved efficiency through targeted optimizations.

[Image: CUDA Trace Dashboard]

API Distribution

Certainly! Here's a detailed analysis of the CUDA API distribution based on the provided data:

1. Most Frequently Used CUDA API Functions

  • cudaLaunchKernel (45.45%): This is the most frequently used API call, indicating that the application is heavily focused on executing GPU kernels. This suggests that the application is computation-intensive and utilizes GPU acceleration effectively to execute parallel operations.
  • cudaMemcpy (18.18%): This indicates significant data transfer between the host and device. High usage may suggest repeated data movement, which could become a performance bottleneck if not optimized.

2. Balance Between Different Types of Operations

  • Compute (cudaLaunchKernel): Dominates the API distribution, showing the application's reliance on GPU computation.
  • Memory Operations (cudaMemcpy, cudaMemcpyAsync, cudaMalloc, cudaFree):
    - cudaMemcpy and cudaMemcpyAsync (27.27% combined): Memory transfers are substantial but not overwhelming, indicating a reasonable balance in data management.
    - cudaMalloc and cudaFree (18.18% combined): Frequent memory allocation and deallocation could indicate potential inefficiencies if allocations are too dynamic.
  • Synchronization (cudaStreamSynchronize - 9.09%): This suggests some level of synchronization is needed, but it isn't excessive, which generally is a good sign as excessive synchronization can hinder performance.

3. Unusual or Inefficient API Usage Patterns

  • Frequent cudaMalloc and cudaFree: If these calls are repeated many times in a loop, it may indicate inefficiency in memory management. Allocating and deallocating memory in tight loops can significantly reduce performance.
  • High Usage of cudaMemcpy: Could be a potential area for optimization, such as ensuring maximum data transfer size per call or overlapping data transfers with computation.

4. Recommendations for Optimizing API Usage

  • Optimize Memory Transfers:
    - Use asynchronous memory copies (cudaMemcpyAsync) more extensively to overlap data transfer and kernel execution.
    - Batch data transfers or increase data granularity to reduce the number of transfer operations.
  • Improve Memory Management:
    - Reduce frequent calls to cudaMalloc and cudaFree by reusing allocated memory wherever possible.
    - Consider using memory pools or pre-allocating buffer spaces.
  • Kernel Optimization:
    - Ensure that there is no significant idle time between kernel executions.
    - Profile kernels to find any computation bottlenecks.
  • Reduce Synchronization Overhead:
    - Minimize the use of cudaStreamSynchronize by managing dependencies and using streams effectively to overlap operations.

By addressing these areas, the application can improve its overall execution efficiency on the GPU.
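As a rough illustration of the asynchronous-transfer recommendation above, the sketch below splits the work across two streams so that the copies for one half of the data can overlap with the vector_add launch for the other half. The kernel body, buffer names, and sizes are assumptions modelled on the sample program, not taken from the trace; pinned host memory is used because cudaMemcpyAsync only overlaps reliably with page-locked buffers.

```cpp
// Minimal sketch (assumed names/sizes): overlap copies and kernels with two streams.
#include <cuda_runtime.h>

__global__ void vector_add(float* a, float* b, float* c, int n) {   // stand-in for the sample kernel
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) c[i] = a[i] + b[i];
}

int main() {
    const int n = 4096 * 256, half = n / 2;
    const size_t bytes = n * sizeof(float), half_bytes = half * sizeof(float);
    float *h_a, *h_b, *d_a, *d_b, *d_c;

    cudaMallocHost(&h_a, bytes);                    // pinned host buffers: required for real overlap
    cudaMallocHost(&h_b, bytes);
    cudaMalloc(&d_a, bytes); cudaMalloc(&d_b, bytes); cudaMalloc(&d_c, bytes);

    cudaStream_t s[2];
    cudaStreamCreate(&s[0]); cudaStreamCreate(&s[1]);

    for (int k = 0; k < 2; ++k) {                   // each stream handles one half of the vectors
        int off = k * half;
        cudaMemcpyAsync(d_a + off, h_a + off, half_bytes, cudaMemcpyHostToDevice, s[k]);
        cudaMemcpyAsync(d_b + off, h_b + off, half_bytes, cudaMemcpyHostToDevice, s[k]);
        vector_add<<<half / 256, 256, 0, s[k]>>>(d_a + off, d_b + off, d_c + off, half);
    }
    cudaStreamSynchronize(s[0]); cudaStreamSynchronize(s[1]);   // one sync per stream at the end

    cudaStreamDestroy(s[0]); cudaStreamDestroy(s[1]);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    cudaFreeHost(h_a); cudaFreeHost(h_b);
    return 0;
}
```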

| API Function          | Count | Percentage |
|-----------------------|-------|------------|
| cudaLaunchKernel      | 300   | 45.45      |
| cudaMemcpy            | 120   | 18.18      |
| cudaMemcpyAsync       | 60    | 9.09       |
| cudaStreamSynchronize | 60    | 9.09       |
| cudaMalloc            | 60    | 9.09       |
| cudaFree              | 60    | 9.09       |
[Image: API Distribution]

Memory Operations


Analysis of CUDA Memory Operations

1. Assessment of Memory Transfer Patterns and Their Efficiency

The data suggests that cudaMemcpy operations account for 40% of memory operations, while cudaMemcpyAsync comprises 20%. This indicates a heavy reliance on synchronous memory transfers, which can be less efficient as they may block the host thread until the copy is complete.

Efficiency Analysis:

  • Synchronous Transfers (cudaMemcpy): Generally slower due to blocking behavior.
  • Asynchronous Transfers (cudaMemcpyAsync): More efficient when managed correctly as they do not block the host, allowing for overlap of computation and data transfer.

2. Analysis of the Balance Between Different Types of Memory Operations

All four types of operations (cudaMemcpy, cudaMemcpyAsync, cudaMalloc, and cudaFree) are represented, but there is a notable imbalance with a high proportion of cudaMemcpy. Allocation and deallocation (cudaMalloc and cudaFree) operations are equally distributed at 20% each.

The data skew towards cudaMemcpy might suggest missed opportunities for optimization using asynchronous transfers.

3. Identification of Potential Memory-Related Bottlenecks

  • Potential Bottleneck: The high percentage of synchronous memory transfers suggests potential underutilization of the GPU's ability to handle concurrent operations.
  • Allocation and Deallocation: Frequent and possibly unnecessary calls to cudaMalloc and cudaFree can also cause performance hits. These should be minimized and reused when possible.

4. Recommendations for Optimizing Memory Usage and Transfers

  1. Increase Asynchronous Transfers: Consider increasing the use of cudaMemcpyAsync to enable overlapping of memory transfer and computation. Utilize streams effectively to manage these operations without blocking the CPU.
  2. Optimize Memory Allocation:
     - Reuse memory allocations wherever possible instead of frequent malloc and free calls.
     - Consider using memory pools to manage small allocations which can reduce overhead.
  3. Streamlining the Memory Transfer:
     - Batch smaller data transfers into fewer, larger transfers to reduce the number of cudaMemcpy calls.
     - Ensure data alignment and coalesced access patterns to optimize bandwidth usage during transfers.
  4. Profile and Monitor:
     - Regularly profile the application to identify specific points of inefficiency.
     - Use CUDA profilers to monitor memory usage, transfer times, and kernel execution overlaps.

By implementing these recommendations, you can potentially improve throughput and reduce latency in your CUDA applications.
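The allocation recommendations above can be sketched as below. The loop and the names are assumptions (the trace only shows that cudaMalloc/cudaFree pairs recur with every batch of launches); the point is to allocate once and reuse the buffers. On CUDA 11.2 and newer, the stream-ordered allocator (cudaMallocAsync/cudaFreeAsync) offers a built-in pooling alternative.

```cpp
// Sketch (assumed loop and names): hoist device allocations out of the per-iteration loop.
#include <cuda_runtime.h>

void run_iterations(const float* h_a, const float* h_b, float* h_c, int n, int iterations) {
    const size_t bytes = n * sizeof(float);
    float *d_a, *d_b, *d_c;
    cudaMalloc(&d_a, bytes);             // allocate once up front...
    cudaMalloc(&d_b, bytes);
    cudaMalloc(&d_c, bytes);

    for (int i = 0; i < iterations; ++i) {
        cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice);
        cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice);
        // vector_add<<<(n + 255) / 256, 256>>>(d_a, d_b, d_c, n);  // kernel from the sample program
        cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);
    }

    cudaFree(d_a);                       // ...and free once at the end
    cudaFree(d_b);
    cudaFree(d_c);
}
```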

| Memory Operation | Count | Percentage |
|------------------|-------|------------|
| cudaMemcpy       | 120   | 40.0       |
| cudaMemcpyAsync  | 60    | 20.0       |
| cudaMalloc       | 60    | 20.0       |
| cudaFree         | 60    | 20.0       |
[Image: Memory Operations]

Kernel Launches


Analysis of CUDA Kernel Launch Patterns

1. Assessment of Kernel Launch Patterns and Their Implications for Performance

The kernel launch data shows that there is only one type of kernel, vector_add, being launched 300 times, making it a highly repetitive workload. This indicates that the application is computationally uniform, focusing intensely on vector addition. This uniformity might benefit from optimization to improve throughput and resource utilization.

The repetitive nature can lead to bottlenecks if this kernel doesn't fully utilize the GPU's capabilities.

2. Analysis of Grid and Block Dimensions

Grid Dimensions:
  • grid_x is consistently set at 4096, while grid_y and grid_z have a constant value of 1. This configuration implies that the computation is primarily one-dimensional, with a vast number of elements needing processing.

Block Dimensions:
  • block_x is always 256, indicating that each block processes 256 threads. The choice of 256 is often optimal as it's a multiple of the warp size (32 on most NVIDIA GPUs), allowing for more efficient execution.
  • block_y and block_z are set to 1, reinforcing that the computation is handled in a one-dimensional array.

3. Evaluation of Kernel Occupancy and Efficiency

Kernel occupancy refers to how well the GPU's resources (especially warps) are utilized:
  • With blocks of size 256 and grids of 4096, the resource utilization could be high if the GPU can handle this many threads per multiprocessor. However, without specific GPU details (e.g., SM count or available registers), precise occupancy cannot be calculated.
  • High occupancy is desirable but must be balanced against register usage and shared memory.

4. Recommendations for Optimizing Kernel Launch Configurations

  • Diversify Workload: If possible, consider diversifying computational tasks to balance load and better utilize GPU resources.
  • Experiment with Block Size: Although 256 is often optimal, experimenting with different block sizes (e.g., 128, 512) might yield performance improvements on various architectures.
  • Evaluate GPU Occupancy: Use tools like NVIDIA Nsight Compute to analyze actual occupancy and resource usage, which can guide whether grid/block dimensions are optimal.
  • Memory Coalescing: Ensure that memory accesses are coalesced for vector_add, which can significantly impact performance.
  • Consider Multi-Stream Execution: If execution time is a concern, utilizing multiple CUDA streams could help in overlapping computation and data transfer.

By understanding and tuning these parameters, performance improvements can be realized, especially when considering architectural specifics of the used GPU hardware.
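One way to act on the block-size and occupancy points above is to ask the runtime for a suggestion instead of hard-coding 256. This is only a sketch: the kernel body is a stand-in for the sample program's vector_add, and the element count is inferred from the traced grid and block shape.

```cpp
// Sketch (stand-in kernel, inferred sizes): derive a launch configuration from the occupancy API.
#include <cuda_runtime.h>
#include <cstdio>

__global__ void vector_add(float* a, float* b, float* c, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) c[i] = a[i] + b[i];
}

int main() {
    const int n = 4096 * 256;            // element count implied by Grid (4096,1,1) x Block (256,1,1)
    int min_grid_size = 0, block_size = 0;
    cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, vector_add, 0, 0);
    int grid_size = (n + block_size - 1) / block_size;   // round up so every element is covered
    printf("suggested block=%d grid=%d\n", block_size, grid_size);
    return 0;
}
```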

| Kernel Name | Count | Percentage |
|-------------|-------|------------|
| vector_add  | 300   | 100.0      |
| Dimension | Count (Value = 1) | Count (Value = 256) | Count (Value = 4096) |
|-----------|-------------------|---------------------|----------------------|
| block_x   | 0.0               | 300.0               | 0.0                  |
| block_y   | 300.0             | 0.0                 | 0.0                  |
| block_z   | 300.0             | 0.0                 | 0.0                  |
| grid_x    | 0.0               | 0.0                 | 300.0                |
| grid_y    | 300.0             | 0.0                 | 0.0                  |
| grid_z    | 300.0             | 0.0                 | 0.0                  |
[Image: Kernel Distribution]

Performance Bottlenecks


To effectively address the performance bottlenecks identified in your CUDA trace data, the analysis should cover explanations, causes, and potential solutions. Below is a detailed examination based on the provided table and context:

1. Detailed Explanation of Each Identified Bottleneck and Its Impact on Performance

Memory Transfer Overhead

  • Explanation: The memory transfer overhead indicates significant time spent moving data between the host and device memory. At 27.27% of operations, this overhead can considerably affect overall performance by lengthening execution times.
  • Impact: High overhead in data transfer can limit the speedup gained from parallel processing on the GPU. This reduces the potential performance benefits of using CUDA, as time spent moving data can negate the advantages of fast device computation.

Excessive Synchronization

  • Explanation: With 60 synchronization operations constituting 9.09% of API operations, excessive synchronization may result in idle GPU cycles due to threads waiting for others to reach certain execution points.
  • Impact: Over-synchronization can lead to serialization of parallel tasks, underutilization of GPU resources, and increased execution times, diminishing the potency of concurrent execution capabilities of CUDA.

2. Root Cause Analysis for Each Bottleneck

Memory Transfer Overhead

  • Root Causes:
    - Use of pageable (unlocked) host memory, which is slower than pinned memory for transfer operations.
    - Frequent small transfers instead of fewer batched transactions.
    - Inefficient data management strategies causing frequent data transfers between the host and the GPU.

Excessive Synchronization

  • Root Causes:
    - Over-reliance on synchronization functions like cudaDeviceSynchronize(), resulting in unnecessary wait times.
    - Lack of parallelism due to improper usage of CUDA streams, leading to sequential execution of tasks that could otherwise be processed concurrently.
    - Algorithm design that inherently requires high synchronization, limiting performance improvements from using a GPU.

3. Prioritized Recommendations for Addressing Each Bottleneck

Memory Transfer Overhead

  1. Use Pinned Memory: Convert pageable host memory to pinned memory to increase data transfer rates between host and GPU.
  2. Batch Data Transfers: Minimize overhead by combining smaller data transfers into larger batches, reducing the number of transfer operations.
  3. Retain Data on GPU: Whenever possible, perform more operations directly on the GPU to minimize round trips of data between host and device.

Excessive Synchronization

  1. Optimize Use of CUDA Streams: Employ multiple CUDA streams to facilitate asynchronous execution of operations, thus reducing dependency on synchronization barriers.
  2. Reduce Synchronization Points: Analyze and minimize the use of unnecessary synchronization calls to preserve task parallelism and enhance performance.
  3. Algorithm Redesign: Consider revisiting algorithms to better exploit GPU parallelism and minimize inherent dependencies which necessitate synchronization.

4. Potential Performance Gains from Implementing the Recommendations

  • Expected Gains from Reducing Memory Transfer Overhead: By implementing pinned memory and batching, data transfer times could be reduced by up to 50%, significantly increasing overall program throughput and efficiency.
  • Expected Gains from Addressing Excessive Synchronization: Optimizing synchronization could potentially lead to a reduction in GPU idle times by about 30-50%, yielding substantial performance improvements by better utilizing available computational resources.

Implementing these recommendations can lead to more efficient GPU utilization, reducing execution time, and achieving greater performance acceleration from CUDA computing.
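A minimal sketch of the pinned-memory and batching recommendations follows (buffer names and sizes are assumptions): page-locked host buffers transfer faster and are a prerequisite for truly asynchronous copies, and one large copy replaces many small ones.

```cpp
// Sketch (assumed names/sizes): pinned host memory plus a single batched transfer.
#include <cuda_runtime.h>

int main() {
    const size_t bytes = 4096 * 256 * sizeof(float);   // one large buffer instead of many small ones
    float *h_in, *d_in;

    cudaMallocHost(&h_in, bytes);        // page-locked host memory instead of plain malloc()
    cudaMalloc(&d_in, bytes);

    cudaMemcpy(d_in, h_in, bytes, cudaMemcpyHostToDevice);   // one batched copy

    cudaFree(d_in);
    cudaFreeHost(h_in);                  // pinned memory is released with cudaFreeHost
    return 0;
}
```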

| Bottleneck                | Metric                     | Severity | Recommendation                                                       |
|---------------------------|----------------------------|----------|----------------------------------------------------------------------|
| Memory Transfer Overhead  | 27.27% of operations       | Medium   | Use pinned memory, batch transfers, or keep data on GPU longer       |
| Excessive Synchronization | 60 sync operations (9.09%) | Medium   | Reduce synchronization points, use multiple streams for parallelism  |
[Image: API Call Timeline]

Optimization Recommendations


To provide detailed optimization recommendations based on CUDA trace analysis, I will make some assumptions about potential findings from a typical CUDA trace analysis. These assumptions include issues like kernel execution inefficiencies, memory bottlenecks, and underutilization of GPU resources. With these in mind, here are detailed recommendations across code-level optimizations, architectural changes, and leveraging CUDA features, prioritized by expected impact:

1. Code-Level Optimizations

a. Kernel Execution

Issue: Kernel execution time is high due to inefficient code.
  • Recommendation: Optimize kernel code by minimizing divergence. For instance, avoid branches within warps where possible. Use predicated execution or warp-synchronous programming techniques.
  • Example: Use shared memory effectively by loading data into shared memory once and reusing it across multiple threads within a block. This reduces global memory access latency.
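To make the shared-memory suggestion concrete, here is a small generic sketch (not taken from the traced application; the kernel and buffer names are assumptions): each block loads its inputs into shared memory once and then reuses them for a block-level sum.

```cpp
// Sketch (assumed names): stage data in shared memory once, then reuse it within the block.
__global__ void block_sum(const float* in, float* out, int n) {
    extern __shared__ float tile[];                    // sized at launch: blockDim.x floats
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    tile[threadIdx.x] = (i < n) ? in[i] : 0.0f;        // one global read per thread
    __syncthreads();

    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (threadIdx.x < stride)
            tile[threadIdx.x] += tile[threadIdx.x + stride];   // reuse data already on-chip
        __syncthreads();
    }
    if (threadIdx.x == 0) out[blockIdx.x] = tile[0];   // one partial sum per block
}
// Launch sketch: block_sum<<<grid, 256, 256 * sizeof(float)>>>(d_in, d_partial, n);
```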

b. Memory Access Patterns

Issue: Non-coalesced memory accesses leading to increased latency.
  • Recommendation: Ensure memory accesses are coalesced by aligning data accesses such that threads within a warp access sequential memory locations.
  • Example: If dealing with structures, consider using Structure of Arrays (SoA) instead of Array of Structures (AoS) to ensure coalesced and efficient memory access.
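The AoS-versus-SoA point can be sketched as follows (the particle types are purely illustrative, not from the sample program): with SoA, consecutive threads in a warp read consecutive floats, so the accesses coalesce into a small number of memory transactions.

```cpp
// Sketch (illustrative types): SoA keeps each field contiguous, so warp accesses coalesce.
struct ParticlesAoS { float x, y, z; };   // AoS: p[i].x values are strided in memory

struct ParticlesSoA {                     // SoA: all x values are contiguous
    float* x;
    float* y;
    float* z;
};

__global__ void scale_x(ParticlesSoA p, float s, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) p.x[i] *= s;               // thread i reads p.x[i]: fully coalesced
}
```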

c. Instruction Throughput

Issue: Low instruction throughput.
  • Recommendation: Utilize intrinsic functions specific to CUDA like __sinf, __expf for trigonometric or exponential functions to increase math operation throughput.
  • Example: Replace standard math functions in your kernel with their CUDA intrinsic counterparts where precision is acceptable.
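A minimal sketch of the intrinsic-function suggestion (the kernel and data names are assumptions): __expf trades a small amount of precision for substantially higher throughput than expf.

```cpp
// Sketch (assumed names): use the fast-math intrinsic where reduced precision is acceptable.
__global__ void activate(float* v, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        v[i] = 1.0f / (1.0f + __expf(-v[i]));   // __expf instead of expf
}
```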

2. Architectural Changes

a. Grid and Block Configuration

Issue: Suboptimal grid and block configuration leading to low occupancy.
  • Recommendation: Adjust the block size to maximize occupancy. Use the CUDA Occupancy Calculator to find optimal block sizes that maximize the number of active warps per multiprocessor.
  • Example: If the current block size is not a multiple of the warp size (32), try adjusting it to be a power of two within the constraints of your code.

b. Memory Hierarchy Utilization

Issue: Underutilization of shared memory and cache.
  • Recommendation: Use shared memory to cache repetitive global memory reads. Take advantage of L1 and L2 caches by optimizing data reuse patterns.
  • Example: For computational kernels with repeated data access patterns, optimize the data layout to enhance cache locality.

3. Alternative Approaches or CUDA Features

a. Asynchronous Execution

Issue: Sequential execution of memory transfers and kernel executions.
  • Recommendation: Leverage CUDA streams to overlap computation with memory transfers. Use cudaMemcpyAsync to perform asynchronous data transfers between host and device.
  • Example: Instead of waiting for memory transfer to complete before launching a kernel, use different streams to overlap these operations.

b. Unified Memory

Issue: Complex data management between host and device.
  • Recommendation: Consider using Unified Memory to simplify data management, especially if the application involves complex memory allocation and deallocation patterns.
  • Example: Using cudaMallocManaged allows the system to automatically manage memory residency, although this may not provide the best performance in every case.
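A minimal sketch of the Unified Memory suggestion (the sizes and the commented launch assume the sample program's vector_add; they are not taken from the trace): cudaMallocManaged returns a single pointer usable from both host and device, so the explicit cudaMemcpy calls disappear.

```cpp
// Sketch (assumed sizes): managed allocations remove the explicit host<->device copies.
#include <cuda_runtime.h>

int main() {
    const int n = 1 << 20;
    float *a, *b, *c;
    cudaMallocManaged(&a, n * sizeof(float));      // accessible from CPU and GPU
    cudaMallocManaged(&b, n * sizeof(float));
    cudaMallocManaged(&c, n * sizeof(float));

    for (int i = 0; i < n; ++i) { a[i] = 1.0f; b[i] = 2.0f; }   // host writes directly

    // vector_add<<<(n + 255) / 256, 256>>>(a, b, c, n);  // sample kernel, launched without copies
    cudaDeviceSynchronize();                       // wait before the host reads c

    cudaFree(a); cudaFree(b); cudaFree(c);
    return 0;
}
```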

4. Prioritization of Recommendations

  1. Memory Access Patterns: Ensuring coalesced access usually provides immediate and significant benefits.
  2. Grid and Block Configuration: Properly configuring these can significantly impact occupancy and thus performance.
  3. Kernel Execution: Reducing divergence and using efficient math operations can yield noticeable improvements.
  4. Asynchronous Execution: Overlapping data transfer and execution increases pipeline efficiency.
  5. Unified Memory: Provides ease of use, though hardware limitations might dictate otherwise.

These recommendations assume the presence of specific issues that are common in CUDA trace analysis. Adjustments may be necessary based on the unique results of your trace data. If you have specific details about your trace findings like kernel names, memory transfer times, occupancy rates, etc., feel free to share them for more tailored advice.

Generated on: 2025-04-23 23:49:28
+ + + \ No newline at end of file diff --git a/llm-sample-results/analysis/kernel_name_distribution.png b/llm-sample-results/analysis/kernel_name_distribution.png new file mode 100644 index 0000000..dad7285 Binary files /dev/null and b/llm-sample-results/analysis/kernel_name_distribution.png differ diff --git a/llm-sample-results/analysis/llm_analysis/llm_analysis_api_distribution_20250423_234826.md b/llm-sample-results/analysis/llm_analysis/llm_analysis_api_distribution_20250423_234826.md new file mode 100644 index 0000000..1b71fc7 --- /dev/null +++ b/llm-sample-results/analysis/llm_analysis/llm_analysis_api_distribution_20250423_234826.md @@ -0,0 +1,42 @@ +Certainly! Here's a detailed analysis of the CUDA API distribution based on the provided data: + +### 1. Most Frequently Used CUDA API Functions + +- **cudaLaunchKernel (45.45%)**: This is the most frequently used API call, indicating that the application is heavily focused on executing GPU kernels. This suggests that the application is computation-intensive and utilizes GPU acceleration effectively to execute parallel operations. + +- **cudaMemcpy (18.18%)**: This indicates significant data transfer between the host and device. High usage may suggest repeated data movement, which could become a performance bottleneck if not optimized. + +### 2. Balance Between Different Types of Operations + +- **Compute (cudaLaunchKernel)**: Dominates the API distribution, showing the application’s reliance on GPU computation. + +- **Memory Operations (cudaMemcpy, cudaMemcpyAsync, cudaMalloc, cudaFree)**: + - **cudaMemcpy and cudaMemcpyAsync (27.27% combined)**: Memory transfers are substantial but not overwhelming, indicating a reasonable balance in data management. + - **cudaMalloc and cudaFree (18.18% combined)**: Frequent memory allocation and deallocation could indicate potential inefficiencies if allocations are too dynamic. + +- **Synchronization (cudaStreamSynchronize - 9.09%)**: This suggests some level of synchronization is needed, but it isn't excessive, which generally is a good sign as excessive synchronization can hinder performance. + +### 3. Unusual or Inefficient API Usage Patterns + +- **Frequent cudaMalloc and cudaFree**: If these calls are repeated many times in a loop, it may indicate inefficiency in memory management. Allocating and deallocating memory in tight loops can significantly reduce performance. + +- **High Usage of cudaMemcpy**: Could be a potential area for optimization, such as ensuring maximum data transfer size per call or overlapping data transfers with computation. + +### 4. Recommendations for Optimizing API Usage + +- **Optimize Memory Transfers**: + - Use asynchronous memory copies (`cudaMemcpyAsync`) more extensively to overlap data transfer and kernel execution. + - Batch data transfers or increase data granularity to reduce the number of transfer operations. + +- **Improve Memory Management**: + - Reduce frequent calls to `cudaMalloc` and `cudaFree` by reusing allocated memory wherever possible. + - Consider using memory pools or pre-allocating buffer spaces. + +- **Kernel Optimization**: + - Ensure that there is no significant idle time between kernel executions. + - Profile kernels to find any computation bottlenecks. + +- **Reduce Synchronization Overhead**: + - Minimize the use of `cudaStreamSynchronize` by managing dependencies and using streams effectively to overlap operations. + +By addressing these areas, the application can improve its overall execution efficiency on the GPU. 
\ No newline at end of file diff --git a/llm-sample-results/analysis/llm_analysis/llm_analysis_combined_20250423_234928.md b/llm-sample-results/analysis/llm_analysis/llm_analysis_combined_20250423_234928.md new file mode 100644 index 0000000..1b92ab7 --- /dev/null +++ b/llm-sample-results/analysis/llm_analysis/llm_analysis_combined_20250423_234928.md @@ -0,0 +1,294 @@ +# CUDA Trace Analysis - Combined Report + +## Table of Contents + +- [Overview](#overview) +- [Api Distribution](#api-distribution) +- [Memory Operations](#memory-operations) +- [Kernel Launches](#kernel-launches) +- [Performance Bottlenecks](#performance-bottlenecks) +- [Optimization Recommendations](#optimization-recommendations) + +## Overview + + +### 1. Main Characteristics of the Application + +The application utilizes a limited set of CUDA API functions, with six unique functions tracked throughout the trace period. The predominant activity involves kernel execution and memory operations, indicating a focus on computing tasks. The consistent use of a single kernel, "vector_add," suggests the application is specialized in a particular type of computation, likely vector addition or similar operations. + +### 2. Significant Patterns Observed + +- **API Distribution**: The API call distribution shows that `cudaLaunchKernel` is the most frequently used API, accounting for 45.5% of total calls. This highlights intensive kernel activity. +- **Memory Operations**: Frequent `cudaMemcpy` calls suggest significant host-device memory transfers. +- **Temporal Analysis**: There is a consistent distribution of CUDA API calls over time, with no significant spikes, implying stable performance without notable bottlenecks. + +### 3. Key Performance Considerations + +- **Synchronization**: The application has a high synchronization frequency (60 operations over the trace period), a potential area for optimization by reducing unnecessary synchronization points. +- **Memory-Launch Ratio**: The memory copy to kernel launch ratio is 0.60, which indicates a healthy balance between data transfers and computation. +- **Launch Configuration**: The kernel uses a common grid/block configuration, which may be efficient but could be further optimized based on specific hardware or workload to enhance performance. + +### 4. Assessment of CUDA Implementation Quality + +Overall, the CUDA implementation is effective but could benefit from optimizations. The use of consistent API calls and balanced memory operations are strengths. However, there is room for improvement in synchronization management and possibly the grid/block configuration to better utilize device capabilities and minimize overheads. + +In summary, the application demonstrates a focused use of CUDA capabilities with potential for improved efficiency through targeted optimizations. + +## Api Distribution + +Certainly! Here's a detailed analysis of the CUDA API distribution based on the provided data: + +### 1. Most Frequently Used CUDA API Functions + +- **cudaLaunchKernel (45.45%)**: This is the most frequently used API call, indicating that the application is heavily focused on executing GPU kernels. This suggests that the application is computation-intensive and utilizes GPU acceleration effectively to execute parallel operations. + +- **cudaMemcpy (18.18%)**: This indicates significant data transfer between the host and device. High usage may suggest repeated data movement, which could become a performance bottleneck if not optimized. + +### 2. 
Balance Between Different Types of Operations + +- **Compute (cudaLaunchKernel)**: Dominates the API distribution, showing the application’s reliance on GPU computation. + +- **Memory Operations (cudaMemcpy, cudaMemcpyAsync, cudaMalloc, cudaFree)**: + - **cudaMemcpy and cudaMemcpyAsync (27.27% combined)**: Memory transfers are substantial but not overwhelming, indicating a reasonable balance in data management. + - **cudaMalloc and cudaFree (18.18% combined)**: Frequent memory allocation and deallocation could indicate potential inefficiencies if allocations are too dynamic. + +- **Synchronization (cudaStreamSynchronize - 9.09%)**: This suggests some level of synchronization is needed, but it isn't excessive, which generally is a good sign as excessive synchronization can hinder performance. + +### 3. Unusual or Inefficient API Usage Patterns + +- **Frequent cudaMalloc and cudaFree**: If these calls are repeated many times in a loop, it may indicate inefficiency in memory management. Allocating and deallocating memory in tight loops can significantly reduce performance. + +- **High Usage of cudaMemcpy**: Could be a potential area for optimization, such as ensuring maximum data transfer size per call or overlapping data transfers with computation. + +### 4. Recommendations for Optimizing API Usage + +- **Optimize Memory Transfers**: + - Use asynchronous memory copies (`cudaMemcpyAsync`) more extensively to overlap data transfer and kernel execution. + - Batch data transfers or increase data granularity to reduce the number of transfer operations. + +- **Improve Memory Management**: + - Reduce frequent calls to `cudaMalloc` and `cudaFree` by reusing allocated memory wherever possible. + - Consider using memory pools or pre-allocating buffer spaces. + +- **Kernel Optimization**: + - Ensure that there is no significant idle time between kernel executions. + - Profile kernels to find any computation bottlenecks. + +- **Reduce Synchronization Overhead**: + - Minimize the use of `cudaStreamSynchronize` by managing dependencies and using streams effectively to overlap operations. + +By addressing these areas, the application can improve its overall execution efficiency on the GPU. + +## Memory Operations + + +### 1. Assessment of Memory Transfer Patterns and Their Efficiency + +The data suggests that `cudaMemcpy` operations account for 40% of memory operations, while `cudaMemcpyAsync` comprises 20%. This indicates a heavy reliance on synchronous memory transfers, which can be less efficient as they may block the host thread until the copy is complete. + +#### Efficiency Analysis: +- **Synchronous Transfers (`cudaMemcpy`)**: Generally slower due to blocking behavior. +- **Asynchronous Transfers (`cudaMemcpyAsync`)**: More efficient when managed correctly as they do not block the host, allowing for overlap of computation and data transfer. + +### 2. Analysis of the Balance Between Different Types of Memory Operations + +All four types of operations (`cudaMemcpy`, `cudaMemcpyAsync`, `cudaMalloc`, and `cudaFree`) are represented, but there is a notable imbalance with a high proportion of `cudaMemcpy`. Allocation and deallocation (`cudaMalloc` and `cudaFree`) operations are equally distributed at 20% each. + +The data skew towards `cudaMemcpy` might suggest missed opportunities for optimization using asynchronous transfers. + +### 3. 
Identification of Potential Memory-Related Bottlenecks + +- **Potential Bottleneck**: The high percentage of synchronous memory transfers suggests potential underutilization of the GPU’s ability to handle concurrent operations. +- **Allocation and Deallocation**: Frequent and possibly unnecessary calls to `cudaMalloc` and `cudaFree` can also cause performance hits. These should be minimized and reused when possible. + +### 4. Recommendations for Optimizing Memory Usage and Transfers + +1. **Increase Asynchronous Transfers**: Consider increasing the use of `cudaMemcpyAsync` to enable overlapping of memory transfer and computation. Utilize streams effectively to manage these operations without blocking the CPU. + +2. **Optimize Memory Allocation**: + - Reuse memory allocations wherever possible instead of frequent malloc and free calls. + - Consider using memory pools to manage small allocations which can reduce overhead. + +3. **Streamlining the Memory Transfer**: + - Batch smaller data transfers into fewer, larger transfers to reduce the number of `cudaMemcpy` calls. + - Ensure data alignment and coalesced access patterns to optimize bandwidth usage during transfers. + +4. **Profile and Monitor**: + - Regularly profile the application to identify specific points of inefficiency. + - Use CUDA profilers to monitor memory usage, transfer times, and kernel execution overlaps. + +By implementing these recommendations, you can potentially improve throughput and reduce latency in your CUDA applications. + +## Kernel Launches + + +### 1. Assessment of Kernel Launch Patterns and Their Implications for Performance + +The kernel launch data shows that there is only one type of kernel, `vector_add`, being launched 300 times, making it a highly repetitive workload. This indicates that the application is computationally uniform, focusing intensely on vector addition. This uniformity might benefit from optimization to improve throughput and resource utilization. + +The repetitive nature can lead to bottlenecks if this kernel doesn't fully utilize the GPU's capabilities. + +### 2. Analysis of Grid and Block Dimensions + +**Grid Dimensions:** +- `grid_x` is consistently set at 4096, while `grid_y` and `grid_z` have a constant value of 1. This configuration implies that the computation is primarily one-dimensional, with a vast number of elements needing processing. + +**Block Dimensions:** +- `block_x` is always 256, indicating that each block processes 256 threads. The choice of 256 is often optimal as it's a multiple of the warp size (32 on most NVIDIA GPUs), allowing for more efficient execution. +- `block_y` and `block_z` are set to 1, reinforcing that the computation is handled in a one-dimensional array. + +### 3. Evaluation of Kernel Occupancy and Efficiency + +Kernel occupancy refers to how well the GPU's resources (especially warps) are utilized: +- With blocks of size 256 and grids of 4096, the resource utilization could be high if the GPU can handle this many threads per multiprocessor. However, without specific GPU details (e.g., SM count or available registers), precise occupancy cannot be calculated. +- High occupancy is desirable but must be balanced against register usage and shared memory. + +### 4. Recommendations for Optimizing Kernel Launch Configurations + +- **Diversify Workload:** If possible, consider diversifying computational tasks to balance load and better utilize GPU resources. 
+ +- **Experiment with Block Size:** Although 256 is often optimal, experimenting with different block sizes (e.g., 128, 512) might yield performance improvements on various architectures. + +- **Evaluate GPU Occupancy:** Use tools like NVIDIA Nsight Compute to analyze actual occupancy and resource usage, which can guide whether grid/block dimensions are optimal. + +- **Memory Coalescing:** Ensure that memory accesses are coalesced for `vector_add`, which can significantly impact performance. + +- **Consider Multi-Stream Execution:** If execution time is a concern, utilizing multiple CUDA streams could help in overlapping computation and data transfer. + +By understanding and tuning these parameters, performance improvements can be realized, especially when considering architectural specifics of the used GPU hardware. + +## Performance Bottlenecks + +To effectively address the performance bottlenecks identified in your CUDA trace data, the analysis should cover explanations, causes, and potential solutions. Below is a detailed examination based on the provided table and context: + +### 1. Detailed Explanation of Each Identified Bottleneck and Its Impact on Performance + +#### Memory Transfer Overhead + +- **Explanation**: The memory transfer overhead indicates significant time spent moving data between the host and device memory. At 27.27% of operations, this overhead can considerably affect overall performance by lengthening execution times. + +- **Impact**: High overhead in data transfer can limit the speedup gained from parallel processing on the GPU. This reduces the potential performance benefits of using CUDA, as time spent moving data can negate the advantages of fast device computation. + +#### Excessive Synchronization + +- **Explanation**: With 60 synchronization operations constituting 9.09% of API operations, excessive synchronization may result in idle GPU cycles due to threads waiting for others to reach certain execution points. + +- **Impact**: Over-synchronization can lead to serialization of parallel tasks, underutilization of GPU resources, and increased execution times, diminishing the potency of concurrent execution capabilities of CUDA. + +### 2. Root Cause Analysis for Each Bottleneck + +#### Memory Transfer Overhead + +- **Root Causes**: + - Use of pageable (unlocked) host memory, which is slower than pinned memory for transfer operations. + - Frequent small transfers instead of fewer batched transactions. + - Inefficient data management strategies causing frequent data transfers between the host and the GPU. + +#### Excessive Synchronization + +- **Root Causes**: + - Over-reliance on synchronization functions like `cudaDeviceSynchronize()`, resulting in unnecessary wait times. + - Lack of parallelism due to improper usage of CUDA streams, leading to sequential execution of tasks that could otherwise be processed concurrently. + - Algorithm design that inherently requires high synchronization, limiting performance improvements from using a GPU. + +### 3. Prioritized Recommendations for Addressing Each Bottleneck + +#### Memory Transfer Overhead + +1. **Use Pinned Memory**: + - Convert pageable host memory to pinned memory to increase data transfer rates between host and GPU. + +2. **Batch Data Transfers**: + - Minimize overhead by combining smaller data transfers into larger batches, reducing the number of transfer operations. + +3. 
**Retain Data on GPU**: + - Whenever possible, perform more operations directly on the GPU to minimize round trips of data between host and device. + +#### Excessive Synchronization + +1. **Optimize Use of CUDA Streams**: + - Employ multiple CUDA streams to facilitate asynchronous execution of operations, thus reducing dependency on synchronization barriers. + +2. **Reduce Synchronization Points**: + - Analyze and minimize the use of unnecessary synchronization calls to preserve task parallelism and enhance performance. + +3. **Algorithm Redesign**: + - Consider revisiting algorithms to better exploit GPU parallelism and minimize inherent dependencies which necessitate synchronization. + +### 4. Potential Performance Gains from Implementing the Recommendations + +- **Expected Gains from Reducing Memory Transfer Overhead**: + - By implementing pinned memory and batching, data transfer times could be reduced by up to 50%, significantly increasing overall program throughput and efficiency. + +- **Expected Gains from Addressing Excessive Synchronization**: + - Optimizing synchronization could potentially lead to a reduction in GPU idle times by about 30-50%, yielding substantial performance improvements by better utilizing available computational resources. + +Implementing these recommendations can lead to more efficient GPU utilization, reducing execution time, and achieving greater performance acceleration from CUDA computing. + +## Optimization Recommendations + +To provide detailed optimization recommendations based on CUDA trace analysis, I will make some assumptions about potential findings from a typical CUDA trace analysis. These assumptions include issues like kernel execution inefficiencies, memory bottlenecks, and underutilization of GPU resources. With these in mind, here are detailed recommendations across code-level optimizations, architectural changes, and leveraging CUDA features, prioritized by expected impact: + +### 1. Code-Level Optimizations + +#### a. Kernel Execution + +**Issue:** Kernel execution time is high due to inefficient code. +- **Recommendation:** Optimize kernel code by minimizing divergence. For instance, avoid branches within warps where possible. Use predicated execution or warp-synchronous programming techniques. +- **Example:** Use shared memory effectively by loading data into shared memory once and reusing it across multiple threads within a block. This reduces global memory access latency. + +#### b. Memory Access Patterns + +**Issue:** Non-coalesced memory accesses leading to increased latency. +- **Recommendation:** Ensure memory accesses are coalesced by aligning data accesses such that threads within a warp access sequential memory locations. +- **Example:** If dealing with structures, consider using Structure of Arrays (SoA) instead of Array of Structures (AoS) to ensure coalesced and efficient memory access. + +#### c. Instruction Throughput + +**Issue:** Low instruction throughput. +- **Recommendation:** Utilize intrinsic functions specific to CUDA like `__sinf`, `__expf` for trigonometric or exponential functions to increase math operation throughput. +- **Example:** Replace standard math functions in your kernel with their CUDA intrinsic counterparts where precision is acceptable. + +### 2. Architectural Changes + +#### a. Grid and Block Configuration + +**Issue:** Suboptimal grid and block configuration leading to low occupancy. +- **Recommendation:** Adjust the block size to maximize occupancy. 
Use CUDA Occupancy Calculator to find optimal block sizes that maximize the number of active warps per multiprocessor. +- **Example:** If the current block size is not a multiple of the warp size (32), try adjusting it to be a power of two within the constraints of your code. + +#### b. Memory Hierarchy Utilization + +**Issue:** Underutilization of shared memory and cache. +- **Recommendation:** Use shared memory to cache repetitive global memory reads. Take advantage of L1 and L2 caches by optimizing data reuse patterns. +- **Example:** For computational kernels with repeated data access patterns, optimize the data layout to enhance cache locality. + +### 3. Alternative Approaches or CUDA Features + +#### a. Asynchronous Execution + +**Issue:** Sequential execution of memory transfers and kernel executions. +- **Recommendation:** Leverage CUDA streams to overlap computation with memory transfers. Use `cudaMemcpyAsync` to perform asynchronous data transfers between host and device. +- **Example:** Instead of waiting for memory transfer to complete before launching a kernel, use different streams to overlap these operations. + +#### b. Unified Memory + +**Issue:** Complex data management between host and device. +- **Recommendation:** Consider using Unified Memory to simplify data management, especially if the application involves complex memory allocation and deallocation patterns. +- **Example:** Using `cudaMallocManaged` allows the system to automatically manage memory residency, although this may not provide the best performance in every case. + +### 4. Prioritization of Recommendations + +1. **Memory Access Patterns:** Ensuring coalesced access usually provides immediate and significant benefits. +2. **Grid and Block Configuration:** Properly configuring these can significantly impact occupancy and thus performance. +3. **Kernel Execution:** Reducing divergence and using efficient math operations can yield noticeable improvements. +4. **Asynchronous Execution:** Overlapping data transfer and execution increases pipeline efficiency. +5. **Unified Memory:** Provides ease of use, though hardware limitations might dictate otherwise. + +These recommendations assume the presence of specific issues that are common in CUDA trace analysis. Adjustments may be necessary based on the unique results of your trace data. If you have specific details about your trace findings like kernel names, memory transfer times, occupancy rates, etc., feel free to share them for more tailored advice. + + + +--- + +Generated on: 2025-04-23 23:49:28 diff --git a/llm-sample-results/analysis/llm_analysis/llm_analysis_kernel_launches_20250423_234853.md b/llm-sample-results/analysis/llm_analysis/llm_analysis_kernel_launches_20250423_234853.md new file mode 100644 index 0000000..9b7f1d7 --- /dev/null +++ b/llm-sample-results/analysis/llm_analysis/llm_analysis_kernel_launches_20250423_234853.md @@ -0,0 +1,36 @@ +## Analysis of CUDA Kernel Launch Patterns + +### 1. Assessment of Kernel Launch Patterns and Their Implications for Performance + +The kernel launch data shows that there is only one type of kernel, `vector_add`, being launched 300 times, making it a highly repetitive workload. This indicates that the application is computationally uniform, focusing intensely on vector addition. This uniformity might benefit from optimization to improve throughput and resource utilization. + +The repetitive nature can lead to bottlenecks if this kernel doesn't fully utilize the GPU's capabilities. + +### 2. 
Analysis of Grid and Block Dimensions + +**Grid Dimensions:** +- `grid_x` is consistently set at 4096, while `grid_y` and `grid_z` have a constant value of 1. This configuration implies that the computation is primarily one-dimensional, with a vast number of elements needing processing. + +**Block Dimensions:** +- `block_x` is always 256, indicating that each block processes 256 threads. The choice of 256 is often optimal as it's a multiple of the warp size (32 on most NVIDIA GPUs), allowing for more efficient execution. +- `block_y` and `block_z` are set to 1, reinforcing that the computation is handled in a one-dimensional array. + +### 3. Evaluation of Kernel Occupancy and Efficiency + +Kernel occupancy refers to how well the GPU's resources (especially warps) are utilized: +- With blocks of size 256 and grids of 4096, the resource utilization could be high if the GPU can handle this many threads per multiprocessor. However, without specific GPU details (e.g., SM count or available registers), precise occupancy cannot be calculated. +- High occupancy is desirable but must be balanced against register usage and shared memory. + +### 4. Recommendations for Optimizing Kernel Launch Configurations + +- **Diversify Workload:** If possible, consider diversifying computational tasks to balance load and better utilize GPU resources. + +- **Experiment with Block Size:** Although 256 is often optimal, experimenting with different block sizes (e.g., 128, 512) might yield performance improvements on various architectures. + +- **Evaluate GPU Occupancy:** Use tools like NVIDIA Nsight Compute to analyze actual occupancy and resource usage, which can guide whether grid/block dimensions are optimal. + +- **Memory Coalescing:** Ensure that memory accesses are coalesced for `vector_add`, which can significantly impact performance. + +- **Consider Multi-Stream Execution:** If execution time is a concern, utilizing multiple CUDA streams could help in overlapping computation and data transfer. + +By understanding and tuning these parameters, performance improvements can be realized, especially when considering architectural specifics of the used GPU hardware. \ No newline at end of file diff --git a/llm-sample-results/analysis/llm_analysis/llm_analysis_memory_operations_20250423_234838.md b/llm-sample-results/analysis/llm_analysis/llm_analysis_memory_operations_20250423_234838.md new file mode 100644 index 0000000..8227708 --- /dev/null +++ b/llm-sample-results/analysis/llm_analysis/llm_analysis_memory_operations_20250423_234838.md @@ -0,0 +1,38 @@ +## Analysis of CUDA Memory Operations + +### 1. Assessment of Memory Transfer Patterns and Their Efficiency + +The data suggests that `cudaMemcpy` operations account for 40% of memory operations, while `cudaMemcpyAsync` comprises 20%. This indicates a heavy reliance on synchronous memory transfers, which can be less efficient as they may block the host thread until the copy is complete. + +#### Efficiency Analysis: +- **Synchronous Transfers (`cudaMemcpy`)**: Generally slower due to blocking behavior. +- **Asynchronous Transfers (`cudaMemcpyAsync`)**: More efficient when managed correctly as they do not block the host, allowing for overlap of computation and data transfer. + +### 2. Analysis of the Balance Between Different Types of Memory Operations + +All four types of operations (`cudaMemcpy`, `cudaMemcpyAsync`, `cudaMalloc`, and `cudaFree`) are represented, but there is a notable imbalance with a high proportion of `cudaMemcpy`. 
Allocation and deallocation (`cudaMalloc` and `cudaFree`) operations are equally distributed at 20% each. + +The data skew towards `cudaMemcpy` might suggest missed opportunities for optimization using asynchronous transfers. + +### 3. Identification of Potential Memory-Related Bottlenecks + +- **Potential Bottleneck**: The high percentage of synchronous memory transfers suggests potential underutilization of the GPU’s ability to handle concurrent operations. +- **Allocation and Deallocation**: Frequent and possibly unnecessary calls to `cudaMalloc` and `cudaFree` can also cause performance hits. These should be minimized and reused when possible. + +### 4. Recommendations for Optimizing Memory Usage and Transfers + +1. **Increase Asynchronous Transfers**: Consider increasing the use of `cudaMemcpyAsync` to enable overlapping of memory transfer and computation. Utilize streams effectively to manage these operations without blocking the CPU. + +2. **Optimize Memory Allocation**: + - Reuse memory allocations wherever possible instead of frequent malloc and free calls. + - Consider using memory pools to manage small allocations which can reduce overhead. + +3. **Streamlining the Memory Transfer**: + - Batch smaller data transfers into fewer, larger transfers to reduce the number of `cudaMemcpy` calls. + - Ensure data alignment and coalesced access patterns to optimize bandwidth usage during transfers. + +4. **Profile and Monitor**: + - Regularly profile the application to identify specific points of inefficiency. + - Use CUDA profilers to monitor memory usage, transfer times, and kernel execution overlaps. + +By implementing these recommendations, you can potentially improve throughput and reduce latency in your CUDA applications. \ No newline at end of file diff --git a/llm-sample-results/analysis/llm_analysis/llm_analysis_optimization_recommendations_20250423_234928.md b/llm-sample-results/analysis/llm_analysis/llm_analysis_optimization_recommendations_20250423_234928.md new file mode 100644 index 0000000..367506b --- /dev/null +++ b/llm-sample-results/analysis/llm_analysis/llm_analysis_optimization_recommendations_20250423_234928.md @@ -0,0 +1,59 @@ +To provide detailed optimization recommendations based on CUDA trace analysis, I will make some assumptions about potential findings from a typical CUDA trace analysis. These assumptions include issues like kernel execution inefficiencies, memory bottlenecks, and underutilization of GPU resources. With these in mind, here are detailed recommendations across code-level optimizations, architectural changes, and leveraging CUDA features, prioritized by expected impact: + +### 1. Code-Level Optimizations + +#### a. Kernel Execution + +**Issue:** Kernel execution time is high due to inefficient code. +- **Recommendation:** Optimize kernel code by minimizing divergence. For instance, avoid branches within warps where possible. Use predicated execution or warp-synchronous programming techniques. +- **Example:** Use shared memory effectively by loading data into shared memory once and reusing it across multiple threads within a block. This reduces global memory access latency. + +#### b. Memory Access Patterns + +**Issue:** Non-coalesced memory accesses leading to increased latency. +- **Recommendation:** Ensure memory accesses are coalesced by aligning data accesses such that threads within a warp access sequential memory locations. 
+- **Example:** If dealing with structures, consider using Structure of Arrays (SoA) instead of Array of Structures (AoS) to ensure coalesced and efficient memory access. + +#### c. Instruction Throughput + +**Issue:** Low instruction throughput. +- **Recommendation:** Utilize intrinsic functions specific to CUDA like `__sinf`, `__expf` for trigonometric or exponential functions to increase math operation throughput. +- **Example:** Replace standard math functions in your kernel with their CUDA intrinsic counterparts where precision is acceptable. + +### 2. Architectural Changes + +#### a. Grid and Block Configuration + +**Issue:** Suboptimal grid and block configuration leading to low occupancy. +- **Recommendation:** Adjust the block size to maximize occupancy. Use CUDA Occupancy Calculator to find optimal block sizes that maximize the number of active warps per multiprocessor. +- **Example:** If the current block size is not a multiple of the warp size (32), try adjusting it to be a power of two within the constraints of your code. + +#### b. Memory Hierarchy Utilization + +**Issue:** Underutilization of shared memory and cache. +- **Recommendation:** Use shared memory to cache repetitive global memory reads. Take advantage of L1 and L2 caches by optimizing data reuse patterns. +- **Example:** For computational kernels with repeated data access patterns, optimize the data layout to enhance cache locality. + +### 3. Alternative Approaches or CUDA Features + +#### a. Asynchronous Execution + +**Issue:** Sequential execution of memory transfers and kernel executions. +- **Recommendation:** Leverage CUDA streams to overlap computation with memory transfers. Use `cudaMemcpyAsync` to perform asynchronous data transfers between host and device. +- **Example:** Instead of waiting for memory transfer to complete before launching a kernel, use different streams to overlap these operations. + +#### b. Unified Memory + +**Issue:** Complex data management between host and device. +- **Recommendation:** Consider using Unified Memory to simplify data management, especially if the application involves complex memory allocation and deallocation patterns. +- **Example:** Using `cudaMallocManaged` allows the system to automatically manage memory residency, although this may not provide the best performance in every case. + +### 4. Prioritization of Recommendations + +1. **Memory Access Patterns:** Ensuring coalesced access usually provides immediate and significant benefits. +2. **Grid and Block Configuration:** Properly configuring these can significantly impact occupancy and thus performance. +3. **Kernel Execution:** Reducing divergence and using efficient math operations can yield noticeable improvements. +4. **Asynchronous Execution:** Overlapping data transfer and execution increases pipeline efficiency. +5. **Unified Memory:** Provides ease of use, though hardware limitations might dictate otherwise. + +These recommendations assume the presence of specific issues that are common in CUDA trace analysis. Adjustments may be necessary based on the unique results of your trace data. If you have specific details about your trace findings like kernel names, memory transfer times, occupancy rates, etc., feel free to share them for more tailored advice. 
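As a concrete illustration of the coalescing recommendation above, the sketch below contrasts an Array-of-Structures layout with a Structure-of-Arrays layout for the same per-element update. The `Particle` type, field names, and problem size are invented for this example, and the buffers are left uninitialized because only the access pattern matters here:

```cpp
#include <cuda_runtime.h>

// AoS: consecutive threads touch memory sizeof(Particle) bytes apart, so
// loads of a single field are strided and poorly coalesced.
struct Particle { float x, y, z, mass; };

__global__ void scale_mass_aos(Particle* p, float k, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) p[i].mass *= k;
}

// SoA: each field lives in its own contiguous array, so consecutive threads
// read consecutive floats and the accesses coalesce into few transactions.
__global__ void scale_mass_soa(float* mass, float k, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) mass[i] *= k;
}

int main() {
    const int n = 1 << 20;
    Particle* d_aos;
    float* d_mass;
    cudaMalloc((void**)&d_aos, n * sizeof(Particle));
    cudaMalloc((void**)&d_mass, n * sizeof(float));

    dim3 block(256), grid((n + block.x - 1) / block.x);
    scale_mass_aos<<<grid, block>>>(d_aos, 1.01f, n);   // strided field access
    scale_mass_soa<<<grid, block>>>(d_mass, 1.01f, n);  // coalesced field access
    cudaDeviceSynchronize();

    cudaFree(d_aos);
    cudaFree(d_mass);
    return 0;
}
```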
\ No newline at end of file diff --git a/llm-sample-results/analysis/llm_analysis/llm_analysis_overview_20250423_234810.md b/llm-sample-results/analysis/llm_analysis/llm_analysis_overview_20250423_234810.md new file mode 100644 index 0000000..44d8873 --- /dev/null +++ b/llm-sample-results/analysis/llm_analysis/llm_analysis_overview_20250423_234810.md @@ -0,0 +1,23 @@ +## Executive Summary of CUDA Trace Data + +### 1. Main Characteristics of the Application + +The application utilizes a limited set of CUDA API functions, with six unique functions tracked throughout the trace period. The predominant activity involves kernel execution and memory operations, indicating a focus on computing tasks. The consistent use of a single kernel, "vector_add," suggests the application is specialized in a particular type of computation, likely vector addition or similar operations. + +### 2. Significant Patterns Observed + +- **API Distribution**: The API call distribution shows that `cudaLaunchKernel` is the most frequently used API, accounting for 45.5% of total calls. This highlights intensive kernel activity. +- **Memory Operations**: Frequent `cudaMemcpy` calls suggest significant host-device memory transfers. +- **Temporal Analysis**: There is a consistent distribution of CUDA API calls over time, with no significant spikes, implying stable performance without notable bottlenecks. + +### 3. Key Performance Considerations + +- **Synchronization**: The application has a high synchronization frequency (60 operations over the trace period), a potential area for optimization by reducing unnecessary synchronization points. +- **Memory-Launch Ratio**: The memory copy to kernel launch ratio is 0.60, which indicates a healthy balance between data transfers and computation. +- **Launch Configuration**: The kernel uses a common grid/block configuration, which may be efficient but could be further optimized based on specific hardware or workload to enhance performance. + +### 4. Assessment of CUDA Implementation Quality + +Overall, the CUDA implementation is effective but could benefit from optimizations. The use of consistent API calls and balanced memory operations are strengths. However, there is room for improvement in synchronization management and possibly the grid/block configuration to better utilize device capabilities and minimize overheads. + +In summary, the application demonstrates a focused use of CUDA capabilities with potential for improved efficiency through targeted optimizations. \ No newline at end of file diff --git a/llm-sample-results/analysis/llm_analysis/llm_analysis_performance_bottlenecks_20250423_234912.md b/llm-sample-results/analysis/llm_analysis/llm_analysis_performance_bottlenecks_20250423_234912.md new file mode 100644 index 0000000..607bf6d --- /dev/null +++ b/llm-sample-results/analysis/llm_analysis/llm_analysis_performance_bottlenecks_20250423_234912.md @@ -0,0 +1,65 @@ +To effectively address the performance bottlenecks identified in your CUDA trace data, the analysis should cover explanations, causes, and potential solutions. Below is a detailed examination based on the provided table and context: + +### 1. Detailed Explanation of Each Identified Bottleneck and Its Impact on Performance + +#### Memory Transfer Overhead + +- **Explanation**: The memory transfer overhead indicates significant time spent moving data between the host and device memory. At 27.27% of operations, this overhead can considerably affect overall performance by lengthening execution times. 
+ +- **Impact**: High overhead in data transfer can limit the speedup gained from parallel processing on the GPU. This reduces the potential performance benefits of using CUDA, as time spent moving data can negate the advantages of fast device computation. + +#### Excessive Synchronization + +- **Explanation**: With 60 synchronization operations constituting 9.09% of API operations, excessive synchronization may result in idle GPU cycles due to threads waiting for others to reach certain execution points. + +- **Impact**: Over-synchronization can lead to serialization of parallel tasks, underutilization of GPU resources, and increased execution times, diminishing the potency of concurrent execution capabilities of CUDA. + +### 2. Root Cause Analysis for Each Bottleneck + +#### Memory Transfer Overhead + +- **Root Causes**: + - Use of pageable (unlocked) host memory, which is slower than pinned memory for transfer operations. + - Frequent small transfers instead of fewer batched transactions. + - Inefficient data management strategies causing frequent data transfers between the host and the GPU. + +#### Excessive Synchronization + +- **Root Causes**: + - Over-reliance on synchronization functions like `cudaDeviceSynchronize()`, resulting in unnecessary wait times. + - Lack of parallelism due to improper usage of CUDA streams, leading to sequential execution of tasks that could otherwise be processed concurrently. + - Algorithm design that inherently requires high synchronization, limiting performance improvements from using a GPU. + +### 3. Prioritized Recommendations for Addressing Each Bottleneck + +#### Memory Transfer Overhead + +1. **Use Pinned Memory**: + - Convert pageable host memory to pinned memory to increase data transfer rates between host and GPU. + +2. **Batch Data Transfers**: + - Minimize overhead by combining smaller data transfers into larger batches, reducing the number of transfer operations. + +3. **Retain Data on GPU**: + - Whenever possible, perform more operations directly on the GPU to minimize round trips of data between host and device. + +#### Excessive Synchronization + +1. **Optimize Use of CUDA Streams**: + - Employ multiple CUDA streams to facilitate asynchronous execution of operations, thus reducing dependency on synchronization barriers. + +2. **Reduce Synchronization Points**: + - Analyze and minimize the use of unnecessary synchronization calls to preserve task parallelism and enhance performance. + +3. **Algorithm Redesign**: + - Consider revisiting algorithms to better exploit GPU parallelism and minimize inherent dependencies which necessitate synchronization. + +### 4. Potential Performance Gains from Implementing the Recommendations + +- **Expected Gains from Reducing Memory Transfer Overhead**: + - By implementing pinned memory and batching, data transfer times could be reduced by up to 50%, significantly increasing overall program throughput and efficiency. + +- **Expected Gains from Addressing Excessive Synchronization**: + - Optimizing synchronization could potentially lead to a reduction in GPU idle times by about 30-50%, yielding substantial performance improvements by better utilizing available computational resources. + +Implementing these recommendations can lead to more efficient GPU utilization, reducing execution time, and achieving greater performance acceleration from CUDA computing. 
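The advice to reuse allocations and trim synchronization points can be shown as a small structural sketch. The loop body below is hypothetical (it does not reproduce the real test program); it only demonstrates hoisting `cudaMalloc`/`cudaFree` out of the iteration loop and relying on in-stream ordering instead of a synchronization call per iteration:

```cpp
#include <cuda_runtime.h>

__global__ void step(float* data, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] += 1.0f;       // stand-in for real per-iteration work
}

void run_iterations(float* h_data, int n, int iters) {
    size_t bytes = n * sizeof(float);
    float* d_data;
    cudaMalloc((void**)&d_data, bytes);               // allocate once, not per iteration
    cudaMemcpy(d_data, h_data, bytes, cudaMemcpyHostToDevice);

    dim3 block(256), grid((n + block.x - 1) / block.x);
    for (int it = 0; it < iters; ++it) {
        // No per-iteration cudaMalloc/cudaFree and no cudaDeviceSynchronize:
        // kernels launched into the same stream already execute in order.
        step<<<grid, block>>>(d_data, n);
    }

    cudaMemcpy(h_data, d_data, bytes, cudaMemcpyDeviceToHost);  // implicit sync point
    cudaFree(d_data);                                 // free once at the end
}
```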
\ No newline at end of file diff --git a/llm-sample-results/analysis/memory_operations.png b/llm-sample-results/analysis/memory_operations.png new file mode 100644 index 0000000..db73725 Binary files /dev/null and b/llm-sample-results/analysis/memory_operations.png differ diff --git a/llm-sample-results/analysis/memory_operations_timeline.png b/llm-sample-results/analysis/memory_operations_timeline.png new file mode 100644 index 0000000..ab7f0a6 Binary files /dev/null and b/llm-sample-results/analysis/memory_operations_timeline.png differ diff --git a/llm-sample-results/enhanced/cuda_trace_dashboard.png b/llm-sample-results/enhanced/cuda_trace_dashboard.png new file mode 100644 index 0000000..3bf9320 Binary files /dev/null and b/llm-sample-results/enhanced/cuda_trace_dashboard.png differ diff --git a/llm-sample-results/enhanced/enhanced_api_call_timeline.png b/llm-sample-results/enhanced/enhanced_api_call_timeline.png new file mode 100644 index 0000000..75f2162 Binary files /dev/null and b/llm-sample-results/enhanced/enhanced_api_call_timeline.png differ diff --git a/llm-sample-results/enhanced/enhanced_api_distribution.png b/llm-sample-results/enhanced/enhanced_api_distribution.png new file mode 100644 index 0000000..43bf31d Binary files /dev/null and b/llm-sample-results/enhanced/enhanced_api_distribution.png differ diff --git a/llm-sample-results/enhanced/enhanced_api_time_heatmap.png b/llm-sample-results/enhanced/enhanced_api_time_heatmap.png new file mode 100644 index 0000000..b7633eb Binary files /dev/null and b/llm-sample-results/enhanced/enhanced_api_time_heatmap.png differ diff --git a/llm-sample-results/enhanced/enhanced_block_dimensions.png b/llm-sample-results/enhanced/enhanced_block_dimensions.png new file mode 100644 index 0000000..ac92538 Binary files /dev/null and b/llm-sample-results/enhanced/enhanced_block_dimensions.png differ diff --git a/llm-sample-results/enhanced/enhanced_call_site_api_heatmap.png b/llm-sample-results/enhanced/enhanced_call_site_api_heatmap.png new file mode 100644 index 0000000..65cfadb Binary files /dev/null and b/llm-sample-results/enhanced/enhanced_call_site_api_heatmap.png differ diff --git a/llm-sample-results/enhanced/enhanced_call_site_distribution.png b/llm-sample-results/enhanced/enhanced_call_site_distribution.png new file mode 100644 index 0000000..74aa202 Binary files /dev/null and b/llm-sample-results/enhanced/enhanced_call_site_distribution.png differ diff --git a/llm-sample-results/enhanced/enhanced_grid_dimensions.png b/llm-sample-results/enhanced/enhanced_grid_dimensions.png new file mode 100644 index 0000000..9e49964 Binary files /dev/null and b/llm-sample-results/enhanced/enhanced_grid_dimensions.png differ diff --git a/llm-sample-results/enhanced/enhanced_kernel_name_distribution.png b/llm-sample-results/enhanced/enhanced_kernel_name_distribution.png new file mode 100644 index 0000000..7b3df92 Binary files /dev/null and b/llm-sample-results/enhanced/enhanced_kernel_name_distribution.png differ diff --git a/llm-sample-results/enhanced/enhanced_memory_operations.png b/llm-sample-results/enhanced/enhanced_memory_operations.png new file mode 100644 index 0000000..1e26afb Binary files /dev/null and b/llm-sample-results/enhanced/enhanced_memory_operations.png differ diff --git a/llm-sample-results/enhanced/enhanced_memory_operations_timeline.png b/llm-sample-results/enhanced/enhanced_memory_operations_timeline.png new file mode 100644 index 0000000..d1c748d Binary files /dev/null and 
b/llm-sample-results/enhanced/enhanced_memory_operations_timeline.png differ diff --git a/llm-sample-results/enhanced/html_report/cuda_trace_analysis_report_20250423_232202.html b/llm-sample-results/enhanced/html_report/cuda_trace_analysis_report_20250423_232202.html new file mode 100644 index 0000000..3955446 --- /dev/null +++ b/llm-sample-results/enhanced/html_report/cuda_trace_analysis_report_20250423_232202.html @@ -0,0 +1,239 @@ + + + + + CUDA Trace Analysis Report + + + +
CUDA Trace Analysis Report

Summary
- Total trace entries: 660
- Unique API functions: 6
- Unique kernels: 1
- Trace duration: 28.0 seconds

Dashboard: cuda_trace_dashboard.png

Individual visualizations: api_distribution.png, call_site_distribution.png, api_time_heatmap.png, block_dimensions.png, api_call_timeline.png, grid_dimensions.png, memory_operations_timeline.png, kernel_name_distribution.png, call_site_api_heatmap.png, memory_operations.png

Enhanced visualizations: enhanced_call_site_api_heatmap.png, enhanced_kernel_name_distribution.png, enhanced_memory_operations.png, enhanced_memory_operations_timeline.png, enhanced_grid_dimensions.png, enhanced_api_distribution.png, enhanced_api_call_timeline.png, enhanced_block_dimensions.png, enhanced_call_site_distribution.png, enhanced_api_time_heatmap.png

API Distribution

| API Function          | Count |
|-----------------------|-------|
| cudaLaunchKernel      | 300   |
| cudaMemcpy            | 120   |
| cudaMemcpyAsync       | 60    |
| cudaStreamSynchronize | 60    |
| cudaMalloc            | 60    |
| cudaFree              | 60    |
+ + + \ No newline at end of file diff --git a/llm-sample-results/enhanced/html_report/cuda_trace_analysis_report_20250423_234800.html b/llm-sample-results/enhanced/html_report/cuda_trace_analysis_report_20250423_234800.html new file mode 100644 index 0000000..3955446 --- /dev/null +++ b/llm-sample-results/enhanced/html_report/cuda_trace_analysis_report_20250423_234800.html @@ -0,0 +1,239 @@ + + + + + CUDA Trace Analysis Report + + + +
(Report content identical to cuda_trace_analysis_report_20250423_232202.html above — both files share blob 3955446; only the timestamp in the filename differs.)
+ + + \ No newline at end of file diff --git a/llm-sample-results/final_report.md b/llm-sample-results/final_report.md new file mode 100644 index 0000000..5960e17 --- /dev/null +++ b/llm-sample-results/final_report.md @@ -0,0 +1,329 @@ +# CUDA Trace Analysis Report + +## Summary + +- Total trace entries: 660 +- Unique API functions: 6 +- Unique kernels: 1 +- Trace duration: 28.0 seconds + +## Analysis Results + +# CUDA Trace Analysis - Combined Report + +## Table of Contents + +- [Overview](#overview) +- [Api Distribution](#api-distribution) +- [Memory Operations](#memory-operations) +- [Kernel Launches](#kernel-launches) +- [Performance Bottlenecks](#performance-bottlenecks) +- [Optimization Recommendations](#optimization-recommendations) + +## Overview + + +### 1. Main Characteristics of the Application + +The application utilizes a limited set of CUDA API functions, with six unique functions tracked throughout the trace period. The predominant activity involves kernel execution and memory operations, indicating a focus on computing tasks. The consistent use of a single kernel, "vector_add," suggests the application is specialized in a particular type of computation, likely vector addition or similar operations. + +### 2. Significant Patterns Observed + +- **API Distribution**: The API call distribution shows that `cudaLaunchKernel` is the most frequently used API, accounting for 45.5% of total calls. This highlights intensive kernel activity. +- **Memory Operations**: Frequent `cudaMemcpy` calls suggest significant host-device memory transfers. +- **Temporal Analysis**: There is a consistent distribution of CUDA API calls over time, with no significant spikes, implying stable performance without notable bottlenecks. + +### 3. Key Performance Considerations + +- **Synchronization**: The application has a high synchronization frequency (60 operations over the trace period), a potential area for optimization by reducing unnecessary synchronization points. +- **Memory-Launch Ratio**: The memory copy to kernel launch ratio is 0.60, which indicates a healthy balance between data transfers and computation. +- **Launch Configuration**: The kernel uses a common grid/block configuration, which may be efficient but could be further optimized based on specific hardware or workload to enhance performance. + +### 4. Assessment of CUDA Implementation Quality + +Overall, the CUDA implementation is effective but could benefit from optimizations. The use of consistent API calls and balanced memory operations are strengths. However, there is room for improvement in synchronization management and possibly the grid/block configuration to better utilize device capabilities and minimize overheads. + +In summary, the application demonstrates a focused use of CUDA capabilities with potential for improved efficiency through targeted optimizations. + +## Api Distribution + +Certainly! Here's a detailed analysis of the CUDA API distribution based on the provided data: + +### 1. Most Frequently Used CUDA API Functions + +- **cudaLaunchKernel (45.45%)**: This is the most frequently used API call, indicating that the application is heavily focused on executing GPU kernels. This suggests that the application is computation-intensive and utilizes GPU acceleration effectively to execute parallel operations. + +- **cudaMemcpy (18.18%)**: This indicates significant data transfer between the host and device. High usage may suggest repeated data movement, which could become a performance bottleneck if not optimized. 
+ +### 2. Balance Between Different Types of Operations + +- **Compute (cudaLaunchKernel)**: Dominates the API distribution, showing the application’s reliance on GPU computation. + +- **Memory Operations (cudaMemcpy, cudaMemcpyAsync, cudaMalloc, cudaFree)**: + - **cudaMemcpy and cudaMemcpyAsync (27.27% combined)**: Memory transfers are substantial but not overwhelming, indicating a reasonable balance in data management. + - **cudaMalloc and cudaFree (18.18% combined)**: Frequent memory allocation and deallocation could indicate potential inefficiencies if allocations are too dynamic. + +- **Synchronization (cudaStreamSynchronize - 9.09%)**: This suggests some level of synchronization is needed, but it isn't excessive, which generally is a good sign as excessive synchronization can hinder performance. + +### 3. Unusual or Inefficient API Usage Patterns + +- **Frequent cudaMalloc and cudaFree**: If these calls are repeated many times in a loop, it may indicate inefficiency in memory management. Allocating and deallocating memory in tight loops can significantly reduce performance. + +- **High Usage of cudaMemcpy**: Could be a potential area for optimization, such as ensuring maximum data transfer size per call or overlapping data transfers with computation. + +### 4. Recommendations for Optimizing API Usage + +- **Optimize Memory Transfers**: + - Use asynchronous memory copies (`cudaMemcpyAsync`) more extensively to overlap data transfer and kernel execution. + - Batch data transfers or increase data granularity to reduce the number of transfer operations. + +- **Improve Memory Management**: + - Reduce frequent calls to `cudaMalloc` and `cudaFree` by reusing allocated memory wherever possible. + - Consider using memory pools or pre-allocating buffer spaces. + +- **Kernel Optimization**: + - Ensure that there is no significant idle time between kernel executions. + - Profile kernels to find any computation bottlenecks. + +- **Reduce Synchronization Overhead**: + - Minimize the use of `cudaStreamSynchronize` by managing dependencies and using streams effectively to overlap operations. + +By addressing these areas, the application can improve its overall execution efficiency on the GPU. + +## Memory Operations + + +### 1. Assessment of Memory Transfer Patterns and Their Efficiency + +The data suggests that `cudaMemcpy` operations account for 40% of memory operations, while `cudaMemcpyAsync` comprises 20%. This indicates a heavy reliance on synchronous memory transfers, which can be less efficient as they may block the host thread until the copy is complete. + +#### Efficiency Analysis: +- **Synchronous Transfers (`cudaMemcpy`)**: Generally slower due to blocking behavior. +- **Asynchronous Transfers (`cudaMemcpyAsync`)**: More efficient when managed correctly as they do not block the host, allowing for overlap of computation and data transfer. + +### 2. Analysis of the Balance Between Different Types of Memory Operations + +All four types of operations (`cudaMemcpy`, `cudaMemcpyAsync`, `cudaMalloc`, and `cudaFree`) are represented, but there is a notable imbalance with a high proportion of `cudaMemcpy`. Allocation and deallocation (`cudaMalloc` and `cudaFree`) operations are equally distributed at 20% each. + +The data skew towards `cudaMemcpy` might suggest missed opportunities for optimization using asynchronous transfers. + +### 3. 
Identification of Potential Memory-Related Bottlenecks + +- **Potential Bottleneck**: The high percentage of synchronous memory transfers suggests potential underutilization of the GPU’s ability to handle concurrent operations. +- **Allocation and Deallocation**: Frequent and possibly unnecessary calls to `cudaMalloc` and `cudaFree` can also cause performance hits. These should be minimized and reused when possible. + +### 4. Recommendations for Optimizing Memory Usage and Transfers + +1. **Increase Asynchronous Transfers**: Consider increasing the use of `cudaMemcpyAsync` to enable overlapping of memory transfer and computation. Utilize streams effectively to manage these operations without blocking the CPU. + +2. **Optimize Memory Allocation**: + - Reuse memory allocations wherever possible instead of frequent malloc and free calls. + - Consider using memory pools to manage small allocations which can reduce overhead. + +3. **Streamlining the Memory Transfer**: + - Batch smaller data transfers into fewer, larger transfers to reduce the number of `cudaMemcpy` calls. + - Ensure data alignment and coalesced access patterns to optimize bandwidth usage during transfers. + +4. **Profile and Monitor**: + - Regularly profile the application to identify specific points of inefficiency. + - Use CUDA profilers to monitor memory usage, transfer times, and kernel execution overlaps. + +By implementing these recommendations, you can potentially improve throughput and reduce latency in your CUDA applications. + +## Kernel Launches + + +### 1. Assessment of Kernel Launch Patterns and Their Implications for Performance + +The kernel launch data shows that there is only one type of kernel, `vector_add`, being launched 300 times, making it a highly repetitive workload. This indicates that the application is computationally uniform, focusing intensely on vector addition. This uniformity might benefit from optimization to improve throughput and resource utilization. + +The repetitive nature can lead to bottlenecks if this kernel doesn't fully utilize the GPU's capabilities. + +### 2. Analysis of Grid and Block Dimensions + +**Grid Dimensions:** +- `grid_x` is consistently set at 4096, while `grid_y` and `grid_z` have a constant value of 1. This configuration implies that the computation is primarily one-dimensional, with a vast number of elements needing processing. + +**Block Dimensions:** +- `block_x` is always 256, indicating that each block processes 256 threads. The choice of 256 is often optimal as it's a multiple of the warp size (32 on most NVIDIA GPUs), allowing for more efficient execution. +- `block_y` and `block_z` are set to 1, reinforcing that the computation is handled in a one-dimensional array. + +### 3. Evaluation of Kernel Occupancy and Efficiency + +Kernel occupancy refers to how well the GPU's resources (especially warps) are utilized: +- With blocks of size 256 and grids of 4096, the resource utilization could be high if the GPU can handle this many threads per multiprocessor. However, without specific GPU details (e.g., SM count or available registers), precise occupancy cannot be calculated. +- High occupancy is desirable but must be balanced against register usage and shared memory. + +### 4. Recommendations for Optimizing Kernel Launch Configurations + +- **Diversify Workload:** If possible, consider diversifying computational tasks to balance load and better utilize GPU resources. 
+ +- **Experiment with Block Size:** Although 256 is often optimal, experimenting with different block sizes (e.g., 128, 512) might yield performance improvements on various architectures. + +- **Evaluate GPU Occupancy:** Use tools like NVIDIA Nsight Compute to analyze actual occupancy and resource usage, which can guide whether grid/block dimensions are optimal. + +- **Memory Coalescing:** Ensure that memory accesses are coalesced for `vector_add`, which can significantly impact performance. + +- **Consider Multi-Stream Execution:** If execution time is a concern, utilizing multiple CUDA streams could help in overlapping computation and data transfer. + +By understanding and tuning these parameters, performance improvements can be realized, especially when considering architectural specifics of the used GPU hardware. + +## Performance Bottlenecks + +To effectively address the performance bottlenecks identified in your CUDA trace data, the analysis should cover explanations, causes, and potential solutions. Below is a detailed examination based on the provided table and context: + +### 1. Detailed Explanation of Each Identified Bottleneck and Its Impact on Performance + +#### Memory Transfer Overhead + +- **Explanation**: The memory transfer overhead indicates significant time spent moving data between the host and device memory. At 27.27% of operations, this overhead can considerably affect overall performance by lengthening execution times. + +- **Impact**: High overhead in data transfer can limit the speedup gained from parallel processing on the GPU. This reduces the potential performance benefits of using CUDA, as time spent moving data can negate the advantages of fast device computation. + +#### Excessive Synchronization + +- **Explanation**: With 60 synchronization operations constituting 9.09% of API operations, excessive synchronization may result in idle GPU cycles due to threads waiting for others to reach certain execution points. + +- **Impact**: Over-synchronization can lead to serialization of parallel tasks, underutilization of GPU resources, and increased execution times, diminishing the potency of concurrent execution capabilities of CUDA. + +### 2. Root Cause Analysis for Each Bottleneck + +#### Memory Transfer Overhead + +- **Root Causes**: + - Use of pageable (unlocked) host memory, which is slower than pinned memory for transfer operations. + - Frequent small transfers instead of fewer batched transactions. + - Inefficient data management strategies causing frequent data transfers between the host and the GPU. + +#### Excessive Synchronization + +- **Root Causes**: + - Over-reliance on synchronization functions like `cudaDeviceSynchronize()`, resulting in unnecessary wait times. + - Lack of parallelism due to improper usage of CUDA streams, leading to sequential execution of tasks that could otherwise be processed concurrently. + - Algorithm design that inherently requires high synchronization, limiting performance improvements from using a GPU. + +### 3. Prioritized Recommendations for Addressing Each Bottleneck + +#### Memory Transfer Overhead + +1. **Use Pinned Memory**: + - Convert pageable host memory to pinned memory to increase data transfer rates between host and GPU. + +2. **Batch Data Transfers**: + - Minimize overhead by combining smaller data transfers into larger batches, reducing the number of transfer operations. + +3. 
**Retain Data on GPU**: + - Whenever possible, perform more operations directly on the GPU to minimize round trips of data between host and device. + +#### Excessive Synchronization + +1. **Optimize Use of CUDA Streams**: + - Employ multiple CUDA streams to facilitate asynchronous execution of operations, thus reducing dependency on synchronization barriers. + +2. **Reduce Synchronization Points**: + - Analyze and minimize the use of unnecessary synchronization calls to preserve task parallelism and enhance performance. + +3. **Algorithm Redesign**: + - Consider revisiting algorithms to better exploit GPU parallelism and minimize inherent dependencies which necessitate synchronization. + +### 4. Potential Performance Gains from Implementing the Recommendations + +- **Expected Gains from Reducing Memory Transfer Overhead**: + - By implementing pinned memory and batching, data transfer times could be reduced by up to 50%, significantly increasing overall program throughput and efficiency. + +- **Expected Gains from Addressing Excessive Synchronization**: + - Optimizing synchronization could potentially lead to a reduction in GPU idle times by about 30-50%, yielding substantial performance improvements by better utilizing available computational resources. + +Implementing these recommendations can lead to more efficient GPU utilization, reducing execution time, and achieving greater performance acceleration from CUDA computing. + +## Optimization Recommendations + +To provide detailed optimization recommendations based on CUDA trace analysis, I will make some assumptions about potential findings from a typical CUDA trace analysis. These assumptions include issues like kernel execution inefficiencies, memory bottlenecks, and underutilization of GPU resources. With these in mind, here are detailed recommendations across code-level optimizations, architectural changes, and leveraging CUDA features, prioritized by expected impact: + +### 1. Code-Level Optimizations + +#### a. Kernel Execution + +**Issue:** Kernel execution time is high due to inefficient code. +- **Recommendation:** Optimize kernel code by minimizing divergence. For instance, avoid branches within warps where possible. Use predicated execution or warp-synchronous programming techniques. +- **Example:** Use shared memory effectively by loading data into shared memory once and reusing it across multiple threads within a block. This reduces global memory access latency. + +#### b. Memory Access Patterns + +**Issue:** Non-coalesced memory accesses leading to increased latency. +- **Recommendation:** Ensure memory accesses are coalesced by aligning data accesses such that threads within a warp access sequential memory locations. +- **Example:** If dealing with structures, consider using Structure of Arrays (SoA) instead of Array of Structures (AoS) to ensure coalesced and efficient memory access. + +#### c. Instruction Throughput + +**Issue:** Low instruction throughput. +- **Recommendation:** Utilize intrinsic functions specific to CUDA like `__sinf`, `__expf` for trigonometric or exponential functions to increase math operation throughput. +- **Example:** Replace standard math functions in your kernel with their CUDA intrinsic counterparts where precision is acceptable. + +### 2. Architectural Changes + +#### a. Grid and Block Configuration + +**Issue:** Suboptimal grid and block configuration leading to low occupancy. +- **Recommendation:** Adjust the block size to maximize occupancy. 
+ +#### b. Memory Hierarchy Utilization + +**Issue:** Underutilization of shared memory and cache. +- **Recommendation:** Use shared memory to cache repetitive global memory reads. Take advantage of L1 and L2 caches by optimizing data reuse patterns. +- **Example:** For computational kernels with repeated data access patterns, optimize the data layout to enhance cache locality. + +### 3. Alternative Approaches or CUDA Features + +#### a. Asynchronous Execution + +**Issue:** Sequential execution of memory transfers and kernel executions. +- **Recommendation:** Leverage CUDA streams to overlap computation with memory transfers. Use `cudaMemcpyAsync` to perform asynchronous data transfers between host and device. +- **Example:** Instead of waiting for a memory transfer to complete before launching a kernel, use different streams to overlap these operations. + +#### b. Unified Memory + +**Issue:** Complex data management between host and device. +- **Recommendation:** Consider using Unified Memory to simplify data management, especially if the application involves complex memory allocation and deallocation patterns. +- **Example:** Using `cudaMallocManaged` allows the system to automatically manage memory residency, although this may not provide the best performance in every case. + +### 4. Prioritization of Recommendations + +1. **Memory Access Patterns:** Ensuring coalesced access usually provides immediate and significant benefits. +2. **Grid and Block Configuration:** Properly configuring these can significantly impact occupancy and thus performance. +3. **Kernel Execution:** Reducing divergence and using efficient math operations can yield noticeable improvements. +4. **Asynchronous Execution:** Overlapping data transfer and execution increases pipeline efficiency (see the sketch below). +5. **Unified Memory:** Provides ease of use, though performance depends on the hardware and access patterns. + +These recommendations assume the presence of specific issues that are common in CUDA trace analysis. Adjustments may be necessary based on the unique results of your trace data. If you have specific details about your trace findings like kernel names, memory transfer times, occupancy rates, etc., feel free to share them for more tailored advice.
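Tying the asynchronous-execution and synchronization items together, here is a minimal sketch, assuming four streams and an arbitrary chunk size (neither value comes from the trace), that overlaps `cudaMemcpyAsync` transfers with kernel launches and synchronizes per stream instead of issuing a blanket `cudaDeviceSynchronize()`:

```cuda
#include <cuda_runtime.h>

__global__ void vector_add(float *a, float *b, float *c, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) c[i] = a[i] + b[i];
}

int main() {
    constexpr int kStreams = 4;        // assumed stream count
    constexpr int kChunk   = 1 << 20;  // assumed elements per stream
    const size_t bytes = kChunk * sizeof(float);

    float *h_a, *h_b, *h_c, *d_a, *d_b, *d_c;
    // Pinned host memory is required for copies to be truly asynchronous.
    cudaMallocHost(&h_a, kStreams * bytes);  // (input initialization omitted)
    cudaMallocHost(&h_b, kStreams * bytes);
    cudaMallocHost(&h_c, kStreams * bytes);
    cudaMalloc(&d_a, kStreams * bytes);
    cudaMalloc(&d_b, kStreams * bytes);
    cudaMalloc(&d_c, kStreams * bytes);

    cudaStream_t streams[kStreams];
    for (int s = 0; s < kStreams; ++s) cudaStreamCreate(&streams[s]);

    for (int s = 0; s < kStreams; ++s) {
        size_t off = static_cast<size_t>(s) * kChunk;
        // Copies and the kernel in stream s overlap with work in the other streams.
        cudaMemcpyAsync(d_a + off, h_a + off, bytes, cudaMemcpyHostToDevice, streams[s]);
        cudaMemcpyAsync(d_b + off, h_b + off, bytes, cudaMemcpyHostToDevice, streams[s]);
        vector_add<<<(kChunk + 255) / 256, 256, 0, streams[s]>>>(d_a + off, d_b + off, d_c + off, kChunk);
        cudaMemcpyAsync(h_c + off, d_c + off, bytes, cudaMemcpyDeviceToHost, streams[s]);
    }

    // Synchronize each stream only when its results are needed,
    // instead of stalling the whole device.
    for (int s = 0; s < kStreams; ++s) {
        cudaStreamSynchronize(streams[s]);
        cudaStreamDestroy(streams[s]);
    }

    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    cudaFreeHost(h_a); cudaFreeHost(h_b); cudaFreeHost(h_c);
    return 0;
}
```

Whether this helps in practice depends on how much copy time can actually hide behind kernel time; for very small kernels, launch overhead may dominate.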
+ + + +--- + +Generated on: 2025-04-23 23:49:28 + + +## Visualizations + +The following visualizations are available in the analysis directory: + +- API Distribution: /efs/NFLX-GENAI-PROJECTS/GPUSNOOP/LLM/MANUS/FINAL/cuda_trace_analysis_final_fix/cuda_analysis/analysis/api_distribution.png +- Kernel Name Distribution: /efs/NFLX-GENAI-PROJECTS/GPUSNOOP/LLM/MANUS/FINAL/cuda_trace_analysis_final_fix/cuda_analysis/analysis/kernel_name_distribution.png +- Memory Operations: /efs/NFLX-GENAI-PROJECTS/GPUSNOOP/LLM/MANUS/FINAL/cuda_trace_analysis_final_fix/cuda_analysis/analysis/memory_operations.png +- API Call Timeline: /efs/NFLX-GENAI-PROJECTS/GPUSNOOP/LLM/MANUS/FINAL/cuda_trace_analysis_final_fix/cuda_analysis/analysis/api_call_timeline.png + +Enhanced visualizations are available in the enhanced directory: + +- CUDA Trace Dashboard: /efs/NFLX-GENAI-PROJECTS/GPUSNOOP/LLM/MANUS/FINAL/cuda_trace_analysis_final_fix/cuda_analysis/enhanced/cuda_trace_dashboard.png + +## HTML Report + +An interactive HTML report is available at: + +/efs/NFLX-GENAI-PROJECTS/GPUSNOOP/LLM/MANUS/FINAL/cuda_trace_analysis_final_fix/cuda_analysis/analysis/html_report/cuda_trace_analysis_report_20250423_234928.html + +## Generated on + +2025-04-23 23:49:28 diff --git a/llm-sample-results/parsed_trace.json b/llm-sample-results/parsed_trace.json new file mode 100644 index 0000000..a84fb96 --- /dev/null +++ b/llm-sample-results/parsed_trace.json @@ -0,0 +1,16082 @@ +[ + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055296720", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + 
"vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 996147200, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1045777616", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055296720", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 
07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1055144304", + "block_y": 24243, + "block_z": "-1055296720", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055293536", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": 
"test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 325058560, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1035793696", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055293536", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": 
NaN, + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1055340144", + "block_y": 24243, + "block_z": "-1055293536", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055278112", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": 
"vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": "-346030080", + "grid_y": 28908, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1025097136", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055278112", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1054957296", + "block_y": 24243, + "block_z": "-1055278112", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + 
"cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055296960", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + 
"block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": "-1017118720", + "grid_y": 28908, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1014366304", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055296960", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:40 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1054917056", + "block_y": 24243, + "block_z": "-1055296960", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:40", + "relative_time": 2.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + 
"cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055296720", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + 
"kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 996147200, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1045777616", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055296720", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1055144304", + "block_y": 24243, + "block_z": "-1055296720", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 
0, + "block_z": "-1055293536", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, 
float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 325058560, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1035793696", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055293536", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1055340144", + "block_y": 24243, + "block_z": "-1055293536", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055278112", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 
1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + 
"process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": "-346030080", + "grid_y": 28908, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1025097136", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055278112", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1054957296", + "block_y": 24243, + "block_z": "-1055278112", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055296960", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int 
arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": "-1017118720", + "grid_y": 28908, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1014366304", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:42", + 
"relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055296960", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:42 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1054917056", + "block_y": 24243, + "block_z": "-1055296960", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:42", + "relative_time": 4.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055296720", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + 
"__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 996147200, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1045777616", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055296720", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": 
"cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1055144304", + "block_y": 24243, + "block_z": "-1055296720", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055293536", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:44", + 
"relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 325058560, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1035793696", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055293536", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + 
"__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1055340144", + "block_y": 24243, + "block_z": "-1055293536", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055278112", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, 
int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": "-346030080", + "grid_y": 28908, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1025097136", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055278112", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1054957296", + "block_y": 24243, + 
"block_z": "-1055278112", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055296960", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + 
"event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": "-1017118720", + "grid_y": 28908, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1014366304", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055296960", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:44 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1054917056", + "block_y": 24243, + "block_z": "-1055296960", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:44", + "relative_time": 6.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, 
+ "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055296720", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 
float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 996147200, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1045777616", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055296720", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1055144304", + "block_y": 24243, + "block_z": "-1055296720", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": 
"CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055293536", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, 
+ "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 325058560, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1035793696", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055293536", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1055340144", + "block_y": 24243, + "block_z": "-1055293536", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055278112", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + 
"pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + 
"datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": "-346030080", + "grid_y": 28908, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1025097136", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055278112", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1054957296", + "block_y": 24243, + "block_z": "-1055278112", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055296960", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": 
"vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": "-1017118720", + "grid_y": 28908, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1014366304", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + 
"cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055296960", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:46 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1054917056", + "block_y": 24243, + "block_z": "-1055296960", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:46", + "relative_time": 8.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055296720", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + 
"block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 996147200, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1045777616", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055296720", + "stack_trace": [ + 
"cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1055144304", + "block_y": 24243, + "block_z": "-1055296720", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055293536", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int 
arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 325058560, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1035793696", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055293536", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, 
+ "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1055340144", + "block_y": 24243, + "block_z": "-1055293536", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055278112", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + 
"__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": "-346030080", + "grid_y": 28908, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1025097136", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055278112", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": 
"CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1054957296", + "block_y": 24243, + "block_z": "-1055278112", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055296960", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:48", + 
"relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": "-1017118720", + "grid_y": 28908, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1014366304", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055296960", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:48 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1054917056", + "block_y": 24243, + "block_z": "-1055296960", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:48", + "relative_time": 10.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": 
"test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055296720", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, 
int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 996147200, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1045777616", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055296720", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1055144304", + "block_y": 24243, + "block_z": "-1055296720", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { 
+ "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055293536", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, 
+ "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 325058560, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1035793696", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055293536", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1055340144", + "block_y": 24243, + "block_z": "-1055293536", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055278112", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + 
"datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add 
arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": "-346030080", + "grid_y": 28908, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1025097136", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055278112", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1054957296", + "block_y": 24243, + "block_z": "-1055278112", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055296960", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", 
+ "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": "-1017118720", + "grid_y": 28908, + "grid_z": 
4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1014366304", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055296960", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:50 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1054917056", + "block_y": 24243, + "block_z": "-1055296960", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:50", + "relative_time": 12.0 + }, + { + "timestamp": "Sat Apr 19 07:24:52 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:52", + "relative_time": 14.0 + }, + { + "timestamp": "Sat Apr 19 07:24:52 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055296720", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:52", + "relative_time": 14.0 + }, + { + "timestamp": "Sat Apr 19 07:24:52 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:52", + "relative_time": 14.0 + }, + { + "timestamp": "Sat Apr 19 07:24:52 2025", + 
"process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:52", + "relative_time": 14.0 + }, + { + "timestamp": "Sat Apr 19 07:24:52 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:52", + "relative_time": 14.0 + }, + { + "timestamp": "Sat Apr 19 07:24:52 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:52", + "relative_time": 14.0 + }, + { + "timestamp": "Sat Apr 19 07:24:52 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:52", + "relative_time": 14.0 + }, + { + "timestamp": "Sat Apr 19 07:24:52 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 996147200, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1045777616", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:52", + "relative_time": 14.0 + }, + { + "timestamp": "Sat Apr 19 07:24:52 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": 
"CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055296720", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:52", + "relative_time": 14.0 + }, + { + "timestamp": "Sat Apr 19 07:24:52 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:52", + "relative_time": 14.0 + }, + { + "timestamp": "Sat Apr 19 07:24:52 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1055144304", + "block_y": 24243, + "block_z": "-1055296720", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:52", + "relative_time": 14.0 + }, + { + "timestamp": "Sat Apr 19 07:24:52 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:52", + "relative_time": 14.0 + }, + { + "timestamp": "Sat Apr 19 07:24:52 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055293536", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:52", + "relative_time": 14.0 + }, + { + "timestamp": "Sat Apr 19 07:24:52 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:52", + "relative_time": 14.0 + }, + { + "timestamp": "Sat Apr 19 07:24:52 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": 
"cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:52", + "relative_time": 14.0 + }, + { + "timestamp": "Sat Apr 19 07:24:52 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:52", + "relative_time": 14.0 + }, + { + "timestamp": "Sat Apr 19 07:24:52 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:52", + "relative_time": 14.0 + }, + { + "timestamp": "Sat Apr 19 07:24:52 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:52", + "relative_time": 14.0 + }, + { + "timestamp": "Sat Apr 19 07:24:52 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 325058560, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1035793696", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:52", + "relative_time": 14.0 + }, + { + "timestamp": "Sat Apr 19 07:24:52 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055293536", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:52", + "relative_time": 14.0 + }, + { + "timestamp": "Sat Apr 19 07:24:52 2025", + "process_name": 
"test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:52", + "relative_time": 14.0 + }, + { + "timestamp": "Sat Apr 19 07:24:52 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1055340144", + "block_y": 24243, + "block_z": "-1055293536", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:52", + "relative_time": 14.0 + }, + { + "timestamp": "Sat Apr 19 07:24:52 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:52", + "relative_time": 14.0 + }, + { + "timestamp": "Sat Apr 19 07:24:52 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055278112", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:52", + "relative_time": 14.0 + }, + { + "timestamp": "Sat Apr 19 07:24:52 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:52", + "relative_time": 14.0 + }, + { + "timestamp": "Sat Apr 19 07:24:52 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:52", + "relative_time": 14.0 + }, + { + "timestamp": "Sat Apr 19 07:24:52 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 
4096,
+      "grid_y": 1,
+      "grid_z": 1,
+      "block_x": 256,
+      "block_y": 1,
+      "block_z": 1,
+      "stack_trace": [
+        "cudaLaunchKernel",
+        "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)",
+        "vector_add(float*, float*, float*, int)",
+        "main",
+        "__libc_start_main",
+        "_start"
+      ],
+      "cuda_api_function": "cudaLaunchKernel",
+      "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0",
+      "kernel_name": "vector_add",
+      "datetime": "2025-04-19 07:24:52",
+      "relative_time": 14.0
+    },
+    {
+      "timestamp": "Sat Apr 19 07:24:52 2025",
+      "process_name": "test_cuda_api_m",
+      "pid": 3359096,
+      "event_type": 0,
+      "cuda_call": "CUDA_LAUNCH_KERNEL",
+      "grid_x": "-346030080",
+      "grid_y": 28908,
+      "grid_z": 4194304,
+      "block_x": 2,
+      "block_y": 0,
+      "block_z": "-1025097136",
+      "stack_trace": [
+        "cudaMemcpyAsync",
+        "__libc_start_main",
+        "_start"
+      ],
+      "cuda_api_function": "cudaMemcpyAsync",
+      "args": NaN,
+      "kernel_name": NaN,
+      "datetime": "2025-04-19 07:24:52",
+      "relative_time": 14.0
+    },
+    {
+      "timestamp": "Sat Apr 19 07:24:52 2025",
+      "process_name": "test_cuda_api_m",
+      "pid": 3359096,
+      "event_type": 0,
+      "cuda_call": "CUDA_LAUNCH_KERNEL",
+      "grid_x": 0,
+      "grid_y": 0,
+      "grid_z": 1,
+      "block_x": 0,
+      "block_y": 0,
+      "block_z": "-1055278112",
+      "stack_trace": [
+        "cudaStreamSynchronize",
+        "__libc_start_main",
+        "_start"
+      ],
+      "cuda_api_function": "cudaStreamSynchronize",
+      "args": NaN,
+      "kernel_name": NaN,
+      "datetime": "2025-04-19 07:24:52",
+      "relative_time": 14.0
+    },
+    {
+      "timestamp": "Sat Apr 19 07:24:52 2025",
+      "process_name": "test_cuda_api_m",
+      "pid": 3359096,
+      "event_type": 0,
+      "cuda_call": "CUDA_LAUNCH_KERNEL",
+      "grid_x": 1024,
+      "grid_y": 0,
+      "grid_z": 0,
+      "block_x": 1,
+      "block_y": 0,
+      "block_z": "-1",
+      "stack_trace": [
+        "cudaMalloc",
+        "__libc_start_main",
+        "_start"
+      ],
+      "cuda_api_function": "cudaMalloc",
+      "args": NaN,
+      "kernel_name": NaN,
+      "datetime": "2025-04-19 07:24:52",
+      "relative_time": 14.0
+    },
+    {
+      "timestamp": "Sat Apr 19 07:24:52 2025",
+      "process_name": "test_cuda_api_m",
+      "pid": 3359096,
+      "event_type": 0,
+      "cuda_call": "CUDA_LAUNCH_KERNEL",
+      "grid_x": 201327106,
+      "grid_y": 8388610,
+      "grid_z": 28,
+      "block_x": "-1054957296",
+      "block_y": 24243,
+      "block_z": "-1055278112",
+      "stack_trace": [
+        "cudaFree",
+        "__libc_start_main",
+        "_start"
+      ],
+      "cuda_api_function": "cudaFree",
+      "args": NaN,
+      "kernel_name": NaN,
+      "datetime": "2025-04-19 07:24:52",
+      "relative_time": 14.0
+    },
+    {
+      "timestamp": "Sat Apr 19 07:24:58 2025",
+      "process_name": "test_cuda_api_m",
+      "pid": 3359096,
+      "event_type": 0,
+      "cuda_call": "CUDA_LAUNCH_KERNEL",
+      "grid_x": 1656750096,
+      "grid_y": 28909,
+      "grid_z": 4194304,
+      "block_x": 1,
+      "block_y": 0,
+      "block_z": "-1055161152",
+      "stack_trace": [
+        "cudaMemcpy",
+        "__libc_start_main",
+        "_start"
+      ],
+      "cuda_api_function": "cudaMemcpy",
+      "args": NaN,
+      "kernel_name": NaN,
+      "datetime": "2025-04-19 07:24:58",
+      "relative_time": 20.0
+    },
+    {
+      "timestamp": "Sat Apr 19 07:24:58 2025",
+      "process_name": "test_cuda_api_m",
+      "pid": 3359096,
+      "event_type": 0,
+      "cuda_call": "CUDA_LAUNCH_KERNEL",
+      "grid_x": 4096,
+      "grid_y": 1,
+      "grid_z": 1,
+      "block_x": 256,
+      "block_y": 1,
+      "block_z": 1,
+      "stack_trace": [
+        "cudaLaunchKernel",
+        "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)",
+        "vector_add(float*, float*, float*, int)",
+        "main",
+        "__libc_start_main",
+        "_start"
+      ],
+      "cuda_api_function": "cudaLaunchKernel",
+      "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0",
+      "kernel_name": "vector_add",
+      "datetime": "2025-04-19 07:24:58",
+      "relative_time": 20.0
+    },
+    {
+      "timestamp": "Sat Apr 19 07:24:58 2025",
+      "process_name": "test_cuda_api_m",
+      "pid": 3359096,
+      "event_type": 0,
+      "cuda_call":
"CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 996147200, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1045777616", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055296720", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1055144304", + "block_y": 24243, + "block_z": "-1055296720", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + 
"block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055293536", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float 
arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 325058560, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1035793696", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055293536", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1055340144", + "block_y": 24243, + "block_z": "-1055293536", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": 
"CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055278112", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + 
"block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": "-346030080", + "grid_y": 28908, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1025097136", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055278112", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1054957296", + "block_y": 24243, + "block_z": "-1055278112", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055296960", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": 
"test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": 
"vector_add", + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": "-1017118720", + "grid_y": 28908, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1014366304", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055296960", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:24:58 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1054917056", + "block_y": 24243, + "block_z": "-1055296960", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:24:58", + "relative_time": 20.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055296720", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": 
"cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 996147200, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1045777616", + "stack_trace": [ + "cudaMemcpyAsync", + 
"__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055296720", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1055144304", + "block_y": 24243, + "block_z": "-1055296720", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055293536", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + 
"grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 325058560, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1035793696", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": 
"-1055293536", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1055340144", + "block_y": 24243, + "block_z": "-1055293536", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055278112", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float 
arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": "-346030080", + "grid_y": 28908, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1025097136", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055278112", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + 
"grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1054957296", + "block_y": 24243, + "block_z": "-1055278112", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055296960", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + 
"stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": "-1017118720", + "grid_y": 28908, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1014366304", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055296960", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:00 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + 
"event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1054917056", + "block_y": 24243, + "block_z": "-1055296960", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:00", + "relative_time": 22.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055296720", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + 
"datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 996147200, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1045777616", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055296720", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1055144304", + "block_y": 24243, + "block_z": "-1055296720", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 
07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055293536", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + 
"vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 325058560, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1035793696", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055293536", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1055340144", + "block_y": 24243, + "block_z": "-1055293536", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 
07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055278112", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + 
"process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": "-346030080", + "grid_y": 28908, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1025097136", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055278112", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1054957296", + "block_y": 24243, + "block_z": "-1055278112", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055296960", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": 
"cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + 
"cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": "-1017118720", + "grid_y": 28908, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1014366304", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055296960", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:02 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1054917056", + "block_y": 24243, + "block_z": "-1055296960", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:02", + "relative_time": 24.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055296720", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + 
"__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": 
"CUDA_LAUNCH_KERNEL", + "grid_x": 996147200, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1045777616", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055296720", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1055144304", + "block_y": 24243, + "block_z": "-1055296720", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055293536", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:04", + 
"relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 325058560, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1035793696", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + 
"process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055293536", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1055340144", + "block_y": 24243, + "block_z": "-1055293536", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055278112", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, 
float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": "-346030080", + "grid_y": 28908, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1025097136", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055278112", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:04", + 
"relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1054957296", + "block_y": 24243, + "block_z": "-1055278112", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055296960", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": 
"test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": "-1017118720", + "grid_y": 28908, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1014366304", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055296960", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + 
"kernel_name": NaN, + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:04 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1054917056", + "block_y": 24243, + "block_z": "-1055296960", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:04", + "relative_time": 26.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055296720", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": 
"cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 996147200, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1045777616", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055296720", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1055144304", + "block_y": 24243, + "block_z": "-1055296720", + "stack_trace": [ + "cudaFree", + 
"__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055293536", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + 
"grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 325058560, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1035793696", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055293536", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1055340144", + "block_y": 24243, + "block_z": "-1055293536", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + 
"block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055278112", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 
float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": "-346030080", + "grid_y": 28908, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1025097136", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055278112", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1054957296", + "block_y": 24243, + "block_z": "-1055278112", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 
28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055296960", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + 
"cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": "-1017118720", + "grid_y": 28908, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1014366304", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055296960", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:06 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1054917056", + "block_y": 24243, + "block_z": "-1055296960", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:06", + "relative_time": 28.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055296720", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + 
"event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": 
"2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 996147200, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1045777616", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055296720", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1055144304", + "block_y": 24243, + "block_z": "-1055296720", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055293536", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add 
arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 325058560, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1035793696", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + 
"cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055293536", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1055340144", + "block_y": 24243, + "block_z": "-1055293536", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055278112", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, 
+ "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": "-346030080", + "grid_y": 28908, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1025097136", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055278112", + "stack_trace": [ + 
"cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1054957296", + "block_y": 24243, + "block_z": "-1055278112", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1656750096, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055161152", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1650458640, + "grid_y": 28909, + "grid_z": 4194304, + "block_x": 1, + "block_y": 0, + "block_z": "-1055296960", + "stack_trace": [ + "cudaMemcpy", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpy", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int 
arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 4096, + "grid_y": 1, + "grid_z": 1, + "block_x": 256, + "block_y": 1, + "block_z": 1, + "stack_trace": [ + "cudaLaunchKernel", + "__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)", + "vector_add(float*, float*, float*, int)", + "main", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaLaunchKernel", + "args": "vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0", + "kernel_name": "vector_add", + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": "-1017118720", + "grid_y": 28908, + "grid_z": 4194304, + "block_x": 2, + "block_y": 0, + "block_z": "-1014366304", + "stack_trace": [ + "cudaMemcpyAsync", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMemcpyAsync", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 0, + "grid_y": 0, + "grid_z": 1, + "block_x": 0, + "block_y": 0, + "block_z": "-1055296960", + "stack_trace": [ + "cudaStreamSynchronize", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaStreamSynchronize", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 1024, + "grid_y": 0, + "grid_z": 0, + "block_x": 
1, + "block_y": 0, + "block_z": "-1", + "stack_trace": [ + "cudaMalloc", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaMalloc", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + }, + { + "timestamp": "Sat Apr 19 07:25:08 2025", + "process_name": "test_cuda_api_m", + "pid": 3359096, + "event_type": 0, + "cuda_call": "CUDA_LAUNCH_KERNEL", + "grid_x": 201327106, + "grid_y": 8388610, + "grid_z": 28, + "block_x": "-1054917056", + "block_y": 24243, + "block_z": "-1055296960", + "stack_trace": [ + "cudaFree", + "__libc_start_main", + "_start" + ], + "cuda_api_function": "cudaFree", + "args": NaN, + "kernel_name": NaN, + "datetime": "2025-04-19 07:25:08", + "relative_time": 30.0 + } +] \ No newline at end of file diff --git a/llm-sample-results/sample_analysis_report.html b/llm-sample-results/sample_analysis_report.html new file mode 100644 index 0000000..1c40f8e --- /dev/null +++ b/llm-sample-results/sample_analysis_report.html @@ -0,0 +1,686 @@ + + + + + + + CUDA Trace Analysis Report + + + + +

CUDA Trace Analysis Report

[Chart: CUDA Trace Dashboard]

Overview

Executive Summary of CUDA Trace Data

1. Main Characteristics of the Application

The application utilizes a limited set of CUDA API functions, with six unique functions tracked throughout the trace period. The predominant activity involves kernel execution and memory operations, indicating a focus on computing tasks. The consistent use of a single kernel, "vector_add," suggests the application is specialized in a particular type of computation, likely vector addition or similar operations.


2. Significant Patterns Observed

- API Distribution: The API call distribution shows that cudaLaunchKernel is the most frequently used API, accounting for 45.5% of total calls. This highlights intensive kernel activity.
- Memory Operations: Frequent cudaMemcpy calls suggest significant host-device memory transfers.
- Temporal Analysis: There is a consistent distribution of CUDA API calls over time, with no significant spikes, implying stable performance without notable bottlenecks.

3. Key Performance Considerations

- Synchronization: The application has a high synchronization frequency (60 operations over the trace period), a potential area for optimization by reducing unnecessary synchronization points.
- Memory-Launch Ratio: The memory copy to kernel launch ratio is 0.60 (180 copy operations against 300 kernel launches), which indicates a healthy balance between data transfers and computation.
- Launch Configuration: The kernel uses a common grid/block configuration, which may be efficient but could be further optimized based on the specific hardware or workload to enhance performance.

4. Assessment of CUDA Implementation Quality

Overall, the CUDA implementation is effective but could benefit from optimization. Consistent API usage and balanced memory operations are strengths; however, there is room for improvement in synchronization management and possibly in the grid/block configuration to better utilize device capabilities and minimize overhead.

In summary, the application demonstrates a focused use of CUDA capabilities with potential for improved efficiency through targeted optimizations.

[Chart: CUDA Trace Dashboard]


API Distribution

The following is a detailed analysis of the CUDA API distribution based on the provided data.

1. Most Frequently Used CUDA API Functions

- cudaLaunchKernel (45.45%): This is the most frequently used API call, indicating that the application is heavily focused on executing GPU kernels. This suggests that the application is computation-intensive and utilizes GPU acceleration effectively to execute parallel operations.
- cudaMemcpy (18.18%): This indicates significant data transfer between the host and device. High usage may suggest repeated data movement, which could become a performance bottleneck if not optimized.

2. Balance Between Different Types of Operations

- Compute (cudaLaunchKernel): Dominates the API distribution, showing the application's reliance on GPU computation.
- Memory Operations (cudaMemcpy, cudaMemcpyAsync, cudaMalloc, cudaFree):
  - cudaMemcpy and cudaMemcpyAsync (27.27% combined): Memory transfers are substantial but not overwhelming, indicating a reasonable balance in data management.
  - cudaMalloc and cudaFree (18.18% combined): Frequent memory allocation and deallocation could indicate potential inefficiencies if allocations are too dynamic.
- Synchronization (cudaStreamSynchronize, 9.09%): This suggests some level of synchronization is needed, but it isn't excessive, which generally is a good sign as excessive synchronization can hinder performance.

3. Unusual or Inefficient API Usage Patterns

- Frequent cudaMalloc and cudaFree: If these calls are repeated many times in a loop, it may indicate inefficiency in memory management. Allocating and deallocating memory in tight loops can significantly reduce performance.
- High Usage of cudaMemcpy: Could be a potential area for optimization, such as ensuring maximum data transfer size per call or overlapping data transfers with computation.

4. Recommendations for Optimizing API Usage

- Optimize Memory Transfers:
  - Use asynchronous memory copies (cudaMemcpyAsync) more extensively to overlap data transfer and kernel execution.
  - Batch data transfers or increase data granularity to reduce the number of transfer operations.
- Improve Memory Management:
  - Reduce frequent calls to cudaMalloc and cudaFree by reusing allocated memory wherever possible.
  - Consider using memory pools or pre-allocating buffer space.
- Kernel Optimization:
  - Ensure that there is no significant idle time between kernel executions.
  - Profile kernels to find any computation bottlenecks.
- Reduce Synchronization Overhead:
  - Minimize the use of cudaStreamSynchronize by managing dependencies and using streams effectively to overlap operations.

By addressing these areas, the application can improve its overall execution efficiency on the GPU.

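To make the transfer and synchronization recommendations concrete, the following is a minimal sketch, not code from the traced application, of a vector_add-style workload reworked to use pinned host memory, cudaMemcpyAsync, and two streams so that transfers for one chunk can overlap with the kernel for the other. The kernel body, buffer sizes, and chunking are assumptions chosen for illustration.

```cuda
#include <cstdio>
#include <cuda_runtime.h>

#define CHECK(call) do { cudaError_t e_ = (call); if (e_ != cudaSuccess) { \
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(e_)); return 1; } } while (0)

// Assumed kernel, matching the traced signature vector_add(float*, float*, float*, int).
__global__ void vector_add(float *a, float *b, float *c, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) c[i] = a[i] + b[i];
}

int main() {
    const int N = 1 << 20;      // assumed total element count
    const int CHUNK = N / 2;    // two chunks, one per stream
    float *ha, *hb, *hc, *da, *db, *dc;

    // Pinned host memory allows cudaMemcpyAsync to be truly asynchronous.
    CHECK(cudaMallocHost(&ha, N * sizeof(float)));
    CHECK(cudaMallocHost(&hb, N * sizeof(float)));
    CHECK(cudaMallocHost(&hc, N * sizeof(float)));
    CHECK(cudaMalloc(&da, N * sizeof(float)));
    CHECK(cudaMalloc(&db, N * sizeof(float)));
    CHECK(cudaMalloc(&dc, N * sizeof(float)));
    for (int i = 0; i < N; ++i) { ha[i] = 1.0f; hb[i] = 2.0f; }

    cudaStream_t s[2];
    CHECK(cudaStreamCreate(&s[0]));
    CHECK(cudaStreamCreate(&s[1]));

    // Copy-in, kernel, and copy-out for each chunk are queued on that chunk's stream,
    // so chunk 0's kernel can overlap with chunk 1's transfers.
    for (int k = 0; k < 2; ++k) {
        int off = k * CHUNK;
        CHECK(cudaMemcpyAsync(da + off, ha + off, CHUNK * sizeof(float), cudaMemcpyHostToDevice, s[k]));
        CHECK(cudaMemcpyAsync(db + off, hb + off, CHUNK * sizeof(float), cudaMemcpyHostToDevice, s[k]));
        vector_add<<<(CHUNK + 255) / 256, 256, 0, s[k]>>>(da + off, db + off, dc + off, CHUNK);
        CHECK(cudaMemcpyAsync(hc + off, dc + off, CHUNK * sizeof(float), cudaMemcpyDeviceToHost, s[k]));
    }

    // One synchronization per stream, only when the results are actually needed.
    CHECK(cudaStreamSynchronize(s[0]));
    CHECK(cudaStreamSynchronize(s[1]));
    printf("hc[0] = %f\n", hc[0]);

    cudaStreamDestroy(s[0]); cudaStreamDestroy(s[1]);
    cudaFree(da); cudaFree(db); cudaFree(dc);
    cudaFreeHost(ha); cudaFreeHost(hb); cudaFreeHost(hc);
    return 0;
}
```

Traced with gpuevent_snoop, a pattern like this would be expected to show cudaMemcpyAsync and per-stream cudaStreamSynchronize calls in place of blocking cudaMemcpy calls.
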
| API Function | Count | Percentage |
| --- | --- | --- |
| cudaLaunchKernel | 300 | 45.45 |
| cudaMemcpy | 120 | 18.18 |
| cudaMemcpyAsync | 60 | 9.09 |
| cudaStreamSynchronize | 60 | 9.09 |
| cudaMalloc | 60 | 9.09 |
| cudaFree | 60 | 9.09 |

[Chart: API Distribution]

Memory Operations

Analysis of CUDA Memory Operations

1. Assessment of Memory Transfer Patterns and Their Efficiency

The data suggests that cudaMemcpy operations account for 40% of memory operations, while cudaMemcpyAsync comprises 20%. This indicates a heavy reliance on synchronous memory transfers, which can be less efficient as they may block the host thread until the copy is complete.

Efficiency Analysis:
- Synchronous Transfers (cudaMemcpy): Generally slower due to blocking behavior.
- Asynchronous Transfers (cudaMemcpyAsync): More efficient when managed correctly as they do not block the host, allowing for overlap of computation and data transfer.

2. Analysis of the Balance Between Different Types of Memory Operations

All four types of operations (cudaMemcpy, cudaMemcpyAsync, cudaMalloc, and cudaFree) are represented, but there is a notable imbalance with a high proportion of cudaMemcpy. Allocation and deallocation (cudaMalloc and cudaFree) operations are equally distributed at 20% each.

The data skew towards cudaMemcpy might suggest missed opportunities for optimization using asynchronous transfers.

3. Identification of Potential Memory-Related Bottlenecks

- Potential Bottleneck: The high percentage of synchronous memory transfers suggests potential underutilization of the GPU's ability to handle concurrent operations.
- Allocation and Deallocation: Frequent and possibly unnecessary calls to cudaMalloc and cudaFree can also cause performance hits; allocations should be minimized and reused where possible.

4. Recommendations for Optimizing Memory Usage and Transfers

1. Increase Asynchronous Transfers: Consider increasing the use of cudaMemcpyAsync to enable overlapping of memory transfer and computation. Utilize streams effectively to manage these operations without blocking the CPU.
2. Optimize Memory Allocation:
   - Reuse memory allocations wherever possible instead of frequent malloc and free calls.
   - Consider using memory pools to manage small allocations, which can reduce overhead.
3. Streamline Memory Transfers:
   - Batch smaller data transfers into fewer, larger transfers to reduce the number of cudaMemcpy calls.
   - Ensure data alignment and coalesced access patterns to optimize bandwidth usage during transfers.
4. Profile and Monitor:
   - Regularly profile the application to identify specific points of inefficiency.
   - Use CUDA profilers to monitor memory usage, transfer times, and kernel execution overlaps.

By implementing these recommendations, you can potentially improve throughput and reduce latency in your CUDA applications.

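As a sketch of recommendation 2, and assuming the same vector_add kernel signature seen in the trace, the loop below hoists device allocations out of the per-iteration path so cudaMalloc/cudaFree run once instead of once per iteration. The function and buffer names are assumptions for illustration; on CUDA 11.2+ the stream-ordered allocator (cudaMallocAsync/cudaFreeAsync, backed by a memory pool) is an alternative when allocations cannot be hoisted.

```cuda
#include <cuda_runtime.h>

// Assumed kernel, matching the traced signature vector_add(float*, float*, float*, int).
__global__ void vector_add(float *a, float *b, float *c, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) c[i] = a[i] + b[i];
}

// Allocate device buffers once and reuse them across iterations, instead of the
// per-iteration cudaMalloc/cudaFree pattern visible in the trace.
void run_iterations(const float *ha, const float *hb, float *hc, int n, int iterations) {
    size_t bytes = n * sizeof(float);
    float *da, *db, *dc;
    cudaMalloc(&da, bytes);                    // allocated once, outside the loop
    cudaMalloc(&db, bytes);
    cudaMalloc(&dc, bytes);

    for (int it = 0; it < iterations; ++it) {
        cudaMemcpy(da, ha, bytes, cudaMemcpyHostToDevice);
        cudaMemcpy(db, hb, bytes, cudaMemcpyHostToDevice);
        vector_add<<<(n + 255) / 256, 256>>>(da, db, dc, n);
        cudaMemcpy(hc, dc, bytes, cudaMemcpyDeviceToHost);
    }

    cudaFree(da);                              // freed once, after all iterations
    cudaFree(db);
    cudaFree(dc);
}
```
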
| Memory Operation | Count | Percentage |
| --- | --- | --- |
| cudaMemcpy | 120 | 40.0 |
| cudaMemcpyAsync | 60 | 20.0 |
| cudaMalloc | 60 | 20.0 |
| cudaFree | 60 | 20.0 |

[Chart: Memory Operations]

Kernel Launches

Analysis of CUDA Kernel Launch Patterns

1. Assessment of Kernel Launch Patterns and Their Implications for Performance

The kernel launch data shows that there is only one type of kernel, vector_add, being launched 300 times, making it a highly repetitive workload. This indicates that the application is computationally uniform, focusing intensely on vector addition. This uniformity might benefit from optimization to improve throughput and resource utilization.

The repetitive nature can lead to bottlenecks if this kernel doesn't fully utilize the GPU's capabilities.

2. Analysis of Grid and Block Dimensions

Grid Dimensions:
- grid_x is consistently set at 4096, while grid_y and grid_z have a constant value of 1. This configuration implies that the computation is primarily one-dimensional, with a vast number of elements needing processing.

Block Dimensions:
- block_x is always 256, indicating that each block processes 256 threads. The choice of 256 is often optimal as it's a multiple of the warp size (32 on most NVIDIA GPUs), allowing for more efficient execution.
- block_y and block_z are set to 1, reinforcing that the computation is handled as a one-dimensional array.
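
For reference, the observed configuration matches the usual one-dimensional launch arithmetic: 4096 blocks of 256 threads cover 1,048,576 elements (0x100000, which appears consistent with the low 32 bits of arg3 in the kernel launch records). A minimal sketch, assuming that element count:

```cuda
#include <cuda_runtime.h>

// Assumed definition of the traced kernel (signature taken from the demangled stack frames).
__global__ void vector_add(float *a, float *b, float *c, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) c[i] = a[i] + b[i];
}

// Reproduces the traced launch configuration for an assumed n of 1,048,576 elements.
void launch_vector_add(float *d_a, float *d_b, float *d_c) {
    int n = 1 << 20;                      // 0x100000; assumption based on arg3 in the trace
    int block = 256;                      // block_x in the trace; a multiple of the 32-thread warp size
    int grid = (n + block - 1) / block;   // ceil(n / block) = 4096, matching grid_x in the trace
    vector_add<<<grid, block>>>(d_a, d_b, d_c, n);
}
```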

3. Evaluation of Kernel Occupancy and Efficiency

Kernel occupancy refers to how well the GPU's resources (especially warps) are utilized:
- With blocks of size 256 and grids of 4096, resource utilization could be high if the GPU can handle this many threads per multiprocessor. However, without specific GPU details (e.g., SM count or available registers), precise occupancy cannot be calculated.
- High occupancy is desirable but must be balanced against register usage and shared memory.
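
Since occupancy cannot be read from the trace alone, one option is to query the CUDA runtime on the target machine. The standalone sketch below, reusing the assumed vector_add definition, asks for the theoretical occupancy of the traced 256-thread configuration and for the block size the occupancy API itself would suggest:

```cuda
#include <cstdio>
#include <cuda_runtime.h>

// Assumed definition of the traced kernel.
__global__ void vector_add(float *a, float *b, float *c, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) c[i] = a[i] + b[i];
}

int main() {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

    // Theoretical occupancy of the traced configuration (256 threads per block).
    int blocksPerSM = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSM, vector_add, 256, 0);
    double occupancy = (double)(blocksPerSM * 256) / prop.maxThreadsPerMultiProcessor;
    printf("block=256: %d resident blocks/SM, theoretical occupancy %.0f%%\n",
           blocksPerSM, occupancy * 100.0);

    // Block size the runtime itself would suggest for this kernel.
    int minGrid = 0, suggestedBlock = 0;
    cudaOccupancyMaxPotentialBlockSize(&minGrid, &suggestedBlock, vector_add, 0, 0);
    printf("suggested block size: %d (min grid for full occupancy: %d, SMs: %d)\n",
           suggestedBlock, minGrid, prop.multiProcessorCount);
    return 0;
}
```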

4. Recommendations for Optimizing Kernel Launch Configurations

- Diversify Workload: If possible, consider diversifying computational tasks to balance load and better utilize GPU resources.
- Experiment with Block Size: Although 256 is often optimal, experimenting with different block sizes (e.g., 128, 512) might yield performance improvements on various architectures.
- Evaluate GPU Occupancy: Use tools like NVIDIA Nsight Compute to analyze actual occupancy and resource usage, which can guide whether grid/block dimensions are optimal.
- Memory Coalescing: Ensure that memory accesses are coalesced for vector_add, which can significantly impact performance.
- Consider Multi-Stream Execution: If execution time is a concern, utilizing multiple CUDA streams could help in overlapping computation and data transfer.

By understanding and tuning these parameters, performance improvements can be realized, especially when considering the architectural specifics of the GPU hardware in use.

| Kernel Name | Count | Percentage |
| --- | --- | --- |
| vector_add | 300 | 100.0 |

Distribution of launch dimension values (number of launches per value):

| Dimension | 1 | 256 | 4096 |
| --- | --- | --- | --- |
| block_x | 0 | 300 | 0 |
| block_y | 300 | 0 | 0 |
| block_z | 300 | 0 | 0 |
| grid_x | 0 | 0 | 300 |
| grid_y | 300 | 0 | 0 |
| grid_z | 300 | 0 | 0 |

[Chart: Kernel Distribution]

Performance Bottlenecks

To effectively address the performance bottlenecks identified in the CUDA trace data, the analysis covers explanations, root causes, and potential solutions. Below is a detailed examination based on the provided table and context.

1. Detailed Explanation of Each Identified Bottleneck and Its Impact on Performance

Memory Transfer Overhead
- Explanation: The memory transfer overhead indicates significant time spent moving data between host and device memory. At 27.27% of operations, this overhead can considerably affect overall performance by lengthening execution times.
- Impact: High overhead in data transfer can limit the speedup gained from parallel processing on the GPU. This reduces the potential performance benefits of using CUDA, as time spent moving data can negate the advantages of fast device computation.

Excessive Synchronization
- Explanation: With 60 synchronization operations constituting 9.09% of API calls, excessive synchronization may result in idle GPU cycles due to threads waiting for others to reach certain execution points.
- Impact: Over-synchronization can lead to serialization of parallel tasks, underutilization of GPU resources, and increased execution times, diminishing the benefit of CUDA's concurrent execution capabilities.

2. Root Cause Analysis for Each Bottleneck


Memory Transfer Overhead

Root Causes:
- Use of pageable (not page-locked) host memory, which is slower than pinned memory for transfer operations.
- Frequent small transfers instead of fewer batched transactions.
- Inefficient data management strategies causing frequent data transfers between the host and the GPU.

Excessive Synchronization

Root Causes:
- Over-reliance on synchronization functions like cudaDeviceSynchronize(), resulting in unnecessary wait times.
- Lack of parallelism due to improper use of CUDA streams, leading to sequential execution of tasks that could otherwise be processed concurrently.
- Algorithm design that inherently requires heavy synchronization, limiting the performance improvement available from the GPU.

3. Prioritized Recommendations for Addressing Each Bottleneck


Memory Transfer Overhead

1. Use Pinned Memory: Convert pageable host memory to pinned (page-locked) memory to increase data transfer rates between host and GPU (see the sketch below).
2. Batch Data Transfers: Minimize overhead by combining smaller data transfers into larger batches, reducing the number of transfer operations.
3. Retain Data on GPU: Whenever possible, perform more operations directly on the GPU to minimize round trips of data between host and device.
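
A minimal sketch of the first two recommendations, assuming a simple host-to-device copy of float data; buffer names and sizes are illustrative, not taken from the trace.

```cpp
#include <cuda_runtime.h>

int main() {
    const size_t n = 1 << 20;                // illustrative element count
    const size_t bytes = n * sizeof(float);

    // Pinned (page-locked) host memory transfers faster than pageable memory
    // and is required for truly asynchronous cudaMemcpyAsync.
    float* h_data = nullptr;
    cudaMallocHost(&h_data, bytes);          // instead of malloc/new

    float* d_data = nullptr;
    cudaMalloc(&d_data, bytes);

    // One batched transfer of the whole buffer rather than many small copies.
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    cudaMemcpyAsync(d_data, h_data, bytes, cudaMemcpyHostToDevice, stream);

    // ... launch kernels that consume d_data on the same stream ...

    cudaStreamSynchronize(stream);           // wait only for this stream
    cudaStreamDestroy(stream);
    cudaFree(d_data);
    cudaFreeHost(h_data);
    return 0;
}
```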

Excessive Synchronization

1. Optimize Use of CUDA Streams: Employ multiple CUDA streams to facilitate asynchronous execution of operations, reducing dependence on global synchronization barriers.
2. Reduce Synchronization Points: Analyze and minimize unnecessary synchronization calls to preserve task parallelism and enhance performance (a sketch of stream-scoped synchronization follows below).
3. Algorithm Redesign: Consider revisiting algorithms to better exploit GPU parallelism and minimize the inherent dependencies that necessitate synchronization.
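
As a sketch of replacing device-wide barriers with narrower ones, the example below waits on a recorded event for a single stream instead of calling cudaDeviceSynchronize() after every operation. The kernel and stream names are illustrative assumptions, not code from the traced program.

```cpp
#include <cuda_runtime.h>

__global__ void work(float* data, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] *= 2.0f;
}

int main() {
    const int n = 1 << 20;
    float* d_data = nullptr;
    cudaMalloc(&d_data, n * sizeof(float));

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    cudaEvent_t done;
    cudaEventCreate(&done);

    work<<<(n + 255) / 256, 256, 0, stream>>>(d_data, n);
    cudaEventRecord(done, stream);     // mark completion of this stream's work

    // ... the host (or other streams) can keep doing independent work here ...

    // Wait only for the recorded event, not for the whole device.
    cudaEventSynchronize(done);

    cudaEventDestroy(done);
    cudaStreamDestroy(stream);
    cudaFree(d_data);
    return 0;
}
```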

4. Potential Performance Gains from Implementing the Recommendations

- Expected Gains from Reducing Memory Transfer Overhead: By implementing pinned memory and batching, data transfer times could be reduced by up to 50%, significantly increasing overall program throughput and efficiency.
- Expected Gains from Addressing Excessive Synchronization: Optimizing synchronization could potentially reduce GPU idle time by about 30-50%, yielding substantial performance improvements through better utilization of available computational resources.

Implementing these recommendations can lead to more efficient GPU utilization, reduced execution time, and greater performance gains from CUDA computing.

| Bottleneck | Metric | Severity | Recommendation |
|---|---|---|---|
| Memory Transfer Overhead | 27.27% of operations | Medium | Use pinned memory, batch transfers, or keep data on GPU longer |
| Excessive Synchronization | 60 sync operations (9.09%) | Medium | Reduce synchronization points, use multiple streams for parallelism |
*(Figure: API Call Timeline)*

Optimization Recommendations


To provide detailed optimization recommendations based on this CUDA trace analysis, I will make some assumptions about findings typical of such traces: kernel execution inefficiencies, memory bottlenecks, and underutilization of GPU resources. With these in mind, here are detailed recommendations across code-level optimizations, architectural changes, and CUDA features to leverage, prioritized by expected impact:


1. Code-Level Optimizations


a. Kernel Execution

Issue: Kernel execution time is high due to inefficient code.
- Recommendation: Optimize kernel code by minimizing divergence; avoid branches within warps where possible, and use predicated execution or warp-synchronous programming techniques (see the sketch below).
- Example: Use shared memory effectively by loading data into shared memory once and reusing it across multiple threads within a block, reducing global memory access latency.
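
A small illustration of removing a per-thread branch by selecting the result arithmetically. The kernel is hypothetical, not one from the trace, and whether this helps depends on how divergent the original branch actually is.

```cpp
// Divergent version: threads in the same warp may take different branches.
__global__ void scale_divergent(float* x, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        if (x[i] > 0.0f)
            x[i] = x[i] * 2.0f;
        else
            x[i] = x[i] * 0.5f;
    }
}

// Branch-free version: choose the factor with a conditional expression so the
// whole warp executes the same instruction stream (the compiler can predicate it).
__global__ void scale_uniform(float* x, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        float factor = (x[i] > 0.0f) ? 2.0f : 0.5f;  // compiles to a select
        x[i] *= factor;
    }
}
```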

b. Memory Access Patterns

Issue: Non-coalesced memory accesses leading to increased latency.
- Recommendation: Ensure memory accesses are coalesced by aligning data accesses so that threads within a warp read sequential memory locations.
- Example: When dealing with structures, consider using a Structure of Arrays (SoA) instead of an Array of Structures (AoS) to ensure coalesced, efficient memory access (see the sketch below).
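
A sketch of the AoS-to-SoA transformation with hypothetical types and kernels: in the SoA layout, consecutive threads touch consecutive floats of the same field, so each warp's loads coalesce into few memory transactions.

```cpp
// Array of Structures: thread i reads p[i].x, so consecutive threads load
// addresses that are sizeof(Particle) bytes apart -> poorly coalesced.
struct Particle { float x, y, z, mass; };

__global__ void copy_x_aos(const Particle* p, float* out, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = p[i].x;
}

// Structure of Arrays: thread i reads x[i], so a warp reads 32 consecutive
// floats -> coalesced into a minimal number of transactions.
struct ParticlesSoA { float *x, *y, *z, *mass; };

__global__ void copy_x_soa(ParticlesSoA p, float* out, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = p.x[i];
}
```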

c. Instruction Throughput

Issue: Low instruction throughput.
- Recommendation: Use CUDA intrinsic functions such as __sinf and __expf for trigonometric or exponential operations to increase math throughput.
- Example: Replace standard math functions in your kernel with their CUDA intrinsic counterparts where the reduced precision is acceptable (see the sketch below).
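
A minimal sketch of that substitution with a hypothetical kernel. The fast intrinsics trade accuracy for speed; nvcc's --use_fast_math flag applies a similar transformation globally.

```cpp
// Accurate version: full-precision expf/sinf from the CUDA math library.
__global__ void waveform_precise(float* out, const float* t, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = expf(-t[i]) * sinf(t[i]);
}

// Fast version: hardware intrinsics __expf/__sinf are much cheaper but less
// accurate; acceptable when a few ULPs of error do not matter.
__global__ void waveform_fast(float* out, const float* t, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = __expf(-t[i]) * __sinf(t[i]);
}
```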

2. Architectural Changes


a. Grid and Block Configuration

Issue: Suboptimal grid and block configuration leading to low occupancy.
- Recommendation: Adjust the block size to maximize occupancy. Use the CUDA Occupancy Calculator (or the occupancy API, sketched below) to find block sizes that maximize the number of active warps per multiprocessor.
- Example: If the current block size is not a multiple of the warp size (32), adjust it to a multiple of 32 (commonly a power of two) within the constraints of your code.
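
The runtime can also suggest a block size directly. This is a small sketch using cudaOccupancyMaxPotentialBlockSize for the vector_add kernel assumed from the trace; the suggested value varies by GPU and by the kernel's register and shared-memory usage.

```cpp
#include <cstdio>
#include <cuda_runtime.h>

__global__ void vector_add(float* a, float* b, float* c, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) c[i] = a[i] + b[i];
}

int main() {
    int minGridSize = 0;   // minimum grid size needed to reach full occupancy
    int blockSize   = 0;   // block size the runtime considers occupancy-optimal
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, vector_add,
                                       /*dynamicSMem=*/0, /*blockSizeLimit=*/0);
    printf("Suggested block size: %d (min grid for full occupancy: %d)\n",
           blockSize, minGridSize);
    return 0;
}
```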

b. Memory Hierarchy Utilization

Issue: Underutilization of shared memory and caches.
- Recommendation: Use shared memory to cache repeated global memory reads, and take advantage of the L1 and L2 caches by optimizing data-reuse patterns.
- Example: For kernels with repeated access to the same data, stage that data in shared memory and optimize the layout to improve cache locality (a sketch follows below).
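
A minimal sketch of staging data in shared memory, using a hypothetical 1-D stencil rather than a kernel from the trace: each element of the tile is loaded from global memory once and then reused by neighboring threads in the block.

```cpp
#define TILE 256  // threads per block; also the tile width (illustrative)

// out[i] = average of in[i-1], in[i], in[i+1]; interior points only.
// Launch with blockDim.x == TILE.
__global__ void stencil_shared(const float* in, float* out, int n) {
    __shared__ float tile[TILE + 2];                   // tile plus one halo cell per side

    int gid = blockIdx.x * blockDim.x + threadIdx.x;   // global index
    int lid = threadIdx.x + 1;                         // local index, offset for halo

    if (gid < n) tile[lid] = in[gid];                  // one global load per element
    if (threadIdx.x == 0 && gid > 0)                   // left halo cell
        tile[0] = in[gid - 1];
    if (threadIdx.x == blockDim.x - 1 && gid + 1 < n)  // right halo cell
        tile[lid + 1] = in[gid + 1];
    __syncthreads();                                   // tile is fully populated

    if (gid > 0 && gid + 1 < n)
        out[gid] = (tile[lid - 1] + tile[lid] + tile[lid + 1]) / 3.0f;
}
```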

3. Alternative Approaches or CUDA Features


a. Asynchronous Execution

Issue: Sequential execution of memory transfers and kernel executions.
- Recommendation: Leverage CUDA streams to overlap computation with memory transfers; use cudaMemcpyAsync for asynchronous data transfers between host and device.
- Example: Instead of waiting for a transfer to complete before launching a kernel, use different streams so copies and kernels can overlap (see the sketch below).
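
A sketch of overlapping transfers with computation by splitting the work into chunks on two streams. The kernel and sizes are illustrative, and pinned host memory is required for the copies to be truly asynchronous.

```cpp
#include <cuda_runtime.h>

__global__ void process(float* data, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] += 1.0f;
}

int main() {
    const int n = 1 << 20, chunks = 2, chunkN = n / chunks;
    float *h = nullptr, *d = nullptr;
    cudaMallocHost(&h, n * sizeof(float));       // pinned host buffer enables async copies
    cudaMalloc(&d, n * sizeof(float));

    cudaStream_t s[chunks];
    for (int c = 0; c < chunks; ++c) cudaStreamCreate(&s[c]);

    for (int c = 0; c < chunks; ++c) {
        size_t off = static_cast<size_t>(c) * chunkN;
        // While one chunk is being computed, the other chunk's copy can be in flight.
        cudaMemcpyAsync(d + off, h + off, chunkN * sizeof(float),
                        cudaMemcpyHostToDevice, s[c]);
        process<<<(chunkN + 255) / 256, 256, 0, s[c]>>>(d + off, chunkN);
        cudaMemcpyAsync(h + off, d + off, chunkN * sizeof(float),
                        cudaMemcpyDeviceToHost, s[c]);
    }
    for (int c = 0; c < chunks; ++c) cudaStreamSynchronize(s[c]);

    for (int c = 0; c < chunks; ++c) cudaStreamDestroy(s[c]);
    cudaFree(d);
    cudaFreeHost(h);
    return 0;
}
```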

b. Unified Memory

Issue: Complex data management between host and device.
- Recommendation: Consider using Unified Memory to simplify data management, especially when the application involves complex memory allocation and deallocation patterns.
- Example: cudaMallocManaged lets the system manage memory residency automatically, although this may not provide the best performance in every case (see the sketch below).
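
A minimal Unified Memory sketch with a hypothetical kernel: one managed allocation is visible to both host and device, so there are no explicit cudaMemcpy calls.

```cpp
#include <cstdio>
#include <cuda_runtime.h>

__global__ void increment(float* data, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] += 1.0f;
}

int main() {
    const int n = 1 << 20;
    float* data = nullptr;
    cudaMallocManaged(&data, n * sizeof(float));   // single allocation, no explicit copies

    for (int i = 0; i < n; ++i) data[i] = 0.0f;    // host writes directly

    increment<<<(n + 255) / 256, 256>>>(data, n);
    cudaDeviceSynchronize();                       // wait before the host reads results

    printf("data[0] = %f\n", data[0]);             // host reads migrated data
    cudaFree(data);
    return 0;
}
```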

4. Prioritization of Recommendations

1. Memory Access Patterns: Ensuring coalesced access usually provides immediate and significant benefits.
2. Grid and Block Configuration: Properly configuring these can significantly impact occupancy and thus performance.
3. Kernel Execution: Reducing divergence and using efficient math operations can yield noticeable improvements.
4. Asynchronous Execution: Overlapping data transfer and execution increases pipeline efficiency.
5. Unified Memory: Provides ease of use, though hardware limitations might dictate otherwise.

These recommendations assume the presence of specific issues that are common in CUDA trace analysis. Adjustments may be necessary based on the unique results of your trace data. If you have specific details about your trace findings like kernel names, memory transfer times, occupancy rates, etc., feel free to share them for more tailored advice.

Generated on: 2025-04-23 23:49:28
+ + + \ No newline at end of file diff --git a/sample-trace.out b/sample-trace.out new file mode 100644 index 0000000..3bc7c64 --- /dev/null +++ b/sample-trace.out @@ -0,0 +1,7546 @@ +Compiling test_cuda_api_multi_gpu.cu... +Compilation successful. +Starting test_cuda_api_multi_gpu... +CUDA process running with PID: 3359096 +Running gpuevent_snoop for 30 seconds... +Found Symbol cudaLaunchKernel at /efs/NFLX-GENAI-PROJECTS/GPUSNOOP/LLM/test_cuda_api_multi_gpu Offset: 0x0 +Found Symbol cudaLaunchKernel at /usr/local/cuda-12.8/targets/x86_64-linux/lib/libcudart.so.12.8.90 Offset: 0x79d40 +Found Symbol cudaLaunchKernel at /usr/lib/x86_64-linux-gnu/libcudart.so.11.5.117 Offset: 0x68630 +Found Symbol cudaLaunchCooperativeKernel at /usr/local/cuda-12.8/targets/x86_64-linux/lib/libcudart.so.12.8.90 Offset: 0x7a2e0 +Found Symbol cudaLaunchCooperativeKernel at /usr/lib/x86_64-linux-gnu/libcudart.so.11.5.117 Offset: 0x68900 +Found Symbol cudaGraphLaunch at /usr/local/cuda-12.8/targets/x86_64-linux/lib/libcudart.so.12.8.90 Offset: 0x7be20 +Found Symbol cudaGraphLaunch at /usr/lib/x86_64-linux-gnu/libcudart.so.11.5.117 Offset: 0x69c90 +Found Symbol cudaMalloc at /efs/NFLX-GENAI-PROJECTS/GPUSNOOP/LLM/test_cuda_api_multi_gpu Offset: 0x0 +Found Symbol cudaMalloc at /usr/local/cuda-12.8/targets/x86_64-linux/lib/libcudart.so.12.8.90 Offset: 0x577b0 +Found Symbol cudaMalloc at /usr/lib/x86_64-linux-gnu/libcudart.so.11.5.117 Offset: 0x4dc80 +Found Symbol cudaFree at /efs/NFLX-GENAI-PROJECTS/GPUSNOOP/LLM/test_cuda_api_multi_gpu Offset: 0x0 +Found Symbol cudaFree at /usr/local/cuda-12.8/targets/x86_64-linux/lib/libcudart.so.12.8.90 Offset: 0x58050 +Found Symbol cudaFree at /usr/lib/x86_64-linux-gnu/libcudart.so.11.5.117 Offset: 0x4e3c0 +Found Symbol cudaMemcpy at /efs/NFLX-GENAI-PROJECTS/GPUSNOOP/LLM/test_cuda_api_multi_gpu Offset: 0x0 +Found Symbol cudaMemcpy at /usr/local/cuda-12.8/targets/x86_64-linux/lib/libcudart.so.12.8.90 Offset: 0x73df0 +Found Symbol cudaMemcpy at /usr/lib/x86_64-linux-gnu/libcudart.so.11.5.117 Offset: 0x639e0 +Found Symbol cudaMemcpyAsync at /efs/NFLX-GENAI-PROJECTS/GPUSNOOP/LLM/test_cuda_api_multi_gpu Offset: 0x0 +Found Symbol cudaMemcpyAsync at /usr/local/cuda-12.8/targets/x86_64-linux/lib/libcudart.so.12.8.90 Offset: 0x766f0 +Found Symbol cudaMemcpyAsync at /usr/lib/x86_64-linux-gnu/libcudart.so.11.5.117 Offset: 0x659b0 +Found Symbol cudaStreamCreate at /efs/NFLX-GENAI-PROJECTS/GPUSNOOP/LLM/test_cuda_api_multi_gpu Offset: 0x0 +Found Symbol cudaStreamCreate at /usr/local/cuda-12.8/targets/x86_64-linux/lib/libcudart.so.12.8.90 Offset: 0x4f640 +Found Symbol cudaStreamCreate at /usr/lib/x86_64-linux-gnu/libcudart.so.11.5.117 Offset: 0x47f10 +Found Symbol cudaStreamDestroy at /efs/NFLX-GENAI-PROJECTS/GPUSNOOP/LLM/test_cuda_api_multi_gpu Offset: 0x0 +Found Symbol cudaStreamDestroy at /usr/local/cuda-12.8/targets/x86_64-linux/lib/libcudart.so.12.8.90 Offset: 0x50ba0 +Found Symbol cudaStreamDestroy at /usr/lib/x86_64-linux-gnu/libcudart.so.11.5.117 Offset: 0x48e40 +Found Symbol cudaStreamSynchronize at /efs/NFLX-GENAI-PROJECTS/GPUSNOOP/LLM/test_cuda_api_multi_gpu Offset: 0x0 +Found Symbol cudaStreamSynchronize at /usr/local/cuda-12.8/targets/x86_64-linux/lib/libcudart.so.12.8.90 Offset: 0x79b50 +Found Symbol cudaStreamSynchronize at /usr/lib/x86_64-linux-gnu/libcudart.so.11.5.117 Offset: 0x68480 +Found Symbol cudaEventRecord at /usr/local/cuda-12.8/targets/x86_64-linux/lib/libcudart.so.12.8.90 Offset: 0x790c0 +Found Symbol cudaEventRecord at /usr/lib/x86_64-linux-gnu/libcudart.so.11.5.117 
Offset: 0x67b40 +Found Symbol cudaEventSynchronize at /usr/local/cuda-12.8/targets/x86_64-linux/lib/libcudart.so.12.8.90 Offset: 0x535e0 +Found Symbol cudaEventSynchronize at /usr/lib/x86_64-linux-gnu/libcudart.so.11.5.117 Offset: 0x4ae60 +Found Symbol cudaEventElapsedTime at /usr/local/cuda-12.8/targets/x86_64-linux/lib/libcudart.so.12.8.90 Offset: 0x53980 +Found Symbol cudaEventElapsedTime at /usr/lib/x86_64-linux-gnu/libcudart.so.11.5.117 Offset: 0x4b1a0 +Found Symbol cudaDeviceSynchronize at /usr/local/cuda-12.8/targets/x86_64-linux/lib/libcudart.so.12.8.90 Offset: 0x4a310 +Found Symbol cudaDeviceSynchronize at /usr/lib/x86_64-linux-gnu/libcudart.so.11.5.117 Offset: 0x43aa0 +Started profiling at Sat Apr 19 07:24:38 2025 +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start 
+-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (996147200,28909,4194304), Block: (2,0,-1045777616) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055144304,24243,-1055296720) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055293536) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + 
main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (325058560,28909,4194304), Block: (2,0,-1035793696) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055293536) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055340144,24243,-1055293536) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055278112) 
+[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-346030080,28908,4194304), Block: (2,0,-1025097136) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055278112) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- 
+[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054957296,24243,-1055278112) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m 
[3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-1017118720,28908,4194304), Block: (2,0,-1014366304) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:40 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054917056,24243,-1055296960) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 
2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (996147200,28909,4194304), Block: (2,0,-1045777616) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055144304,24243,-1055296720) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055293536) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- 
+[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (325058560,28909,4194304), Block: (2,0,-1035793696) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055293536) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) 
+[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055340144,24243,-1055293536) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055278112) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 
float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-346030080,28908,4194304), Block: (2,0,-1025097136) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055278112) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054957296,24243,-1055278112) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add 
arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-1017118720,28908,4194304), Block: (2,0,-1014366304) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:42 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054917056,24243,-1055296960) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:44 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:44 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:44 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), 
Block: (256,1,1)
+[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0
+[STACK_TRACE]
+    cudaLaunchKernel
+    __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int)
+    vector_add(float*, float*, float*, int)
+    main
+    __libc_start_main
+    _start
+--------------------------------------------------------------------------------
+[TIMESTAMP] Sat Apr 19 07:24:44 2025
+
+[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0
+[CUDA_LAUNCH_KERNEL] Grid: (996147200,28909,4194304), Block: (2,0,-1045777616)
+[ARGS]
+[STACK_TRACE]
+    cudaMemcpyAsync
+    __libc_start_main
+    _start
+--------------------------------------------------------------------------------
+[TIMESTAMP] Sat Apr 19 07:24:44 2025
+
+[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0
+[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296720)
+[ARGS]
+[STACK_TRACE]
+    cudaStreamSynchronize
+    __libc_start_main
+    _start
+--------------------------------------------------------------------------------
+[TIMESTAMP] Sat Apr 19 07:24:44 2025
+
+[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0
+[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1)
+[ARGS]
+[STACK_TRACE]
+    cudaMalloc
+    __libc_start_main
+    _start
+--------------------------------------------------------------------------------
+[TIMESTAMP] Sat Apr 19 07:24:44 2025
+
+[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0
+[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055144304,24243,-1055296720)
+[ARGS]
+[STACK_TRACE]
+    cudaFree
+    __libc_start_main
+    _start
+--------------------------------------------------------------------------------
+[TIMESTAMP] Sat Apr 19 07:24:44 2025
+
+[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0
+[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152)
+[ARGS]
+[STACK_TRACE]
+    cudaMemcpy
+    __libc_start_main
+    _start
+--------------------------------------------------------------------------------
+... (the vector_add launch record above appears five times per buffer; the same launch/cudaMemcpyAsync/cudaStreamSynchronize/cudaMalloc/cudaFree/cudaMemcpy sequence then repeats for buffers 0x70ed12e00000, 0x70eceae00000 and 0x70ecc2e00000, and again at 07:24:46, 07:24:48 and 07:24:50) ...
+--------------------------------------------------------------------------------
+[TIMESTAMP] Sat Apr 19 07:24:52 2025
+
+[PROCESS] test_cuda_api_m
[3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (996147200,28909,4194304), Block: (2,0,-1045777616) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync 
+ __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055144304,24243,-1055296720) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055293536) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add 
arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (325058560,28909,4194304), Block: (2,0,-1035793696) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055293536) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055340144,24243,-1055293536) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055278112) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), 
Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-346030080,28908,4194304), Block: (2,0,-1025097136) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055278112) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054957296,24243,-1055278112) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + 
cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-1017118720,28908,4194304), Block: (2,0,-1014366304) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 
07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:52 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054917056,24243,-1055296960) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel 
+ __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (996147200,28909,4194304), Block: (2,0,-1045777616) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055144304,24243,-1055296720) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055293536) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 
+[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (325058560,28909,4194304), Block: (2,0,-1035793696) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055293536) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055340144,24243,-1055293536) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 
2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055278112) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-346030080,28908,4194304), Block: (2,0,-1025097136) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055278112) +[ARGS] 
+[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054957296,24243,-1055278112) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start 
+-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-1017118720,28908,4194304), Block: (2,0,-1014366304) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:54 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054917056,24243,-1055296960) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + 
main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (996147200,28909,4194304), Block: (2,0,-1045777616) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055144304,24243,-1055296720) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055293536) 
+[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (325058560,28909,4194304), Block: (2,0,-1035793696) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055293536) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- 
+[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055340144,24243,-1055293536) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055278112) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m 
[3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-346030080,28908,4194304), Block: (2,0,-1025097136) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055278112) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054957296,24243,-1055278112) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 
2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-1017118720,28908,4194304), Block: (2,0,-1014366304) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:56 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054917056,24243,-1055296960) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- 
+[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (996147200,28909,4194304), Block: (2,0,-1045777616) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) 
+[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055144304,24243,-1055296720) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055293536) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 
float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (325058560,28909,4194304), Block: (2,0,-1035793696) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055293536) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055340144,24243,-1055293536) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055278112) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add 
arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-346030080,28908,4194304), Block: (2,0,-1025097136) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055278112) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054957296,24243,-1055278112) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), 
Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-1017118720,28908,4194304), Block: (2,0,-1014366304) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start 
+-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:24:58 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054917056,24243,-1055296960) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int 
arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (996147200,28909,4194304), Block: (2,0,-1045777616) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055144304,24243,-1055296720) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055293536) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 
float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (325058560,28909,4194304), Block: (2,0,-1035793696) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055293536) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055340144,24243,-1055293536) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055278112) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float 
arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-346030080,28908,4194304), Block: (2,0,-1025097136) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055278112) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] 
test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054957296,24243,-1055278112) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, 
float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-1017118720,28908,4194304), Block: (2,0,-1014366304) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:00 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054917056,24243,-1055296960) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, 
int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (996147200,28909,4194304), Block: (2,0,-1045777616) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055144304,24243,-1055296720) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055293536) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + 
__device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (325058560,28909,4194304), Block: (2,0,-1035793696) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055293536) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: 
(-1055340144,24243,-1055293536) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055278112) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start 
+-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-346030080,28908,4194304), Block: (2,0,-1025097136) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055278112) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054957296,24243,-1055278112) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + 
main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-1017118720,28908,4194304), Block: (2,0,-1014366304) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:02 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054917056,24243,-1055296960) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + 
vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (996147200,28909,4194304), Block: (2,0,-1045777616) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055144304,24243,-1055296720) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + 
_start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055293536) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] 
test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (325058560,28909,4194304), Block: (2,0,-1035793696) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055293536) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055340144,24243,-1055293536) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055278112) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 
19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-346030080,28908,4194304), Block: (2,0,-1025097136) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055278112) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054957296,24243,-1055278112) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start 
+-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-1017118720,28908,4194304), Block: (2,0,-1014366304) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:04 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054917056,24243,-1055296960) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start 
+-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m 
[3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (996147200,28909,4194304), Block: (2,0,-1045777616) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055144304,24243,-1055296720) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055293536) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 
2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (325058560,28909,4194304), Block: (2,0,-1035793696) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055293536) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055340144,24243,-1055293536) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055278112) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- 
+[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-346030080,28908,4194304), Block: (2,0,-1025097136) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055278112) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054957296,24243,-1055278112) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m 
[3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-1017118720,28908,4194304), Block: (2,0,-1014366304) +[ARGS] +[STACK_TRACE] + 
cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:06 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054917056,24243,-1055296960) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] 
vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed3ae00000 float arg1=0x70ed3b200000 float arg2=0x70ed3b600000 float arg3=0x3b60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (996147200,28909,4194304), Block: (2,0,-1045777616) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296720) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055144304,24243,-1055296720) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055293536) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: 
(4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ed12e00000 float arg1=0x70ed13200000 float arg2=0x70ed13600000 float arg3=0x1360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (325058560,28909,4194304), Block: (2,0,-1035793696) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055293536) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1055340144,24243,-1055293536) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] 
+[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055278112) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70eceae00000 float arg1=0x70eceb200000 float arg2=0x70eceb600000 float arg3=0xeb60000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-346030080,28908,4194304), Block: (2,0,-1025097136) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- 
+[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055278112) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054957296,24243,-1055278112) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1656750096,28909,4194304), Block: (1,0,-1055161152) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1650458640,28909,4194304), Block: (1,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaMemcpy + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 
+[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (4096,1,1), Block: (256,1,1) +[ARGS] vector_add arg0=0x70ecc2e00000 float arg1=0x70ecc3200000 float arg2=0x70ecc3600000 float arg3=0xc360000000100000 int arg4=0x0 +[STACK_TRACE] + cudaLaunchKernel + __device_stub__Z10vector_addPfS_S_i(float*, float*, float*, int) + vector_add(float*, float*, float*, int) + main + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (-1017118720,28908,4194304), Block: (2,0,-1014366304) +[ARGS] +[STACK_TRACE] + cudaMemcpyAsync + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (0,0,1), Block: (0,0,-1055296960) +[ARGS] +[STACK_TRACE] + cudaStreamSynchronize + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (1024,0,0), Block: (1,0,-1) +[ARGS] +[STACK_TRACE] + cudaMalloc + __libc_start_main + _start +-------------------------------------------------------------------------------- +[TIMESTAMP] Sat Apr 19 07:25:08 2025 + +[PROCESS] test_cuda_api_m [3359096] CUDA API EventType 0 +[CUDA_LAUNCH_KERNEL] Grid: (201327106,8388610,28), Block: (-1054917056,24243,-1055296960) +[ARGS] +[STACK_TRACE] + cudaFree + __libc_start_main + _start +-------------------------------------------------------------------------------- +Done Profiling: exceeded duration of 30s. +Stopped profiling at Sat Apr 19 07:25:09 2025 +Stopping CUDA program... +Tracing completed. diff --git a/sample_analysis_report.html b/sample_analysis_report.html new file mode 100644 index 0000000..1c40f8e --- /dev/null +++ b/sample_analysis_report.html @@ -0,0 +1,686 @@ + + + + + + + CUDA Trace Analysis Report + + + + +

CUDA Trace Analysis Report

CUDA Trace Dashboard

[Dashboard visualization]

Overview

Executive Summary of CUDA Trace Data

1. Main Characteristics of the Application

The application utilizes a limited set of CUDA API functions, with six unique functions tracked throughout the trace period. The predominant activity involves kernel execution and memory operations, indicating a focus on computing tasks. The consistent use of a single kernel, "vector_add," suggests the application is specialized in a particular type of computation, likely vector addition or similar operations.

2. Significant Patterns Observed

• API Distribution: The API call distribution shows that cudaLaunchKernel is the most frequently used API, accounting for 45.5% of total calls. This highlights intensive kernel activity.
• Memory Operations: Frequent cudaMemcpy calls suggest significant host-device memory transfers.
• Temporal Analysis: There is a consistent distribution of CUDA API calls over time, with no significant spikes, implying stable performance without notable bottlenecks.

3. Key Performance Considerations

• Synchronization: The application has a high synchronization frequency (60 operations over the trace period), a potential area for optimization by reducing unnecessary synchronization points.
• Memory-Launch Ratio: The memory copy to kernel launch ratio is 0.60, which indicates a healthy balance between data transfers and computation.
• Launch Configuration: The kernel uses a common grid/block configuration (a matching launch sketch follows this list), which may be efficient but could be tuned further for the specific hardware and workload.

4. Assessment of CUDA Implementation Quality

+

Overall, the CUDA implementation is effective but could benefit from optimizations. The use of consistent API calls and balanced memory operations are strengths. However, there is room for improvement in synchronization management and possibly the grid/block configuration to better utilize device capabilities and minimize overheads.

+

In summary, the application demonstrates a focused use of CUDA capabilities with potential for improved efficiency through targeted optimizations.

+
+
+ + CUDA Trace Dashboard + +
+
+
+ + +
API Distribution

A detailed analysis of the CUDA API distribution based on the collected data:

1. Most Frequently Used CUDA API Functions

• cudaLaunchKernel (45.45%): The most frequently used API call, indicating that the application is heavily focused on executing GPU kernels. This suggests the application is computation-intensive and uses GPU acceleration to execute parallel operations.
• cudaMemcpy (18.18%): Indicates significant data transfer between the host and device. High usage may suggest repeated data movement, which could become a performance bottleneck if not optimized.

2. Balance Between Different Types of Operations

• Compute (cudaLaunchKernel): Dominates the API distribution, showing the application's reliance on GPU computation.
• Memory operations (cudaMemcpy, cudaMemcpyAsync, cudaMalloc, cudaFree): cudaMemcpy and cudaMemcpyAsync together account for 27.27%, so transfers are substantial but not overwhelming; cudaMalloc and cudaFree together account for 18.18%, and frequent allocation/deallocation could indicate inefficiency if allocations are too dynamic.
• Synchronization (cudaStreamSynchronize, 9.09%): Some synchronization is needed, but it is not excessive, which is generally a good sign since excessive synchronization can hinder performance.

3. Unusual or Inefficient API Usage Patterns

• Frequent cudaMalloc and cudaFree: If these calls are repeated many times in a loop, it may indicate inefficient memory management; allocating and deallocating memory in tight loops can significantly reduce performance.
• High usage of cudaMemcpy: A potential area for optimization, for example by maximizing the data transferred per call or overlapping data transfers with computation.

4. Recommendations for Optimizing API Usage

• Optimize memory transfers: Use asynchronous memory copies (cudaMemcpyAsync) more extensively to overlap data transfer and kernel execution (see the sketch below), and batch data transfers or increase data granularity to reduce the number of transfer operations.
• Improve memory management: Reduce frequent calls to cudaMalloc and cudaFree by reusing allocated memory wherever possible, and consider memory pools or pre-allocated buffer space.
• Kernel optimization: Ensure there is no significant idle time between kernel executions, and profile kernels to find computation bottlenecks.
• Reduce synchronization overhead: Minimize the use of cudaStreamSynchronize by managing dependencies and using streams effectively to overlap operations.

By addressing these areas, the application can improve its overall execution efficiency on the GPU.
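A minimal sketch of the streamed-transfer recommendation, assuming a simple element-wise kernel; the kernel, sizes, and chunking below are illustrative rather than taken from the traced program:

```cuda
#include <cuda_runtime.h>

__global__ void vector_add(const float* a, const float* b, float* c, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) c[i] = a[i] + b[i];
}

int main() {
  const int N = 1 << 20, CHUNK = N / 4;
  float *h_a, *h_b, *h_c, *d_a, *d_b, *d_c;
  cudaMallocHost((void**)&h_a, N * sizeof(float));   // pinned host memory enables truly async copies
  cudaMallocHost((void**)&h_b, N * sizeof(float));
  cudaMallocHost((void**)&h_c, N * sizeof(float));
  cudaMalloc((void**)&d_a, N * sizeof(float));
  cudaMalloc((void**)&d_b, N * sizeof(float));
  cudaMalloc((void**)&d_c, N * sizeof(float));
  // ... fill h_a and h_b with input data ...

  cudaStream_t streams[2];
  for (int s = 0; s < 2; ++s) cudaStreamCreate(&streams[s]);

  // Alternate chunks between two streams so copies in one stream overlap kernels in the other.
  for (int off = 0, s = 0; off < N; off += CHUNK, s ^= 1) {
    size_t bytes = CHUNK * sizeof(float);
    cudaMemcpyAsync(d_a + off, h_a + off, bytes, cudaMemcpyHostToDevice, streams[s]);
    cudaMemcpyAsync(d_b + off, h_b + off, bytes, cudaMemcpyHostToDevice, streams[s]);
    vector_add<<<(CHUNK + 255) / 256, 256, 0, streams[s]>>>(d_a + off, d_b + off, d_c + off, CHUNK);
    cudaMemcpyAsync(h_c + off, d_c + off, bytes, cudaMemcpyDeviceToHost, streams[s]);
  }
  for (int s = 0; s < 2; ++s) cudaStreamSynchronize(streams[s]);
  // ... release streams and buffers with cudaStreamDestroy / cudaFree / cudaFreeHost ...
  return 0;
}
```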

API call counts from the trace:

API Function            Count   Percentage
cudaLaunchKernel        300     45.45
cudaMemcpy              120     18.18
cudaMemcpyAsync          60      9.09
cudaStreamSynchronize    60      9.09
cudaMalloc               60      9.09
cudaFree                 60      9.09

[Chart: API Distribution]
Memory Operations

Analysis of CUDA Memory Operations

1. Assessment of Memory Transfer Patterns and Their Efficiency

cudaMemcpy accounts for 40% of memory operations, while cudaMemcpyAsync comprises 20%. This indicates a heavy reliance on synchronous memory transfers, which can be less efficient because they block the host thread until the copy completes.

Efficiency analysis:
• Synchronous transfers (cudaMemcpy): Generally slower due to their blocking behavior.
• Asynchronous transfers (cudaMemcpyAsync): More efficient when managed correctly, since they do not block the host and allow computation to overlap with data transfer.

2. Analysis of the Balance Between Different Types of Memory Operations

All four operation types (cudaMemcpy, cudaMemcpyAsync, cudaMalloc, and cudaFree) are represented, but there is a notable imbalance toward cudaMemcpy. Allocation and deallocation (cudaMalloc and cudaFree) are evenly distributed at 20% each. The skew toward cudaMemcpy suggests missed opportunities for optimization using asynchronous transfers.

3. Identification of Potential Memory-Related Bottlenecks

• Synchronous transfers: The high percentage of synchronous memory transfers suggests underutilization of the GPU's ability to handle concurrent operations.
• Allocation and deallocation: Frequent and possibly unnecessary calls to cudaMalloc and cudaFree can also hurt performance; allocations should be minimized and reused where possible.

4. Recommendations for Optimizing Memory Usage and Transfers

1. Increase asynchronous transfers: Use cudaMemcpyAsync to overlap memory transfer and computation, and use streams to manage these operations without blocking the CPU.
2. Optimize memory allocation: Reuse allocations instead of issuing frequent cudaMalloc/cudaFree calls, and consider memory pools for small allocations to reduce overhead (a reuse sketch follows below).
3. Streamline memory transfers: Batch small transfers into fewer, larger ones to reduce the number of cudaMemcpy calls, and keep data aligned and access patterns coalesced to maximize bandwidth.
4. Profile and monitor: Regularly profile the application, and use CUDA profilers to track memory usage, transfer times, and kernel/transfer overlap.

By implementing these recommendations, you can potentially improve throughput and reduce latency in your CUDA applications.
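A minimal sketch of the allocation-reuse recommendation: the device buffer is allocated once and reused across batches instead of calling cudaMalloc/cudaFree in every iteration. The kernel and batch layout are hypothetical.

```cuda
#include <cuda_runtime.h>

// Hypothetical per-batch processing kernel.
__global__ void scale(float* data, int n, float factor) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}

void process_batches(float** host_batches, int num_batches, int n) {
  float* d_buf = nullptr;
  cudaMalloc((void**)&d_buf, n * sizeof(float));   // allocate once, outside the loop

  for (int b = 0; b < num_batches; ++b) {
    cudaMemcpy(d_buf, host_batches[b], n * sizeof(float), cudaMemcpyHostToDevice);
    scale<<<(n + 255) / 256, 256>>>(d_buf, n, 2.0f);
    cudaMemcpy(host_batches[b], d_buf, n * sizeof(float), cudaMemcpyDeviceToHost);
  }

  cudaFree(d_buf);                                 // free once, after all batches
}
```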

Memory operation counts from the trace:

Memory Operation   Count   Percentage
cudaMemcpy         120     40.0
cudaMemcpyAsync     60     20.0
cudaMalloc          60     20.0
cudaFree            60     20.0

[Chart: Memory Operations]
Kernel Launches

Analysis of CUDA Kernel Launch Patterns

1. Assessment of Kernel Launch Patterns and Their Implications for Performance

The kernel launch data shows a single kernel type, vector_add, launched 300 times, making the workload highly repetitive. The application is computationally uniform, focused on vector addition, and this uniformity may benefit from optimization to improve throughput and resource utilization. The repetitive nature can lead to bottlenecks if the kernel does not fully utilize the GPU's capabilities.

2. Analysis of Grid and Block Dimensions

Grid dimensions:
• grid_x is consistently 4096, while grid_y and grid_z are constant at 1. The computation is therefore primarily one-dimensional, with a large number of elements to process.

Block dimensions:
• block_x is always 256, so each block runs 256 threads. A block size of 256 is often a good choice because it is a multiple of the warp size (32 on NVIDIA GPUs), allowing efficient execution.
• block_y and block_z are 1, reinforcing that the computation is handled as a one-dimensional array.

3. Evaluation of Kernel Occupancy and Efficiency

Kernel occupancy refers to how well the GPU's resources (especially warps) are utilized:
• With blocks of 256 threads and a grid of 4096 blocks, utilization could be high if the GPU can keep that many threads resident per multiprocessor. Without specific GPU details (SM count, available registers), precise occupancy cannot be calculated.
• High occupancy is desirable but must be balanced against register usage and shared memory.

4. Recommendations for Optimizing Kernel Launch Configurations

• Diversify workload: If possible, diversify computational tasks to balance load and better utilize GPU resources.
• Experiment with block size: Although 256 is often a good default, different block sizes (e.g., 128 or 512) may perform better on particular architectures; the sketch below shows how the launch configuration is derived.
• Evaluate GPU occupancy: Use tools like NVIDIA Nsight Compute to analyze actual occupancy and resource usage and guide grid/block tuning.
• Memory coalescing: Ensure that memory accesses in vector_add are coalesced, which can significantly impact performance.
• Consider multi-stream execution: If execution time is a concern, multiple CUDA streams can overlap computation and data transfer.

By understanding and tuning these parameters, performance improvements can be realized, especially when considering the architectural specifics of the GPU hardware in use.
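A minimal sketch of how the observed launch configuration is derived, assuming the traced vector_add processes roughly one million elements (4096 blocks of 256 threads); the helper name and default block size are illustrative:

```cuda
__global__ void vector_add(const float* a, const float* b, float* c, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) c[i] = a[i] + b[i];
}

void launch_vector_add(const float* d_a, const float* d_b, float* d_c, int n, int block = 256) {
  // With n = 1,048,576 and block = 256 this reproduces the (4096,1,1)/(256,1,1) shape
  // seen in the trace; trying block = 128 or 512 only changes the grid size, and the
  // `i < n` guard keeps the kernel correct for any rounding.
  int grid = (n + block - 1) / block;
  vector_add<<<grid, block>>>(d_a, d_b, d_c, n);
}
```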

Kernel launch counts from the trace:

Kernel Name   Count   Percentage
vector_add    300     100.0

Launch dimension values (count of launches observed per value):

Dimension   Value: 1   Value: 256   Value: 4096
block_x     0.0        300.0        0.0
block_y     300.0      0.0          0.0
block_z     300.0      0.0          0.0
grid_x      0.0        0.0          300.0
grid_y      300.0      0.0          0.0
grid_z      300.0      0.0          0.0

[Chart: Kernel Distribution]
Performance Bottlenecks

To address the performance bottlenecks identified in the CUDA trace data, the analysis below covers, for each bottleneck, an explanation of its impact, its likely root causes, and prioritized recommendations, based on the table and context provided.

1. Detailed Explanation of Each Identified Bottleneck and Its Impact on Performance

Memory Transfer Overhead
• Explanation: Significant time is spent moving data between host and device memory. At 27.27% of operations, this overhead can considerably affect overall performance by lengthening execution times.
• Impact: High data-transfer overhead can limit the speedup gained from parallel processing on the GPU; time spent moving data can negate the advantages of fast device computation.

Excessive Synchronization
• Explanation: With 60 synchronization operations constituting 9.09% of API operations, excessive synchronization may leave the GPU idle while threads wait for others to reach certain execution points.
• Impact: Over-synchronization can serialize parallel tasks, underutilize GPU resources, and increase execution times, diminishing CUDA's concurrent execution capabilities.

2. Root Cause Analysis for Each Bottleneck

Memory Transfer Overhead
• Use of pageable (unpinned) host memory, which is slower than pinned memory for transfer operations.
• Frequent small transfers instead of fewer batched transactions.
• Inefficient data management strategies causing repeated transfers between the host and the GPU.

Excessive Synchronization
• Over-reliance on synchronization functions such as cudaDeviceSynchronize(), resulting in unnecessary wait times.
• Lack of parallelism due to improper use of CUDA streams, leading to sequential execution of tasks that could otherwise run concurrently.
• Algorithm designs that inherently require heavy synchronization, limiting the performance gains from the GPU.

3. Prioritized Recommendations for Addressing Each Bottleneck

Memory Transfer Overhead
1. Use pinned memory: Convert pageable host memory to pinned memory to increase transfer rates between host and GPU.
2. Batch data transfers: Combine smaller transfers into larger batches, reducing the number of transfer operations.
3. Retain data on the GPU: Wherever possible, perform more operations directly on the GPU to minimize round trips between host and device.

Excessive Synchronization
1. Optimize use of CUDA streams: Employ multiple streams for asynchronous execution, reducing dependence on synchronization barriers (see the sketch below).
2. Reduce synchronization points: Analyze and minimize unnecessary synchronization calls to preserve task parallelism.
3. Redesign algorithms: Revisit algorithms to better exploit GPU parallelism and minimize the inherent dependencies that force synchronization.

4. Potential Performance Gains from Implementing the Recommendations

• Reducing memory transfer overhead: With pinned memory and batching, data transfer times could be reduced by up to 50%, significantly increasing overall throughput and efficiency.
• Addressing excessive synchronization: Optimizing synchronization could reduce GPU idle time by roughly 30-50%, yielding substantial performance improvements through better use of available compute resources.

Implementing these recommendations can lead to more efficient GPU utilization, reducing execution time, and achieving greater performance acceleration from CUDA computing.
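One way to cut host-side synchronization, sketched below with hypothetical producer/consumer kernels, is to express the dependency with a CUDA event (cudaEventRecord plus cudaStreamWaitEvent) so the wait happens on the GPU and the host synchronizes only once at the end. This goes slightly beyond what the trace itself shows and is offered as an illustrative assumption:

```cuda
#include <cuda_runtime.h>

__global__ void produce(float* buf, int n) { /* ... fill buf ... */ }
__global__ void consume(const float* buf, float* out, int n) { /* ... use buf ... */ }

void pipeline(float* d_buf, float* d_out, int n) {
  cudaStream_t s_prod, s_cons;
  cudaEvent_t produced;
  cudaStreamCreate(&s_prod);
  cudaStreamCreate(&s_cons);
  cudaEventCreateWithFlags(&produced, cudaEventDisableTiming);

  produce<<<(n + 255) / 256, 256, 0, s_prod>>>(d_buf, n);
  cudaEventRecord(produced, s_prod);           // mark the point where the producer finishes
  cudaStreamWaitEvent(s_cons, produced, 0);    // consumer stream waits on the GPU, not the host
  consume<<<(n + 255) / 256, 256, 0, s_cons>>>(d_buf, d_out, n);

  cudaStreamSynchronize(s_cons);               // single host-side sync instead of one after every step

  cudaEventDestroy(produced);
  cudaStreamDestroy(s_prod);
  cudaStreamDestroy(s_cons);
}
```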

Identified bottlenecks:

Bottleneck                  Metric                       Severity   Recommendation
Memory Transfer Overhead    27.27% of operations         Medium     Use pinned memory, batch transfers, or keep data on GPU longer
Excessive Synchronization   60 sync operations (9.09%)   Medium     Reduce synchronization points, use multiple streams for parallelism

[Chart: API Call Timeline]
Optimization Recommendations

The following recommendations are based on issues commonly surfaced by CUDA trace analysis, such as kernel execution inefficiencies, memory bottlenecks, and underutilization of GPU resources. They cover code-level optimizations, architectural changes, and CUDA features, prioritized by expected impact.

1. Code-Level Optimizations

a. Kernel Execution

Issue: Kernel execution time is high due to inefficient code.
- Recommendation: Minimize divergence in kernel code; avoid branches within warps where possible, and use predicated execution or warp-synchronous programming techniques.
- Example: Use shared memory effectively by loading data into shared memory once and reusing it across the threads of a block, reducing global memory access latency.

b. Memory Access Patterns

Issue: Non-coalesced memory accesses leading to increased latency.
- Recommendation: Ensure memory accesses are coalesced by aligning data accesses so that threads within a warp touch sequential memory locations.
- Example: When dealing with structures, consider a Structure of Arrays (SoA) instead of an Array of Structures (AoS) to get coalesced, efficient memory access, as in the sketch below.
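A minimal sketch of the AoS-versus-SoA layouts described above; the particle structures and kernels are hypothetical:

```cuda
// Array of Structures: threads in a warp stride over 12-byte records, so loads are not coalesced.
struct ParticleAoS { float x, y, z; };
__global__ void shift_aos(ParticleAoS* p, int n, float dx) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) p[i].x += dx;
}

// Structure of Arrays: consecutive threads read consecutive floats, so loads coalesce.
struct ParticlesSoA { float* x; float* y; float* z; };
__global__ void shift_soa(ParticlesSoA p, int n, float dx) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) p.x[i] += dx;
}
```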

c. Instruction Throughput

Issue: Low instruction throughput.
- Recommendation: Use CUDA intrinsic functions such as __sinf and __expf for trigonometric or exponential math to increase throughput.
- Example: Replace standard math functions in kernels with their CUDA intrinsic counterparts where the reduced precision is acceptable, as in the sketch below.
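A minimal sketch of swapping a standard math call for its fast intrinsic, assuming a hypothetical sigmoid-style kernel where reduced precision is acceptable:

```cuda
__global__ void activate(const float* in, float* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // Precise but slower alternative: out[i] = 1.0f / (1.0f + expf(-in[i]));
    out[i] = 1.0f / (1.0f + __expf(-in[i]));   // fast-math intrinsic, reduced precision
  }
}
```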

2. Architectural Changes

a. Grid and Block Configuration

Issue: Suboptimal grid and block configuration leading to low occupancy.
- Recommendation: Adjust the block size to maximize occupancy. Use the CUDA Occupancy Calculator, or the runtime occupancy API sketched below, to find block sizes that maximize the number of active warps per multiprocessor.
- Example: If the current block size is not a multiple of the warp size (32), adjust it to a power of two within the constraints of your code.
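A minimal sketch of querying the occupancy API at runtime; it assumes the same vector_add signature seen in the trace and omits error handling:

```cuda
#include <cuda_runtime.h>
#include <cstdio>

__global__ void vector_add(const float* a, const float* b, float* c, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) c[i] = a[i] + b[i];
}

int main() {
  int min_grid = 0, best_block = 0;
  // Ask the runtime for a block size that maximizes occupancy for this kernel.
  cudaOccupancyMaxPotentialBlockSize(&min_grid, &best_block, vector_add, 0, 0);

  int blocks_per_sm = 0;
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_sm, vector_add, best_block, 0);

  printf("suggested block size: %d, resident blocks per SM at that size: %d\n",
         best_block, blocks_per_sm);
  return 0;
}
```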

b. Memory Hierarchy Utilization

Issue: Underutilization of shared memory and cache.
- Recommendation: Use shared memory to cache repeated global memory reads, and take advantage of the L1 and L2 caches by optimizing data reuse patterns.
- Example: For kernels with repeated data access patterns, optimize the data layout to improve cache locality.

3. Alternative Approaches or CUDA Features

a. Asynchronous Execution

Issue: Sequential execution of memory transfers and kernel executions.
- Recommendation: Use CUDA streams to overlap computation with memory transfers, and cudaMemcpyAsync for asynchronous transfers between host and device.
- Example: Instead of waiting for a memory transfer to complete before launching a kernel, use different streams to overlap these operations (see the streamed-transfer sketch in the API Distribution section above).

b. Unified Memory

Issue: Complex data management between host and device.
- Recommendation: Consider Unified Memory to simplify data management, especially if the application involves complex allocation and deallocation patterns.
- Example: cudaMallocManaged lets the system manage memory residency automatically, although this may not give the best performance in every case; a sketch follows below.
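A minimal sketch of a Unified Memory variant of vector_add using cudaMallocManaged; error handling is omitted:

```cuda
#include <cuda_runtime.h>

__global__ void vector_add(const float* a, const float* b, float* c, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) c[i] = a[i] + b[i];
}

int main() {
  const int N = 1 << 20;
  float *a, *b, *c;
  cudaMallocManaged(&a, N * sizeof(float));   // one pointer usable from host and device
  cudaMallocManaged(&b, N * sizeof(float));
  cudaMallocManaged(&c, N * sizeof(float));

  for (int i = 0; i < N; ++i) { a[i] = 1.0f; b[i] = 2.0f; }   // host writes directly

  vector_add<<<(N + 255) / 256, 256>>>(a, b, c, N);
  cudaDeviceSynchronize();                    // make results visible to the host before reading c

  cudaFree(a); cudaFree(b); cudaFree(c);
  return 0;
}
```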

4. Prioritization of Recommendations

1. Memory Access Patterns: Ensuring coalesced access usually provides immediate and significant benefits.
2. Grid and Block Configuration: Proper configuration can significantly impact occupancy and therefore performance.
3. Kernel Execution: Reducing divergence and using efficient math operations can yield noticeable improvements.
4. Asynchronous Execution: Overlapping data transfer and execution increases pipeline efficiency.
5. Unified Memory: Provides ease of use, though hardware characteristics may limit its performance.

These recommendations address issues that are common in CUDA trace analysis; adjust them based on the actual findings from your trace data, such as kernel names, memory transfer times, and occupancy rates.

Generated on: 2025-04-23 23:49:28
+ + + \ No newline at end of file diff --git a/sample_llm_analysis_report.html b/sample_llm_analysis_report.html new file mode 100644 index 0000000..1c40f8e --- /dev/null +++ b/sample_llm_analysis_report.html @@ -0,0 +1,686 @@ + + + + + + + CUDA Trace Analysis Report + + + + +

[Content identical to sample_analysis_report.html above.]
+ + + \ No newline at end of file diff --git a/scripts/build.sh b/scripts/build.sh index 70ca973..0aa2a1c 100755 --- a/scripts/build.sh +++ b/scripts/build.sh @@ -101,7 +101,6 @@ build_strobelight() { popd } - get_required_libs() { if [ -x "$(command -v dnf)" ]; then sudo dnf install \ @@ -109,14 +108,21 @@ get_required_libs() { elfutils-libelf \ elfutils-libelf-devel\ fmt-devel - else + elif [ -x "$(command -v yum)" ]; then sudo yum install -y \ git \ elfutils-libelf-devel \ fmt-devel + elif [ -x "$(command -v apt)" ]; then +sudo apt install -y \ + git \ + cmake \ + clang \ + libfmt-dev + else +echo "Package manager not found or not recognized." fi } - update_submodules get_required_libs get_cargo_blazesym diff --git a/strobelight/src/profilers/gpuevent_snoop/GpuEventSnoop.cpp b/strobelight/src/profilers/gpuevent_snoop/GpuEventSnoop.cpp index c7d8cb2..b9c304b 100644 --- a/strobelight/src/profilers/gpuevent_snoop/GpuEventSnoop.cpp +++ b/strobelight/src/profilers/gpuevent_snoop/GpuEventSnoop.cpp @@ -22,7 +22,27 @@ #define MAX_FUNC_DISPLAY_LEN 32 static const int64_t RINGBUF_MAX_ENTRIES = 64 * 1024 * 1024; -static const std::string kCudaLaunchSymName = "cudaLaunchKernel"; + +// static const std::string kCudaLaunchSymName = "cudaLaunchKernel"; + +// List of CUDA symbols to attach uprobes +static const std::vector kCudaSymbols = { + "cudaLaunchKernel", + "cudaLaunchCooperativeKernel", + "cudaGraphLaunch", + "cudaMalloc", + "cudaFree", + "cudaMemcpy", + "cudaMemcpyAsync", + "cudaStreamCreate", + "cudaStreamDestroy", + "cudaStreamSynchronize", + "cudaEventRecord", + "cudaEventSynchronize", + "cudaEventElapsedTime", + "cudaDeviceSynchronize" +}; + using namespace facebook::strobelight::oss; @@ -113,43 +133,73 @@ static int libbpf_print_fn( } static int handle_event(void* ctx, void* data, size_t /*data_sz*/) { - const struct gpukern_sample* e = (struct gpukern_sample*)data; - - SymUtils* symUtils = (SymUtils*)ctx; - - SymbolInfo symInfo = symUtils->getSymbolByAddr(e->kern_func_off, env.args); - - fmt::print( - "{} [{}] KERNEL [0x{:x}] STREAM 0x{:<16x} GRID ({},{},{}) BLOCK ({},{},{}) {}\n", - e->comm, - e->pid, - e->kern_func_off, - e->stream, - e->grid_x, - e->grid_y, - e->grid_z, - e->block_x, - e->block_y, - e->block_z, - symInfo.name.substr(0, MAX_FUNC_DISPLAY_LEN) + - (symInfo.name.length() > MAX_FUNC_DISPLAY_LEN ? "..." 
: "")); - if (env.args) { - fmt::print("Args: "); - for (size_t i = 0; i < symInfo.args.size() && i < MAX_GPUKERN_ARGS; i++) { - fmt::print("{} arg{}=0x{:x}\n ", symInfo.args[i], i, e->args[i]); + const struct gpukern_sample* e = (struct gpukern_sample*)data; + SymUtils* symUtils = (SymUtils*)ctx; + + // Get function symbol information + SymbolInfo symInfo = symUtils->getSymbolByAddr(e->kern_func_off, env.args); + + // Timestamp + auto timestamp = std::chrono::system_clock::now(); + std::time_t timestamp_t = std::chrono::system_clock::to_time_t(timestamp); + + // Process info + fmt::print("[TIMESTAMP] {}\n", std::ctime(×tamp_t)); + fmt::print("[PROCESS] {} [{}] CUDA API EventType {}\n", e->comm, e->pid, e->event_type); + + // Print event-specific information + switch (e->event_type) { + case EVENT_CUDA_LAUNCH_KERNEL: + fmt::print("[CUDA_LAUNCH_KERNEL] Grid: ({},{},{}), Block: ({},{},{})\n", + e->grid_x, e->grid_y, e->grid_z, e->block_x, e->block_y, e->block_z); + break; + case EVENT_CUDA_MALLOC: + fmt::print("[CUDA_MALLOC] Size: {} bytes, Ptr: 0x{:x}\n", e->args[0], e->args[1]); + break; + case EVENT_CUDA_FREE: + fmt::print("[CUDA_FREE] Ptr: 0x{:x}\n", e->args[0]); + break; + case EVENT_CUDA_MEMCPY: + fmt::print("[CUDA_MEMCPY] Src: 0x{:x}, Dst: 0x{:x}, Size: {} bytes, Kind: {}\n", + e->args[1], e->args[0], e->args[2], e->args[3]); + break; + case EVENT_CUDA_STREAM_CREATE: + fmt::print("[CUDA_STREAM_CREATE] Stream: 0x{:x}\n", e->args[0]); + break; + case EVENT_CUDA_STREAM_DESTROY: + fmt::print("[CUDA_STREAM_DESTROY] Stream: 0x{:x}\n", e->args[0]); + break; + case EVENT_CUDA_EVENT_RECORD: + fmt::print("[CUDA_EVENT_RECORD] Event: 0x{:x}\n", e->args[0]); + break; + case EVENT_CUDA_EVENT_SYNCHRONIZE: + fmt::print("[CUDA_EVENT_SYNCHRONIZE] Event: 0x{:x}\n", e->args[0]); + break; + default: + fmt::print("[UNKNOWN_CUDA_EVENT]\n"); + break; } - fmt::print("\n"); - } - if (env.stacks) { - fmt::print("Stack: \n"); - auto stack = symUtils->getStackByAddrs((uint64_t*)e->ustack, e->ustack_sz); - for (auto& frame : stack) { - frame.print(); + // Print function arguments if requested + if (env.args) { + fmt::print("[ARGS] "); + for (size_t i = 0; i < symInfo.args.size() && i < MAX_GPUKERN_ARGS; i++) { + fmt::print("{} arg{}=0x{:x} ", symInfo.args[i], i, e->args[i]); + } + fmt::print("\n"); } - } - fmt::print("{:-<40}\n", '-'); - return 0; + + // Print stack trace if requested + if (env.stacks) { + fmt::print("[STACK_TRACE]\n"); + auto stack = symUtils->getStackByAddrs((uint64_t*)e->ustack, e->ustack_sz); + for (auto& frame : stack) { + fmt::print(" {}\n", frame.name); + } + } + + fmt::print("{:-<80}\n", '-'); // Separator + return 0; } bool hasExceededProfilingLimit( @@ -216,22 +266,23 @@ int main(int argc, char* argv[]) { ring_buffer__free(ringBuffer); }); - auto offsets = symUtils.findSymbolOffsets(kCudaLaunchSymName); - if (offsets.empty()) { - fmt::print(stderr, "Failed to find symbol {}\n", kCudaLaunchSymName); - return -1; - } - for (auto& offset : offsets) { - auto link = bpf_program__attach_uprobe( - skel->progs.handle_cuda_launch, - false /* retprobe */, - env.pid, - offset.first.c_str(), - offset.second); - if (link) { - links.emplace_back(link); + /* Attach Uprobes for CUDA API tracepoints */ + for (const auto& symbol : kCudaSymbols) { + auto offsets = symUtils.findSymbolOffsets(symbol); + if (offsets.empty()) { + fmt::print(stderr, "Failed to find symbol {}\n", symbol); + continue; + } + for (const auto& offset : offsets) { + auto link = bpf_program__attach_uprobe( + 
skel->progs.handle_cuda_launch, false, env.pid, + offset.first.c_str(), offset.second); + if (link) { + links.emplace_back(link); + } } } + /* Set up ring buffer polling */ ringBuffer = ring_buffer__new( bpf_map__fd(skel->maps.rb), handle_event, (void*)&symUtils, nullptr); diff --git a/strobelight/src/profilers/gpuevent_snoop/bpf/CUDA_API_LIST b/strobelight/src/profilers/gpuevent_snoop/bpf/CUDA_API_LIST new file mode 100644 index 0000000..e006203 --- /dev/null +++ b/strobelight/src/profilers/gpuevent_snoop/bpf/CUDA_API_LIST @@ -0,0 +1,31 @@ +Memory Management APIs: +cudaMalloc +cudaFree +cudaMemcpy +cudaMemcpyAsync + +Stream and Event Management APIs: +cudaStreamCreate +cudaStreamDestroy +cudaStreamSynchronize + +Kernel Execution APIs: +cudaGraphLaunch +cudaLaunchCooperativeKernel +cudaLaunchHostFunc + +Synchronization APIs: +cudaDeviceSynchronize +cudaEventRecord +cudaEventSynchronize +cudaEventElapsedTime + +----- +Tracepoint Needs Special Handling? Why? +cudaLaunchKernel βœ… Yes Needs grid_x, grid_y, grid_z, block_x, block_y, block_z, stream, argv parsing, and stack trace. +cudaMalloc ❌ No Just captures size and pointer; no need for grid/block dimensions or stack. +cudaFree ❌ No Only captures pointer being freed. +cudaMemcpy ❌ No Only captures source, destination, size, and kind. +cudaStreamCreate ❌ No Only captures stream pointer. +cudaDeviceSynchronize ❌ No No arguments to capture. + diff --git a/strobelight/src/profilers/gpuevent_snoop/bpf/gpuevent_snoop.bpf.c b/strobelight/src/profilers/gpuevent_snoop/bpf/gpuevent_snoop.bpf.c index a91f310..8938dec 100644 --- a/strobelight/src/profilers/gpuevent_snoop/bpf/gpuevent_snoop.bpf.c +++ b/strobelight/src/profilers/gpuevent_snoop/bpf/gpuevent_snoop.bpf.c @@ -1,7 +1,5 @@ -// Copyright (c) Meta Platforms, Inc. and affiliates. -// -// This source code is licensed under the MIT license found in the -// LICENSE file in the root directory of this source tree. +// Copyright (c) Meta Platforms, Inc. +// Licensed under the MIT License. #ifdef FBCODE_STROBELIGHT #include @@ -24,7 +22,6 @@ const volatile struct { bool capture_args; bool capture_stack; } prog_cfg = { - // These defaults will be overridden from user space .debug = true, .capture_args = true, .capture_stack = true, @@ -36,12 +33,9 @@ const volatile struct { bpf_printk(fmt, ##__VA_ARGS__); \ }) -// The caller uses registers to pass the first 6 arguments to the callee. Given -// the arguments in left-to-right order, the order of registers used is: %rdi, -// %rsi, %rdx, %rcx, %r8, and %r9. Any remaining arguments are passed on the -// stack in reverse order so that they can be popped off the stack in order. 
#define SP_OFFSET(offset) (void*)PT_REGS_SP(ctx) + offset * 8 +// CUDA Kernel Launch Tracepoint SEC("uprobe") int BPF_KPROBE( handle_cuda_launch, @@ -51,51 +45,143 @@ int BPF_KPROBE( u64 block_xy, u64 block_z, uintptr_t argv) { - struct gpukern_sample* e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); - if (!e) { - bpf_printk_debug("Failed to allocate ringbuf entry"); - return 0; - } - - struct task_struct* task = (struct task_struct*)bpf_get_current_task(); - - e->pid = bpf_get_current_pid_tgid() >> 32; - e->ppid = BPF_CORE_READ(task, real_parent, tgid); - bpf_get_current_comm(&e->comm, sizeof(e->comm)); - - e->kern_func_off = func_off; - e->grid_x = (u32)grid_xy; - e->grid_y = (u32)(grid_xy >> 32); - e->grid_z = (u32)grid_z; - e->block_x = (u32)block_xy; - e->block_y = (u32)(block_xy >> 32); - e->block_z = (u32)block_z; - - bpf_probe_read_user(&e->stream, sizeof(uintptr_t), SP_OFFSET(2)); - - if (prog_cfg.capture_args) { - // Read the Cuda Kernel Launch Arguments - for (int i = 0; i < MAX_GPUKERN_ARGS; i++) { - const void* arg_addr; - // We don't know how many argument this kernel has until we parse the - // signature, so we always attemps to read the maximum number of args, - // even if some of these arg values are not valid. - bpf_probe_read_user( - &arg_addr, sizeof(u64), (const void*)(argv + i * sizeof(u64))); - - bpf_probe_read_user(&e->args[i], sizeof(arg_addr), arg_addr); + + struct gpukern_sample* e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); + if (!e) return 0; + + struct task_struct* task = (struct task_struct*)bpf_get_current_task(); + e->pid = bpf_get_current_pid_tgid() >> 32; + e->ppid = BPF_CORE_READ(task, real_parent, tgid); + bpf_get_current_comm(&e->comm, sizeof(e->comm)); + + e->event_type = EVENT_CUDA_LAUNCH_KERNEL; + e->kern_func_off = func_off; + e->grid_x = (u32)grid_xy; + e->grid_y = (u32)(grid_xy >> 32); + e->grid_z = (u32)grid_z; + e->block_x = (u32)block_xy; + e->block_y = (u32)(block_xy >> 32); + e->block_z = (u32)block_z; + + bpf_probe_read_user(&e->stream, sizeof(uintptr_t), SP_OFFSET(2)); + + if (prog_cfg.capture_args) { + for (int i = 0; i < MAX_GPUKERN_ARGS; i++) { + const void* arg_addr; + bpf_probe_read_user(&arg_addr, sizeof(u64), (const void*)(argv + i * sizeof(u64))); + bpf_probe_read_user(&e->args[i], sizeof(arg_addr), arg_addr); + } + } + + if (prog_cfg.capture_stack) { + e->ustack_sz = bpf_get_stack(ctx, e->ustack, sizeof(e->ustack), BPF_F_USER_STACK) / + sizeof(uint64_t); } - } - if (prog_cfg.capture_stack) { - // Read the Cuda Kernel Launch Stack - e->ustack_sz = - bpf_get_stack(ctx, e->ustack, sizeof(e->ustack), BPF_F_USER_STACK) / - sizeof(uint64_t); - } + bpf_ringbuf_submit(e, 0); + return 0; +} + +// CUDA Memory Management Tracepoints +SEC("uprobe") +int BPF_KPROBE(handle_cuda_malloc, size_t size, void **devPtr) { + struct gpukern_sample* e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); + if (!e) return 0; + + e->pid = bpf_get_current_pid_tgid() >> 32; + bpf_get_current_comm(&e->comm, sizeof(e->comm)); + + e->event_type = EVENT_CUDA_MALLOC; + e->kern_func_off = (uint64_t)PT_REGS_IP(ctx); + e->args[0] = size; + bpf_probe_read_user(&e->args[1], sizeof(void *), devPtr); + + bpf_ringbuf_submit(e, 0); + return 0; +} + +SEC("uprobe") +int BPF_KPROBE(handle_cuda_free, void *devPtr) { + struct gpukern_sample* e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); + if (!e) return 0; + + e->pid = bpf_get_current_pid_tgid() >> 32; + bpf_get_current_comm(&e->comm, sizeof(e->comm)); + + e->event_type = EVENT_CUDA_FREE; + e->kern_func_off = (uint64_t)PT_REGS_IP(ctx); + 
e->args[0] = (uint64_t)devPtr; - bpf_ringbuf_submit(e, 0); - return 0; + bpf_ringbuf_submit(e, 0); + return 0; +} + +// CUDA Memory Copy Tracepoints +SEC("uprobe") +int BPF_KPROBE(handle_cuda_memcpy, void *dst, const void *src, size_t count, int kind) { + struct gpukern_sample* e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); + if (!e) return 0; + + e->pid = bpf_get_current_pid_tgid() >> 32; + bpf_get_current_comm(&e->comm, sizeof(e->comm)); + + e->event_type = EVENT_CUDA_MEMCPY; + e->kern_func_off = (uint64_t)PT_REGS_IP(ctx); + e->args[0] = (uint64_t)dst; + e->args[1] = (uint64_t)src; + e->args[2] = count; + e->args[3] = kind; + + bpf_ringbuf_submit(e, 0); + return 0; +} + +// CUDA Stream and Event Management Tracepoints +SEC("uprobe") +int BPF_KPROBE(handle_cuda_stream_create, void **stream) { + struct gpukern_sample* e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); + if (!e) return 0; + + e->pid = bpf_get_current_pid_tgid() >> 32; + bpf_get_current_comm(&e->comm, sizeof(e->comm)); + + e->event_type = EVENT_CUDA_STREAM_CREATE; + bpf_probe_read_user(&e->args[0], sizeof(void *), stream); + + bpf_ringbuf_submit(e, 0); + return 0; +} + +SEC("uprobe") +int BPF_KPROBE(handle_cuda_stream_destroy, void *stream) { + struct gpukern_sample* e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); + if (!e) return 0; + + e->pid = bpf_get_current_pid_tgid() >> 32; + bpf_get_current_comm(&e->comm, sizeof(e->comm)); + + e->event_type = EVENT_CUDA_STREAM_DESTROY; + e->args[0] = (uint64_t)stream; + + bpf_ringbuf_submit(e, 0); + return 0; +} + +// CUDA Synchronization Tracepoints +SEC("uprobe") +int BPF_KPROBE(handle_cuda_device_synchronize) { + struct gpukern_sample* e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); + if (!e) return 0; + + e->pid = bpf_get_current_pid_tgid() >> 32; + bpf_get_current_comm(&e->comm, sizeof(e->comm)); + + e->event_type = EVENT_CUDA_DEVICE_SYNCHRONIZE; + e->kern_func_off = (uint64_t)PT_REGS_IP(ctx); + + bpf_ringbuf_submit(e, 0); + return 0; } char LICENSE[] SEC("license") = "Dual MIT/GPL"; + diff --git a/strobelight/src/profilers/gpuevent_snoop/bpf/gpuevent_snoop.h b/strobelight/src/profilers/gpuevent_snoop/bpf/gpuevent_snoop.h index df9ce03..337d796 100644 --- a/strobelight/src/profilers/gpuevent_snoop/bpf/gpuevent_snoop.h +++ b/strobelight/src/profilers/gpuevent_snoop/bpf/gpuevent_snoop.h @@ -11,8 +11,45 @@ #define MAX_STACK_DEPTH 128 #endif +enum cuda_event_type { + EVENT_CUDA_LAUNCH_KERNEL, + EVENT_CUDA_MALLOC, + EVENT_CUDA_FREE, + EVENT_CUDA_MEMCPY, + EVENT_CUDA_MEMCPY_ASYNC, + EVENT_CUDA_STREAM_CREATE, + EVENT_CUDA_STREAM_DESTROY, + EVENT_CUDA_STREAM_SYNCHRONIZE, + EVENT_CUDA_EVENT_RECORD, + EVENT_CUDA_EVENT_SYNCHRONIZE, + EVENT_CUDA_EVENT_ELAPSED_TIME, + EVENT_CUDA_DEVICE_SYNCHRONIZE +}; + typedef uint64_t stack_trace_t[MAX_STACK_DEPTH]; +struct gpukern_sample { + int pid, ppid; + char comm[TASK_COMM_LEN]; + uint64_t kern_func_off; // Address of the CUDA API call + + // Fields specific to kernel execution tracking + int grid_x, grid_y, grid_z; + int block_x, block_y, block_z; + uint64_t stream; + + // Generalized args to support multiple CUDA APIs + uint64_t args[MAX_GPUKERN_ARGS]; + + // Stack trace information + size_t ustack_sz; + stack_trace_t ustack; + + // New: Event type to distinguish between different CUDA API calls + uint32_t event_type; +}; + +/*** original*** struct gpukern_sample { int pid, ppid; char comm[TASK_COMM_LEN]; @@ -24,3 +61,4 @@ struct gpukern_sample { size_t ustack_sz; stack_trace_t ustack; }; +*/