From afa43ff30b61fcb6b159b4bdfb27a12971a0d748 Mon Sep 17 00:00:00 2001
From: Ben Richard <143630488+benrichard-amd@users.noreply.github.com>
Date: Mon, 9 Sep 2024 11:38:19 -0400
Subject: [PATCH] Cherry-pick [Reduce number of calls to rocprof #384] into
 release/rocm-rel-6.2 (#422)

* Improve perfmon coalescing

Signed-off-by: benrichard-amd <ben.richard@amd.com>

* Interleve TCC channel counters

Signed-off-by: benrichard-amd <ben.richard@amd.com>

* Remove duplicate normal counters
Interleve TCC channel counters in putput file  e.g.  TCC_HIT[0] TCC_ATOMIC[0] ... TCC_HIT[1] TCC_ATOMIC[1]

Signed-off-by: benrichard-amd <ben.richard@amd.com>

* Save accumulate counters to SQ_ files

Omniperf analyze expects the accumulate files to be in SQ_*.csv files.

Since these files also contain PMC counters (we are trying to
fit as many counters into each file as possible to minimize runs),
we need to include these SQ_*.csv files in pmc_perf.csv.

Signed-off-by: benrichard-amd <ben.richard@amd.com>

* Update to work with rocprof v1

Signed-off-by: benrichard-amd <ben.richard@amd.com>

* Remove unused method

Signed-off-by: benrichard-amd <ben.richard@amd.com>

* Set correct number of TCC channels for gfx942

Ran into rocprof error:
ROCProfiler: fatal error: input metric'TCC_EA0_RDREQ[16]' not supported on this hardware: gfx942

gfx942 has 16 channels, not 32.

Signed-off-by: benrichard-amd <ben.richard@amd.com>

* Fix code formatting

Signed-off-by: benrichard-amd <ben.richard@amd.com>

---------

Signed-off-by: benrichard-amd <ben.richard@amd.com>
---
 src/omniperf_profile/profiler_base.py       |  42 +--
 src/omniperf_profile/profiler_rocprof_v1.py |   2 -
 src/omniperf_profile/profiler_rocprof_v2.py |   2 -
 src/omniperf_soc/soc_base.py                | 367 ++++++++------------
 src/omniperf_soc/soc_gfx942.py              |   2 +-
 5 files changed, 152 insertions(+), 263 deletions(-)

diff --git a/src/omniperf_profile/profiler_base.py b/src/omniperf_profile/profiler_base.py
index 4288452d8..3d1ad552d 100644
--- a/src/omniperf_profile/profiler_base.py
+++ b/src/omniperf_profile/profiler_base.py
@@ -62,43 +62,6 @@ def get_profiler_options(self, fname):
         # assume no SoC specific options and return empty list by default
         return []
 
-    @demarcate
-    def pmc_perf_split(self):
-        """Avoid default rocprof join utility by spliting each line into a separate input file"""
-        workload_perfmon_dir = os.path.join(self.__args.path, "perfmon")
-        lines = (
-            open(os.path.join(workload_perfmon_dir, "pmc_perf.txt"), "r")
-            .read()
-            .splitlines()
-        )
-
-        # Iterate over each line in pmc_perf.txt
-        mpattern = r"^pmc:(.*)"
-        i = 0
-        for line in lines:
-            # Verify no comments
-            stext = line.split("#")[0].strip()
-            if not stext:
-                continue
-
-            # all pmc counters start with  "pmc:"
-            m = re.match(mpattern, stext)
-            if m is None:
-                continue
-
-            # Create separate file for each line
-            fd = open(workload_perfmon_dir + "/pmc_perf_" + str(i) + ".txt", "w")
-            fd.write(stext + "\n\n")
-            fd.write("gpu:\n")
-            fd.write("range:\n")
-            fd.write("kernel:\n")
-            fd.close()
-
-            i += 1
-
-        # Remove old pmc_perf.txt input from perfmon dir
-        os.remove(workload_perfmon_dir + "/pmc_perf.txt")
-
     @demarcate
     def join_prof(self, out=None):
         """Manually join separated rocprof runs"""
@@ -107,6 +70,7 @@ def join_prof(self, out=None):
             if out is None:
                 out = self.__args.path + "/pmc_perf.csv"
             files = glob.glob(self.__args.path + "/" + "pmc_perf_*.csv")
+            files.extend(glob.glob(self.__args.path + "/" + "SQ_*.csv"))
         elif type(self.__args.path) == list:
             files = self.__args.path
         else:
@@ -261,7 +225,9 @@ def join_prof(self, out=None):
             df.to_csv(out, index=False)
             if not self.__args.verbose:
                 for file in files:
-                    os.remove(file)
+                    # Do not remove accumulate counter files
+                    if "SQ_" not in file:
+                        os.remove(file)
         else:
             return df
 
diff --git a/src/omniperf_profile/profiler_rocprof_v1.py b/src/omniperf_profile/profiler_rocprof_v1.py
index 887fcff00..17cf18ba1 100644
--- a/src/omniperf_profile/profiler_rocprof_v1.py
+++ b/src/omniperf_profile/profiler_rocprof_v1.py
@@ -59,8 +59,6 @@ def get_profiler_options(self, fname):
     def pre_processing(self):
         """Perform any pre-processing steps prior to profiling."""
         super().pre_processing()
-        if self.ready_to_profile:
-            self.pmc_perf_split()
 
     @demarcate
     def run_profiling(self, version: str, prog: str):
diff --git a/src/omniperf_profile/profiler_rocprof_v2.py b/src/omniperf_profile/profiler_rocprof_v2.py
index cbe69242a..1fdaa1158 100644
--- a/src/omniperf_profile/profiler_rocprof_v2.py
+++ b/src/omniperf_profile/profiler_rocprof_v2.py
@@ -63,8 +63,6 @@ def get_profiler_options(self, fname):
     def pre_processing(self):
         """Perform any pre-processing steps prior to profiling."""
         super().pre_processing()
-        if self.ready_to_profile:
-            self.pmc_perf_split()
 
     @demarcate
     def run_profiling(self, version, prog):
diff --git a/src/omniperf_soc/soc_base.py b/src/omniperf_soc/soc_base.py
index 76e798439..446c6a82a 100644
--- a/src/omniperf_soc/soc_base.py
+++ b/src/omniperf_soc/soc_base.py
@@ -31,6 +31,7 @@
 import numpy as np
 from utils.utils import demarcate, console_debug, console_log, console_error
 from pathlib import Path
+from collections import OrderedDict
 
 from omniperf_base import SUPPORTED_ARCHS
 
@@ -256,10 +257,7 @@ def perfmon_filter(self, roofline_perfmon_only: bool):
             pmc_files_list = ref_pmc_files_list
 
         # Coalesce and writeback workload specific perfmon
-        pmc_list = perfmon_coalesce(
-            pmc_files_list, self.__perfmon_config, self.__workload_dir
-        )
-        perfmon_emit(pmc_list, self.__perfmon_config, self.__workload_dir)
+        perfmon_coalesce(pmc_files_list, self.__perfmon_config, self.__workload_dir)
 
     # ----------------------------------------------------
     # Required methods to be implemented by child classes
@@ -280,245 +278,174 @@ def analysis_setup(self):
         console_debug("analysis", "perform SoC analysis setup for %s" % self.__arch)
 
 
+def getblock(counter):
+    return counter.split("_")[0]
+
+
+# Set with limited size
+class LimitedSet:
+    def __init__(self, maxsize) -> None:
+        self.avail = maxsize
+        self.elements = []
+
+    def add(self, e) -> None:
+        if e in self.elements:
+            return True
+        elif self.avail <= 0:
+            return False
+
+        self.avail -= 1
+        self.elements.append(e)
+
+        return True
+
+
+# Represents a file that lists PMC counters. Number of counters for each
+# block limited according to perfmon config.
+class CounterFile:
+    def __init__(self, name, perfmon_config) -> None:
+        self.file_name = name
+        self.blocks = {b: LimitedSet(v) for b, v in perfmon_config.items()}
+
+    def add(self, counter) -> bool:
+        block = getblock(counter)
+
+        # SQ and SQC belong to the same IP block
+        if block == "SQC":
+            block = "SQ"
+
+        return self.blocks[block].add(counter)
+
+
 @demarcate
 def perfmon_coalesce(pmc_files_list, perfmon_config, workload_dir):
     """Sort and bucket all related performance counters to minimize required application passes"""
     workload_perfmon_dir = workload_dir + "/perfmon"
 
-    # match pattern for pmc counters
-    mpattern = r"^pmc:(.*)"
-    pmc_list = dict(
-        [
-            ("SQ", []),
-            ("GRBM", []),
-            ("TCP", []),
-            ("TA", []),
-            ("TD", []),
-            ("TCC", []),
-            ("SPI", []),
-            ("CPC", []),
-            ("CPF", []),
-            ("GDS", []),
-            ("TCC2", {}),  # per-channel TCC perfmon
-        ]
-    )
-    for ch in range(perfmon_config["TCC_channels"]):
-        pmc_list["TCC2"][str(ch)] = []
-
-    # Extract all PMC counters and store in separate buckets
+    # Will be 2D array
+    accumulate_counters = []
+
+    normal_counters = OrderedDict()
+
     for fname in pmc_files_list:
+
         lines = open(fname, "r").read().splitlines()
 
         for line in lines:
-            # Strip all comements, skip empty lines
+
+            # Strip all comments, skip empty lines
             stext = line.split("#")[0].strip()
             if not stext:
                 continue
 
             # all pmc counters start with  "pmc:"
-            m = re.match(mpattern, stext)
+            m = re.match(r"^pmc:(.*)", stext)
             if m is None:
                 continue
 
-            # we have found all the counters, store them in buckets
             counters = m.group(1).split()
 
-            # Utilitze helper function once a list of counters has be extracted
-            save_file = True
-            pmc_list = update_pmc_bucket(
-                counters, save_file, perfmon_config, pmc_list, stext, workload_perfmon_dir
-            )
-
-    # add a timestamp file
-    fd = open(workload_perfmon_dir + "/timestamps.txt", "w")
-    fd.write("pmc:\n\n")
-    fd.write("gpu:\n")
-    fd.write("range:\n")
-    fd.write("kernel:\n")
-    fd.close()
-
-    # sort the per channel counter, so that same counter in all channels can be aligned
-    for ch in range(perfmon_config["TCC_channels"]):
-        pmc_list["TCC2"][str(ch)].sort()
-
-    return pmc_list
-
-
-@demarcate
-def update_pmc_bucket(
-    counters,
-    save_file,
-    perfmon_config,
-    pmc_list=None,
-    stext=None,
-    workload_perfmon_dir=None,
-):
-    # Verify inputs.
-    # If save_file is True, we're being called internally, from perfmon_coalesce
-    # Else we're being called externally, from rocomni
-    detected_external_call = False
-    if save_file and (stext is None or workload_perfmon_dir is None):
-        raise ValueError(
-            "stext and workload_perfmon_dir must be specified if save_file is True"
-        )
-    if pmc_list is None:
-        detected_external_call = True
-        pmc_list = dict(
-            [
-                ("SQ", []),
-                ("GRBM", []),
-                ("TCP", []),
-                ("TA", []),
-                ("TD", []),
-                ("TCC", []),
-                ("SPI", []),
-                ("CPC", []),
-                ("CPF", []),
-                ("GDS", []),
-                ("TCC2", {}),  # per-channel TCC perfmon
-            ]
-        )
-        for ch in range(perfmon_config["TCC_channels"]):
-            pmc_list["TCC2"][str(ch)] = []
-
-    if "SQ_ACCUM_PREV_HIRES" in counters and not detected_external_call:
-        # save  all level counters separately
-        nindex = counters.index("SQ_ACCUM_PREV_HIRES")
-        level_counter = counters[nindex - 1]
-
-        if save_file:
-            # Save to level counter file, file name = level counter name
-            fd = open(workload_perfmon_dir + "/" + level_counter + ".txt", "w")
-            fd.write(stext + "\n\n")
-            fd.write("gpu:\n")
-            fd.write("range:\n")
-            fd.write("kernel:\n")
-            fd.close()
-
-        return pmc_list
-
-    # save normal pmc counters in matching buckets
-    for counter in counters:
-        IP_block = counter.split(sep="_")[0].upper()
-        # SQC and SQ belong to the IP block, coalesce them
-        if IP_block == "SQC":
-            IP_block = "SQ"
-
-        if IP_block != "TCC":
-            # Insert unique pmc counters into its bucket
-            if counter not in pmc_list[IP_block]:
-                pmc_list[IP_block].append(counter)
-
-        else:
-            # TCC counters processing
-            m = re.match(r"[\s\S]+\[(\d+)\]", counter)
-            if m is None:
-                # Aggregated TCC counters
-                if counter not in pmc_list[IP_block]:
-                    pmc_list[IP_block].append(counter)
-
+            if "SQ_ACCUM_PREV_HIRES" in counters:
+                # Accumulate counters
+                accumulate_counters.append(counters.copy())
             else:
-                # TCC channel ID
-                ch = m.group(1)
-
-                # fake IP block for per channel TCC
-                if str(ch) in pmc_list["TCC2"]:
-                    # append unique counter into the channel
-                    if counter not in pmc_list["TCC2"][str(ch)]:
-                        pmc_list["TCC2"][str(ch)].append(counter)
-                else:
-                    # initial counter in this channel
-                    pmc_list["TCC2"][str(ch)] = [counter]
-
-    if detected_external_call:
-        # sort the per channel counter, so that same counter in all channels can be aligned
-        for ch in range(perfmon_config["TCC_channels"]):
-            pmc_list["TCC2"][str(ch)].sort()
-    return pmc_list
+                # Normal counters
+                for ctr in counters:
+
+                    # Channel counter e.g. TCC_ATOMIC[0]
+                    if "[" in ctr:
+
+                        # Remove channel number, append "_expand" so we know
+                        # add the channel numbers back later
+                        channel = int(ctr.split("[")[1].split("]")[0])
+                        if channel == 0:
+                            counter_name = ctr.split("[")[0] + "_expand"
+                            try:
+                                normal_counters[counter_name] += 1
+                            except:
+                                normal_counters[counter_name] = 1
+                    else:
+                        try:
+                            normal_counters[ctr] += 1
+                        except:
+                            normal_counters[ctr] = 1
+
+    # De-duplicate. Remove accumulate counters from normal counters
+    for accus in accumulate_counters:
+        for accu in accus:
+            if accu in normal_counters:
+                del normal_counters[accu]
+
+    output_files = []
+
+    # Each accumulate counter is in a different file
+    for ctrs in accumulate_counters:
+
+        # Get name of the counter and use it as file name
+        ctr_name = ctrs[ctrs.index("SQ_ACCUM_PREV_HIRES") - 1]
+        output_files.append(CounterFile(ctr_name + ".txt", perfmon_config))
+        for ctr in ctrs:
+            output_files[-1].add(ctr)
+
+    file_count = 0
+    for ctr in normal_counters.keys():
+
+        # Add counter to first file that has room
+        added = False
+        for f in output_files:
+            if f.add(ctr):
+                added = True
+                break
 
+        # All files are full, create a new file
+        if not added:
+            output_files.append(
+                CounterFile("pmc_perf_{}.txt".format(file_count), perfmon_config)
+            )
+            file_count += 1
+            output_files[-1].add(ctr)
+
+    # Output to files
+    for f in output_files:
+        file_name = os.path.join(workload_perfmon_dir, f.file_name)
+
+        pmc = []
+        for block_name in f.blocks.keys():
+            if block_name == "TCC":
+
+                # Expand and interleve the TCC channel counters
+                # e.g.  TCC_HIT[0] TCC_ATOMIC[0] ... TCC_HIT[1] TCC_ATOMIC[1] ...
+                channel_counters = []
+                for ctr in f.blocks[block_name].elements:
+                    if "_expand" in ctr:
+                        channel_counters.append(ctr.split("_expand")[0])
+
+                for i in range(0, perfmon_config["TCC_channels"]):
+                    for c in channel_counters:
+                        pmc.append("{}[{}]".format(c, i))
+
+                # Handle the rest of the TCC counters
+                for ctr in f.blocks[block_name].elements:
+                    if "_expand" not in ctr:
+                        pmc.append(ctr)
+            else:
+                for ctr in f.blocks[block_name].elements:
+                    pmc.append(ctr)
 
-@demarcate
-def perfmon_emit(pmc_list, perfmon_config, workload_dir=None):
-    # Calculate the minimum number of iteration to save the pmc counters
-    # non-TCC counters
-    pmc_cnt = [
-        len(pmc_list[key]) / perfmon_config[key]
-        for key in pmc_list
-        if key not in ["TCC", "TCC2"]
-    ]
-
-    # TCC counters
-    tcc_channels = perfmon_config["TCC_channels"]
-
-    tcc_cnt = len(pmc_list["TCC"]) / perfmon_config["TCC"]
-    tcc2_cnt = (
-        np.array([len(pmc_list["TCC2"][str(ch)]) for ch in range(tcc_channels)])
-        / perfmon_config["TCC"]
-    )
-
-    # Total number iterations to write pmc: counters line, except TCC2
-    niter = max(math.ceil(max(pmc_cnt)), math.ceil(tcc_cnt))
-
-    # Emit PMC counters into pmc config file
-    if workload_dir:
-        workload_perfmon_dir = workload_dir + "/perfmon"
-        fd = open(workload_perfmon_dir + "/pmc_perf.txt", "w")
-    else:
-        batches = []
-
-    for iter in range(niter):
-        # Prefix
-        line = "pmc: "
-
-        # Add all non-TCC counters
-        for key in pmc_list:
-            if key not in ["TCC", "TCC2"]:
-                N = perfmon_config[key]
-                ip_counters = pmc_list[key][iter * N : iter * N + N]
-                if ip_counters:
-                    line = line + " " + " ".join(ip_counters)
-
-        # Add TCC counters
-        N = perfmon_config["TCC"]
-        tcc_counters = pmc_list["TCC"][iter * N : iter * N + N]
-
-        # TCC aggregated counters
-        line = line + " " + " ".join(tcc_counters)
-        if workload_dir:
-            fd.write(line + "\n")
-        else:
-            b = line.split()
-            b.remove("pmc:")
-            batches.append(b)
-
-    # TCC2, handle TCC per channel counters separatly
-    tcc2_index = 0
-    niter = math.ceil(max(tcc2_cnt))
-    for iter in range(niter):
-        # Prefix
-        line = "pmc: "
-
-        N = perfmon_config["TCC"]
-        # TCC per-channel counters
-        tcc_counters = []
-        for ch in range(perfmon_config["TCC_channels"]):
-            tcc_counters += pmc_list["TCC2"][str(ch)][tcc2_index * N : tcc2_index * N + N]
-
-        tcc2_index += 1
-
-        # TCC2 aggregated counters
-        line = line + " " + " ".join(tcc_counters)
-        if workload_dir:
-            fd.write(line + "\n")
-        else:
-            b = line.split()
-            b.remove("pmc:")
-            batches.append(b)
+        stext = "pmc: " + " ".join(pmc)
 
-    if workload_dir:
-        fd.write("\ngpu:\n")
+        # Write counters to file
+        fd = open(file_name, "w")
+        fd.write(stext + "\n\n")
+        fd.write("gpu:\n")
         fd.write("range:\n")
         fd.write("kernel:\n")
         fd.close()
-    else:
-        return batches
+
+    # Add a timestamp file
+    fd = open(os.path.join(workload_perfmon_dir, "timestamps.txt"), "w")
+    fd.write("pmc:\n\n")
+    fd.write("gpu:\n")
+    fd.write("range:\n")
+    fd.write("kernel:\n")
+    fd.close()
diff --git a/src/omniperf_soc/soc_gfx942.py b/src/omniperf_soc/soc_gfx942.py
index c18b38585..d2ad35eb1 100644
--- a/src/omniperf_soc/soc_gfx942.py
+++ b/src/omniperf_soc/soc_gfx942.py
@@ -64,7 +64,7 @@ def __init__(self, args, mspec):
                 "SPI": 2,
                 "GRBM": 2,
                 "GDS": 4,
-                "TCC_channels": 32,
+                "TCC_channels": 16,
             }
         )
         # self.roofline_obj = Roofline(args, self._mspec)