diff --git a/.gitignore b/.gitignore index d3c0162..74cfd33 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +sample/tensorflow_unet3d_darshan_per_rank_workload + # Created by https://www.toptal.com/developers/gitignore/api/python,linux,macos,windows,pycharm,visualstudiocode # Edit at https://www.toptal.com/developers/gitignore?templates=python,linux,macos,windows,pycharm,visualstudiocode diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/MypyPlugin.xml b/.idea/MypyPlugin.xml new file mode 100644 index 0000000..ac4cd76 --- /dev/null +++ b/.idea/MypyPlugin.xml @@ -0,0 +1,8 @@ + + + + + \ No newline at end of file diff --git a/.idea/drishti-io.iml b/.idea/drishti-io.iml new file mode 100644 index 0000000..883789c --- /dev/null +++ b/.idea/drishti-io.iml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..90404e0 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..c4fcf4c --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/runConfigurations/Sample_1.xml b/.idea/runConfigurations/Sample_1.xml new file mode 100644 index 0000000..0bc3377 --- /dev/null +++ b/.idea/runConfigurations/Sample_1.xml @@ -0,0 +1,26 @@ + + + + + \ No newline at end of file diff --git a/.idea/runConfigurations/Sample_2.xml b/.idea/runConfigurations/Sample_2.xml new file mode 100644 index 0000000..3c03139 --- /dev/null +++ b/.idea/runConfigurations/Sample_2.xml @@ -0,0 +1,26 @@ + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..cc1923a --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.8 diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py new file mode 100755 index 0000000..43d9cd3 --- /dev/null +++ b/drishti/handlers/darshan_util.py @@ -0,0 +1,1094 @@ +import csv +import datetime +import io +import subprocess +import sys +import typing +from dataclasses import dataclass, field +from enum import Enum +from functools import cached_property +from os import write +from shlex import shlex +from typing import Dict, Final, Optional, Union, List, Tuple, Iterable + +import numpy as np +import pandas as pd +from darshan import DarshanReport # type: ignore +import drishti.includes.parser as parser +import drishti.includes.config as config + + +class ModuleType(str, Enum): + """Enum for standard I/O module types""" + + POSIX = "POSIX" + STDIO = "STDIO" + MPIIO = "MPI-IO" + + def __str__(self) -> str: + return self.value + + +@dataclass +class TimeSpan: + start: datetime.datetime + end: datetime.datetime + + 
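A quick aside on the ModuleType enum defined above: because it mixes in str, its members compare, hash, and format like their raw string values, which is what lets the rest of darshan_util.py index report.records (keyed by plain module-name strings such as "POSIX" and "MPI-IO") and test membership in the modules set with enum members directly. A minimal, self-contained illustration; the records dict below is a stand-in for illustration only, not PyDarshan's real structure:

    from enum import Enum

    class ModuleType(str, Enum):
        POSIX = "POSIX"
        STDIO = "STDIO"
        MPIIO = "MPI-IO"

        def __str__(self) -> str:
            return self.value

    # Stand-in for report.records, which PyDarshan keys by module-name strings.
    records = {"POSIX": "posix records", "MPI-IO": "mpiio records"}

    assert ModuleType.POSIX == "POSIX"                    # str mix-in: equal to the raw key
    assert records[ModuleType.MPIIO] == "mpiio records"   # usable directly as a dict key
    assert ModuleType.STDIO in {"STDIO", "POSIX"}         # membership tests against string sets work
    print(ModuleType.MPIIO)                               # -> MPI-IO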
def __post_init__(self): + if self.start > self.end: + raise ValueError( + f"TimeSpan start ({self.start}) must be <= end ({self.end})" + ) + + +@dataclass +class IOCounter: + """Base class for I/O metrics with read/write counts""" + + read: Final[int] = field(init=True) + write: Final[int] = field(init=True) + _total: Optional[int] = None + + @cached_property + def total(self) -> int: + """Total count, calculated once on first access""" + if self._total is not None: + return self._total + return self.read + self.write + + +@dataclass +class IOSize(IOCounter): + """Represents I/O size statistics in bytes""" + + pass + + +@dataclass +class IOOperation(IOCounter): + """Represents I/O operation count statistics""" + + pass + + +@dataclass +class IOStatistics: + """Tracks both I/O sizes and operations by module with aggregated metrics""" + + # Use dicts to store module-specific data + sizes: Dict[ModuleType, IOSize] = field(init=True) + operations: Dict[ModuleType, IOOperation] = field(init=True) + + def __post_init__(self): + # Initialize standard modules if not present + for module in ModuleType: + # Ensure that the module is either in both sizes and operations or in neither + assert (module in self.sizes) == (module in self.operations), ( + f"Module {module} should be in both sizes and operations or in neither" + ) + + if module not in self.sizes: + self.sizes[module] = IOSize(read=0, write=0) + if module not in self.operations: + self.operations[module] = IOOperation(read=0, write=0) + + # Convenience properties for standard modules + @cached_property + def posix_size(self) -> int: + return self.sizes[ModuleType.POSIX].total + + @cached_property + def stdio_size(self) -> int: + return self.sizes[ModuleType.STDIO].total + + @cached_property + def mpiio_size(self) -> int: + return self.sizes[ModuleType.MPIIO].total + + @cached_property + def posix_ops(self) -> int: + return self.operations[ModuleType.POSIX].total + + @cached_property + def stdio_ops(self) -> int: + return self.operations[ModuleType.STDIO].total + + @cached_property + def mpiio_ops(self) -> int: + return self.operations[ModuleType.MPIIO].total + + # Aggregated size properties + @cached_property + def read_bytes(self) -> int: + """Total bytes read across all modules.""" + return sum(size.read for size in self.sizes.values()) + + @cached_property + def written_bytes(self) -> int: + """Total bytes written across all modules.""" + return sum(size.write for size in self.sizes.values()) + + @cached_property + def total_bytes(self) -> int: + """Total bytes transferred across all modules.""" + return self.read_bytes + self.written_bytes + + # Aggregated operation properties + @cached_property + def reads(self) -> int: + """Total read operations across all modules.""" + return sum(op.read for op in self.operations.values()) + + @cached_property + def writes(self) -> int: + """Total write operations across all modules.""" + return sum(op.write for op in self.operations.values()) + + @cached_property + def total_ops(self) -> int: + """Total operations across all modules.""" + return self.reads + self.writes + + # Methods to get stats for specific modules + def get_module_size( + self, + module: Optional[Union[ModuleType, str]] = None, + data_type: Optional[str] = "total", + ) -> int: + """Get size statistics for a specific module or all modules if not specified.""" + if module is None and data_type is None: + raise ValueError("Both module and data_type cannot be None") + + if module: + if module not in self.sizes: + raise 
ValueError(f"Module {module} not found in sizes") + size = self.sizes[module] + if data_type == "read": + return size.read + elif data_type == "write": + return size.write + else: # data_type is None or "total" + return size.total + else: + if data_type == "read": + return self.read_bytes + elif data_type == "write": + return self.written_bytes + else: # data_type is None or "total" + return self.total_bytes + + def get_module_ops( + self, + module: Optional[Union[ModuleType, str]] = None, + data_type: Optional[str] = "total", + ) -> int: + """Get operation statistics for a specific module or all modules if not specified.""" + if module is None and data_type is None: + raise ValueError("Both module and data_type cannot be None") + + if module: + if module not in self.operations: + raise ValueError(f"Module {module} not found in operations") + ops = self.operations[module] + if data_type == "read": + return ops.read + elif data_type == "write": + return ops.write + else: # data_type is None or "total" + return ops.total + else: + if data_type == "read": + return self.reads + elif data_type == "write": + return self.writes + else: # data_type is None or "total" + return self.total_ops + + +@dataclass +class SmallIOStats(IOCounter): + """Statistics for small I/O operations""" + + pass # Inherits read/write/total from IOCounter + + +@dataclass +class SharedOpsStats(IOCounter): + """Statistics for shared file operations""" + + pass # Inherits read/write/total from IOCounter + + +@dataclass +class SharedSmallOpsStats(IOCounter): + """Statistics for small shared file operations""" + + pass # Inherits read/write/total from IOCounter + + +@dataclass +class ConsecutiveIOStats(IOCounter): + """Statistics for consecutive I/O operations""" + + pass # Inherits read/write/total from IOCounter + + +@dataclass +class SequentialIOStats(IOCounter): + """Statistics for sequential I/O operations""" + + pass # Inherits read/write/total from IOCounter + + +@dataclass +class RandomIOStats(IOCounter): + """Statistics for random I/O operations""" + + pass # Inherits read/write/total from IOCounter + + +@dataclass +class MPIIONonBlockingStats(IOCounter): + """Statistics for non-blocking MPI I/O operations""" + + pass + + +@dataclass +class MPICollectiveIOStats(IOCounter): + """Statistics for collective MPI I/O operations""" + + pass + + +@dataclass +class MPIIndependentIOStats(IOCounter): + """Statistics for independent MPI I/O operations""" + + pass + + +@dataclass +class AccessPatternStats: + """Statistics for I/O access patterns by pattern type""" + + consecutive: ConsecutiveIOStats = field( + default_factory=lambda: ConsecutiveIOStats(read=0, write=0), init=True + ) + sequential: SequentialIOStats = field( + default_factory=lambda: SequentialIOStats(read=0, write=0), init=True + ) + random: RandomIOStats = field( + default_factory=lambda: RandomIOStats(read=0, write=0), init=True + ) + + +@dataclass +class DarshanFile: + # TODO: All fields which are not calculated should be instantly populated and not optional + # TODO: Explore using typeddicts instead of dicts + file_path: str + _darshan_report: Optional[DarshanReport] = None + job_id: Optional[str] = None + log_ver: Optional[str] = None + time: Optional[TimeSpan] = None + exe: Optional[str] = None + _modules: Optional[Iterable[str]] = None + _name_records: Optional[Dict[int, str]] = None # Keys are uint64 + _max_read_offset: Optional[int] = None + _max_write_offset: Optional[int] = None + total_files_stdio: Optional[int] = None + total_files_posix: 
Optional[int] = None + total_files_mpiio: Optional[int] = None + files: Optional[Dict[str, str]] = None + + # Replace individual I/O stats with IOStatistics class + _io_stats: Optional[IOStatistics] = None + + # File counts + total_files: Optional[int] = 0 + + # Additional I/O statistics organized by category + _posix_small_io: Optional[SmallIOStats] = None + + _posix_detected_small_files: Optional[pd.DataFrame] = None + + # Direct alignment fields instead of a class + _mem_not_aligned: Optional[int] = None + _file_not_aligned: Optional[int] = None + + _posix_read_consecutive: Optional[int] = None + _posix_write_consecutive: Optional[int] = None + _posix_read_sequential: Optional[int] = None + _posix_write_sequential: Optional[int] = None + _posix_read_random: Optional[int] = None + _posix_write_random: Optional[int] = None + + _posix_long_metadata_count: Optional[int] = None + _posix_data_stragglers_count: Optional[int] = None + _posix_time_stragglers_count: Optional[int] = None + _posix_write_imbalance_count: Optional[int] = None + _posix_read_imbalance_count: Optional[int] = None + + access_pattern: Optional[AccessPatternStats] = None + + # Use separate classes for shared operations + _shared_ops: Optional[SharedOpsStats] = None + shared_small_ops: Optional[SharedSmallOpsStats] = None + + count_long_metadata: Optional[int] = None + posix_shared_data_imbalance_stragglers_count: Optional[int] = None + + _has_hdf5_extension: Optional[bool] = None + + _mpiio_nb_ops: Optional[MPIIONonBlockingStats] = None + + _cb_nodes: Optional[int] = None + _number_of_compute_nodes: Optional[int] = None + hints: Optional[List[str]] = None + + timestamp: Optional[TimeSpan] = None + + aggregated: Optional[pd.DataFrame] = None + + _mpi_coll_ops: Optional[MPICollectiveIOStats] = None + _mpi_indep_ops: Optional[MPIIndependentIOStats] = None + + detected_files_mpi_coll_reads: Optional[pd.DataFrame] = None + detected_files_mpi_coll_writes: Optional[pd.DataFrame] = None + + imbalance_count_posix_shared_time: Optional[int] = None + posix_shared_time_imbalance_detected_files: Optional[ + Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame] + ] = None + + @cached_property + def report(self) -> DarshanReport: + if self._darshan_report is None: + self._darshan_report = DarshanReport(self.file_path) + return self._darshan_report + + @cached_property + def modules(self) -> Iterable[str]: + if self._modules is None: + self._modules = set(self.report.records.keys()) + return self._modules + + @cached_property + def io_stats(self) -> IOStatistics: + if self._io_stats is None: + # Calculate I/O sizes + sizes: Dict[ModuleType, IOSize] = {} + ops: Dict[ModuleType, IOOperation] = {} + if ModuleType.STDIO in self.modules: + df = self.report.records[ModuleType.STDIO].to_df() + counters = df["counters"] + assert df, "STDIO module data frame is empty" + + stdio_read_size = counters["STDIO_BYTES_READ"].sum() + stdio_write_size = counters["STDIO_BYTES_WRITTEN"].sum() + sizes[ModuleType.STDIO] = IOSize( + read=stdio_read_size, write=stdio_write_size + ) + + stdio_read_ops = counters["STDIO_READS"].sum() + stdio_write_ops = counters["STDIO_WRITES"].sum() + ops[ModuleType.STDIO] = IOOperation( + read=stdio_read_ops, write=stdio_write_ops + ) + + if ModuleType.POSIX in self.modules: + df = self.report.records[ModuleType.POSIX].to_df() + counters = df["counters"] + assert df, "POSIX module data frame is empty" + + posix_write_size = counters["POSIX_BYTES_WRITTEN"].sum() + posix_read_size = counters["POSIX_BYTES_READ"].sum() + 
sizes[ModuleType.POSIX] = IOSize( + read=posix_read_size, write=posix_write_size + ) + + posix_read_ops = counters["POSIX_READS"].sum() + posix_write_ops = counters["POSIX_WRITES"].sum() + ops[ModuleType.POSIX] = IOOperation( + read=posix_read_ops, write=posix_write_ops + ) + + if ModuleType.MPIIO in self.modules: + df = self.report.records[ModuleType.MPIIO].to_df() + counters = df["counters"] + assert df, "MPIIO module data frame is empty" + + mpiio_write_size = counters["MPIIO_BYTES_WRITTEN"].sum() + mpiio_read_size = counters["MPIIO_BYTES_READ"].sum() + sizes[ModuleType.MPIIO] = IOSize( + read=mpiio_read_size, write=mpiio_write_size + ) + + mpiio_read_ops = counters['MPIIO_INDEP_READS'].sum() + counters['MPIIO_COLL_READS'].sum() + mpiio_write_ops = counters['MPIIO_INDEP_WRITES'].sum() + counters['MPIIO_COLL_WRITES'].sum() + ops[ModuleType.MPIIO] = IOOperation( + read=mpiio_read_ops, write=mpiio_write_ops + ) + + self._io_stats = IOStatistics(sizes=sizes, operations=ops) + return self._io_stats + + @cached_property + def posix_small_io(self) -> SmallIOStats: + if self._posix_small_io is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + posix_reads_small = ( + posix_counters["POSIX_SIZE_READ_0_100"].sum() + + posix_counters["POSIX_SIZE_READ_100_1K"].sum() + + posix_counters["POSIX_SIZE_READ_1K_10K"].sum() + + posix_counters["POSIX_SIZE_READ_10K_100K"].sum() + + posix_counters["POSIX_SIZE_READ_100K_1M"].sum() + ) + posix_writes_small = ( + posix_counters["POSIX_SIZE_WRITE_0_100"].sum() + + posix_counters["POSIX_SIZE_WRITE_100_1K"].sum() + + posix_counters["POSIX_SIZE_WRITE_1K_10K"].sum() + + posix_counters["POSIX_SIZE_WRITE_10K_100K"].sum() + + posix_counters["POSIX_SIZE_WRITE_100K_1M"].sum() + ) + self._posix_small_io = SmallIOStats( + read=posix_reads_small, write=posix_writes_small + ) + return self._posix_small_io + + @property + def posix_detected_small_files(self) -> pd.DataFrame: + if self._posix_detected_small_files is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + posix_counters["INSIGHTS_POSIX_SMALL_READ"] = ( + posix_counters["POSIX_SIZE_READ_0_100"] + + posix_counters["POSIX_SIZE_READ_100_1K"] + + posix_counters["POSIX_SIZE_READ_1K_10K"] + + posix_counters["POSIX_SIZE_READ_10K_100K"] + + posix_counters["POSIX_SIZE_READ_100K_1M"] + ) + posix_counters["INSIGHTS_POSIX_SMALL_WRITE"] = ( + posix_counters["POSIX_SIZE_WRITE_0_100"] + + posix_counters["POSIX_SIZE_WRITE_100_1K"] + + posix_counters["POSIX_SIZE_WRITE_1K_10K"] + + posix_counters["POSIX_SIZE_WRITE_10K_100K"] + + posix_counters["POSIX_SIZE_WRITE_100K_1M"] + ) + detected_files = pd.DataFrame( + posix_counters.groupby("id")[ + ["INSIGHTS_POSIX_SMALL_READ", "INSIGHTS_POSIX_SMALL_WRITE"] + ].sum() + ).reset_index() + detected_files.columns = pd.Index(["id", "total_reads", "total_writes"]) + detected_files.loc[:, "id"] = detected_files.loc[:, "id"].astype(str) + self._posix_detected_small_files = detected_files + return self._posix_detected_small_files + + @property + def file_map(self) -> Dict[int, str]: + return self.name_records + + @cached_property + def name_records(self) -> Dict[int, str]: + if self._name_records is None: + self._name_records = self.report.name_records + return self._name_records + + @property + def dxt_posix_df(self) -> Optional[pd.DataFrame]: + if parser.args.backtrace is False: + return None + assert "DXT_POSIX" in self.modules, "Missing DXT_POSIX module" + dxt_posix_df = 
pd.DataFrame(self.report.records["DXT_POSIX"].to_df()) + return dxt_posix_df + + @property + def dxt_posix_read_df(self) -> Optional[pd.DataFrame]: + if parser.args.backtrace is False: + return None + assert "DXT_POSIX" in self.modules, "Missing DXT_POSIX module" + df = self.dxt_posix_df + assert df is not None, "Should be handled by parser.args.backtrace check" + + if "address_line_mapping" not in df: + parser.args.backtrace = False + return None + + read_id = [] + read_rank = [] + read_length = [] + read_offsets = [] + read_end_time = [] + read_start_time = [] + read_operation = [] + + for r in zip(df["rank"], df["read_segments"], df["write_segments"], df["id"]): + if not r[1].empty: + read_id.append([r[3]] * len((r[1]["length"].to_list()))) + read_rank.append([r[0]] * len((r[1]["length"].to_list()))) + read_length.append(r[1]["length"].to_list()) + read_end_time.append(r[1]["end_time"].to_list()) + read_start_time.append(r[1]["start_time"].to_list()) + read_operation.append(["read"] * len((r[1]["length"].to_list()))) + read_offsets.append(r[1]["offset"].to_list()) + + read_id = [element for nestedlist in read_id for element in nestedlist] + read_rank = [element for nestedlist in read_rank for element in nestedlist] + read_length = [element for nestedlist in read_length for element in nestedlist] + read_offsets = [ + element for nestedlist in read_offsets for element in nestedlist + ] + read_end_time = [ + element for nestedlist in read_end_time for element in nestedlist + ] + read_operation = [ + element for nestedlist in read_operation for element in nestedlist + ] + read_start_time = [ + element for nestedlist in read_start_time for element in nestedlist + ] + + dxt_posix_read_data = { + "id": read_id, + "rank": read_rank, + "length": read_length, + "end_time": read_end_time, + "start_time": read_start_time, + "operation": read_operation, + "offsets": read_offsets, + } + + return pd.DataFrame(dxt_posix_read_data) + + @property + def dxt_posix_write_df(self) -> Optional[pd.DataFrame]: + if parser.args.backtrace is False: + return None + assert "DXT_POSIX" in self.modules, "Missing DXT_POSIX module" + df = self.dxt_posix_df + assert df is not None, "Should be handled by parser.args.backtrace check" + + if "address_line_mapping" not in df: + parser.args.backtrace = False + return None + + write_id = [] + write_rank = [] + write_length = [] + write_offsets = [] + write_end_time = [] + write_start_time = [] + write_operation = [] + + for r in zip(df["rank"], df["read_segments"], df["write_segments"], df["id"]): + if not r[2].empty: + write_id.append([r[3]] * len((r[2]["length"].to_list()))) + write_rank.append([r[0]] * len((r[2]["length"].to_list()))) + write_length.append(r[2]["length"].to_list()) + write_end_time.append(r[2]["end_time"].to_list()) + write_start_time.append(r[2]["start_time"].to_list()) + write_operation.append(["write"] * len((r[2]["length"].to_list()))) + write_offsets.append(r[2]["offset"].to_list()) + + write_id = [element for nestedlist in write_id for element in nestedlist] + write_rank = [element for nestedlist in write_rank for element in nestedlist] + write_length = [ + element for nestedlist in write_length for element in nestedlist + ] + write_offsets = [ + element for nestedlist in write_offsets for element in nestedlist + ] + write_end_time = [ + element for nestedlist in write_end_time for element in nestedlist + ] + write_operation = [ + element for nestedlist in write_operation for element in nestedlist + ] + write_start_time = [ + element for nestedlist 
in write_start_time for element in nestedlist + ] + + dxt_posix_write_data = pd.DataFrame( + { + "id": write_id, + "rank": write_rank, + "length": write_length, + "end_time": write_end_time, + "start_time": write_start_time, + "operation": write_operation, + "offsets": write_offsets, + } + ) + + return pd.DataFrame(dxt_posix_write_data) + + @cached_property + def mem_not_aligned(self) -> int: + if self._mem_not_aligned is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._mem_not_aligned = posix_counters["POSIX_MEM_NOT_ALIGNED"].sum() + return self._mem_not_aligned + + @cached_property + def file_not_aligned(self) -> int: + if self._file_not_aligned is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._file_not_aligned = posix_counters["POSIX_FILE_NOT_ALIGNED"].sum() + return self._file_not_aligned + + @property + def lustre_df(self) -> Optional[pd.DataFrame]: + if "LUSTRE" not in self.modules: + return None + lustre_dict = self.report.records["LUSTRE"].to_df() + assert len(lustre_dict) == 1, f"Expected 1 data frame for LUSTRE, got {len(self.report.records['LUSTRE'].to_df())}" + try: + lustre_df = lustre_dict["components"] + except KeyError: + # Using an older PyDarshan version + lustre_df = lustre_dict["counters"] + return lustre_df + + @cached_property + def max_read_offset(self) -> int: + if self._max_read_offset is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._max_read_offset = posix_counters["POSIX_MAX_BYTE_READ"].max() + return self._max_read_offset + + @cached_property + def max_write_offset(self) -> int: + if self._max_write_offset is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._max_write_offset = posix_counters["POSIX_MAX_BYTE_WRITTEN"].max() + return self._max_write_offset + + @cached_property + def posix_read_consecutive(self) -> int: + if self._posix_read_consecutive is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._posix_read_consecutive = posix_counters["POSIX_CONSEC_READS"].sum() + return self._posix_read_consecutive + + @cached_property + def posix_write_consecutive(self) -> int: + if self._posix_write_consecutive is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._posix_write_consecutive = posix_counters["POSIX_CONSEC_WRITES"].sum() + return self._posix_write_consecutive + + @cached_property + def posix_read_sequential(self) -> int: + if self._posix_read_sequential is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._posix_read_sequential = ( + posix_counters["POSIX_SEQ_READS"].sum() - self.posix_read_consecutive + ) + return self._posix_read_sequential + + @cached_property + def posix_write_sequential(self) -> int: + if self._posix_write_sequential is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._posix_write_sequential = ( + posix_counters["POSIX_SEQ_WRITES"].sum() - self.posix_write_consecutive + ) + return self._posix_write_sequential + + @cached_property + def posix_read_random(self) -> int: + if self._posix_read_random is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._posix_read_random = ( + 
self.io_stats.get_module_ops(ModuleType.POSIX, "read") + - self.posix_read_consecutive + - self.posix_read_sequential + ) + return self._posix_read_random + + @cached_property + def posix_write_random(self) -> int: + if self._posix_write_random is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._posix_write_random = ( + self.io_stats.get_module_ops(ModuleType.POSIX, "write") + - self.posix_write_consecutive + - self.posix_write_sequential + ) + return self._posix_write_random + + @property + def posix_shared_files_df(self) -> pd.DataFrame: + assert "POSIX" in self.modules, "Missing POSIX module" + posix_df = self.report.records[ModuleType.POSIX].to_df() + shared_files_df = posix_df["counters"].loc[(posix_df["counters"]["rank"] == -1)] + shared_files_df = shared_files_df.assign(id=lambda d: d["id"].astype(str)) + return shared_files_df + + @cached_property + def posix_shared_reads(self) -> int: + if self._shared_ops is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._shared_ops = SharedOpsStats( + read=posix_counters["POSIX_SHARED_READS"].sum(), + write=posix_counters["POSIX_SHARED_WRITES"].sum(), + ) + return self._shared_ops.read + + @cached_property + def posix_shared_writes(self) -> int: + if self._shared_ops is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_counters = posix_df["counters"] + self._shared_ops = SharedOpsStats( + read=posix_counters["POSIX_SHARED_READS"].sum(), + write=posix_counters["POSIX_SHARED_WRITES"].sum(), + ) + return self._shared_ops.write + + @cached_property + def posix_long_metadata_count(self) -> int: + if self._posix_long_metadata_count is None: + posix_df = self.report.records[ModuleType.POSIX].to_df() + posix_long_metadata_rows = posix_df["fcounters"][ + ( + posix_df["fcounters"]["POSIX_F_META_TIME"] + > config.thresholds["metadata_time_rank"][0] + ) + ] + self._posix_long_metadata_count = len(posix_long_metadata_rows) + return self._posix_long_metadata_count + + @property + def posix_data_stragglers_df(self) -> pd.DataFrame: + shared_files = self.posix_shared_files_df + + detected_files = [] + + for index, row in shared_files.iterrows(): + total_transfer_size = row["POSIX_BYTES_WRITTEN"] + row["POSIX_BYTES_READ"] + + if ( + total_transfer_size + and abs( + row["POSIX_SLOWEST_RANK_BYTES"] - row["POSIX_FASTEST_RANK_BYTES"] + ) + / total_transfer_size + > config.thresholds["imbalance_stragglers"][0] + ): + # stragglers_count += 1 + + detected_files.append( + [ + row["id"], + abs( + row["POSIX_SLOWEST_RANK_BYTES"] + - row["POSIX_FASTEST_RANK_BYTES"] + ) + / total_transfer_size + * 100, + ] + ) + + column_names = ["id", "data_imbalance"] + detected_files = pd.DataFrame(detected_files, columns=column_names) + return detected_files + + @cached_property + def posix_data_stragglers_count(self) -> int: + if self._posix_data_stragglers_count is None: + self._posix_data_stragglers_count = len(self.posix_data_stragglers_df) + return self._posix_data_stragglers_count + + @property + def posix_time_stragglers_df(self) -> pd.DataFrame: + df = self.report.records[ModuleType.POSIX].to_df() + + shared_files_times = df['fcounters'].loc[(df['fcounters']['rank'] == -1)] + + # Get the files responsible + detected_files = [] + + # stragglers_count = 0 + # stragglers_imbalance = {} + + shared_files_times = shared_files_times.assign(id=lambda d: d['id'].astype(str)) + + for index, row in shared_files_times.iterrows(): + 
total_transfer_time = row['POSIX_F_WRITE_TIME'] + row['POSIX_F_READ_TIME'] + row['POSIX_F_META_TIME'] + + if total_transfer_time and abs( + row['POSIX_F_SLOWEST_RANK_TIME'] - row['POSIX_F_FASTEST_RANK_TIME']) / total_transfer_time > \ + config.thresholds['imbalance_stragglers'][0]: + # stragglers_count += 1 + + detected_files.append([ + row['id'], + abs(row['POSIX_F_SLOWEST_RANK_TIME'] - row['POSIX_F_FASTEST_RANK_TIME']) / total_transfer_time * 100 + ]) + + column_names = ['id', 'time_imbalance'] + detected_files = pd.DataFrame(detected_files, columns=column_names) + + return detected_files + + @cached_property + def posix_time_stragglers_count(self) -> int: + if self._posix_time_stragglers_count is None: + self._posix_time_stragglers_count = len(self.posix_time_stragglers_df) + return self._posix_time_stragglers_count + + @property + def posix_write_imbalance_df(self) -> pd.DataFrame: + df = self.report.records[ModuleType.POSIX].to_df() + + aggregated = df['counters'].loc[(df['counters']['rank'] != -1)][ + ['rank', 'id', 'POSIX_BYTES_WRITTEN', 'POSIX_BYTES_READ'] + ].groupby('id', as_index=False).agg({ + 'rank': 'nunique', + 'POSIX_BYTES_WRITTEN': ['sum', 'min', 'max'], + 'POSIX_BYTES_READ': ['sum', 'min', 'max'] + }) + + aggregated.columns = list(map('_'.join, aggregated.columns.values)) + + aggregated = aggregated.assign(id=lambda d: d['id_'].astype(str)) + + # Get the files responsible + imbalance_count = 0 + + detected_files = [] + + for index, row in aggregated.iterrows(): + if row['POSIX_BYTES_WRITTEN_max'] and abs(row['POSIX_BYTES_WRITTEN_max'] - row['POSIX_BYTES_WRITTEN_min']) / \ + row['POSIX_BYTES_WRITTEN_max'] > config.thresholds['imbalance_size'][0]: + imbalance_count += 1 + + detected_files.append([ + row['id'], abs(row['POSIX_BYTES_WRITTEN_max'] - row['POSIX_BYTES_WRITTEN_min']) / row[ + 'POSIX_BYTES_WRITTEN_max'] * 100 + ]) + + column_names = ['id', 'write_imbalance'] + detected_files = pd.DataFrame(detected_files, columns=column_names) + + return detected_files + + @cached_property + def posix_write_imbalance_count(self) -> int: + if self._posix_write_imbalance_count is None: + self._posix_write_imbalance_count = len(self.posix_write_imbalance_df) + return self._posix_write_imbalance_count + + @property + def posix_read_imbalance_df(self) -> pd.DataFrame: + df = self.report.records[ModuleType.POSIX].to_df() + + aggregated = df['counters'].loc[(df['counters']['rank'] != -1)][ + ['rank', 'id', 'POSIX_BYTES_WRITTEN', 'POSIX_BYTES_READ'] + ].groupby('id', as_index=False).agg({ + 'rank': 'nunique', + 'POSIX_BYTES_WRITTEN': ['sum', 'min', 'max'], + 'POSIX_BYTES_READ': ['sum', 'min', 'max'] + }) + + aggregated.columns = list(map('_'.join, aggregated.columns.values)) + + aggregated = aggregated.assign(id=lambda d: d['id_'].astype(str)) + + + imbalance_count = 0 + + detected_files = [] + + for index, row in aggregated.iterrows(): + if row['POSIX_BYTES_READ_max'] and abs(row['POSIX_BYTES_READ_max'] - row['POSIX_BYTES_READ_min']) / row[ + 'POSIX_BYTES_READ_max'] > config.thresholds['imbalance_size'][0]: + imbalance_count += 1 + + detected_files.append([ + row['id'], + abs(row['POSIX_BYTES_READ_max'] - row['POSIX_BYTES_READ_min']) / row['POSIX_BYTES_READ_max'] * 100 + ]) + + column_names = ['id', 'read_imbalance'] + detected_files = pd.DataFrame(detected_files, columns=column_names) + + return detected_files + + @cached_property + def posix_read_imbalance_count(self) -> int: + if self._posix_read_imbalance_count is None: + self._posix_read_imbalance_count = 
len(self.posix_read_imbalance_df) + return self._posix_read_imbalance_count + + @cached_property + def mpi_coll_ops(self) -> MPICollectiveIOStats: + if self._mpi_coll_ops is None: + mpi_df = self.report.records[ModuleType.MPIIO].to_df() + mpi_coll_reads = mpi_df['counters']['MPIIO_COLL_READS'].sum() + mpiio_coll_writes = mpi_df['counters']['MPIIO_COLL_WRITES'].sum() + self._mpi_coll_ops = MPICollectiveIOStats(read=mpi_coll_reads, write=mpiio_coll_writes) + return self._mpi_coll_ops + + @cached_property + def mpi_indep_ops(self) -> MPIIndependentIOStats: + if self._mpi_indep_ops is None: + mpi_df = self.report.records[ModuleType.MPIIO].to_df() + mpi_indep_reads = mpi_df['counters']['MPIIO_INDEP_READS'].sum() + mpi_indep_writes = mpi_df['counters']['MPIIO_INDEP_WRITES'].sum() + self._mpi_indep_ops = MPIIndependentIOStats(read=mpi_indep_reads, write=mpi_indep_writes) + return self._mpi_indep_ops + + @property + def mpi_read_df(self) -> pd.DataFrame: + mpi_df = self.report.records[ModuleType.MPIIO].to_df() + counters = mpi_df['counters'] + mpi_coll_reads = self.mpi_coll_ops.read + mpi_total_reads = self.io_stats.get_module_ops(ModuleType.MPIIO, "read") + + detected_files = [] + + if mpi_coll_reads == 0 and mpi_total_reads and mpi_total_reads > \ + config.thresholds['collective_operations_absolute'][0]: + files = pd.DataFrame(counters.groupby('id').sum()).reset_index() + for index, row in counters.iterrows(): + if ((row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) and + row['MPIIO_INDEP_READS'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > + config.thresholds['collective_operations'][0] and + (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > + config.thresholds['collective_operations_absolute'][0]): + detected_files.append([ + row['id'], row['MPIIO_INDEP_READS'], + row['MPIIO_INDEP_READS'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) * 100 + ]) + + column_names = ['id', 'absolute_indep_reads', 'percent_indep_reads'] + detected_files = pd.DataFrame(detected_files, columns=column_names) + + return detected_files + + @property + def dxt_mpi_df(self) -> Optional[pd.DataFrame]: + if not parser.args.backtrace: + return None + if "DXT_MPIIO" not in self.modules: + return None + + dxt_mpiio = self.report.records["DXT_MPIIO"].to_df() + dxt_mpiio = pd.DataFrame(dxt_mpiio) + return dxt_mpiio + + @property + def mpi_write_df(self) -> pd.DataFrame: + mpi_df = self.report.records[ModuleType.MPIIO].to_df() + counters = mpi_df['counters'] + + mpi_coll_writes = self.mpi_coll_ops.write + total_mpiio_write_operations = self.io_stats.get_module_ops(ModuleType.MPIIO, "write") + + + detected_files = [] + if mpi_coll_writes == 0 and total_mpiio_write_operations and total_mpiio_write_operations > \ + config.thresholds['collective_operations_absolute'][0]: + files = pd.DataFrame(counters.groupby('id').sum()).reset_index() + + for index, row in counters.iterrows(): + if ((row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) and + row['MPIIO_INDEP_WRITES'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > + config.thresholds['collective_operations'][0] and + (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > + config.thresholds['collective_operations_absolute'][0]): + detected_files.append([ + row['id'], row['MPIIO_INDEP_WRITES'], + row['MPIIO_INDEP_WRITES'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) * 100 + ]) + + column_names = ['id', 'absolute_indep_writes', 'percent_indep_writes'] + detected_files = pd.DataFrame(detected_files, columns=column_names) + 
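As an illustrative aside, the per-file rule applied in mpi_read_df and mpi_write_df boils down to: flag a file when collective operations are absent, independent operations dominate the read (or write) mix beyond a ratio threshold, and the absolute count is large enough to matter. A standalone sketch of that predicate follows; the 0.5 ratio and 1000-operation cutoffs are placeholders, not the project's defaults, which live in config.thresholds['collective_operations'] and config.thresholds['collective_operations_absolute']:

    def indep_reads_dominate(indep_reads: int, indep_writes: int,
                             ratio_threshold: float = 0.5,
                             absolute_threshold: int = 1000) -> bool:
        """Mirror of the per-row check: independent reads dominate and are numerous."""
        total_indep = indep_reads + indep_writes
        return bool(
            total_indep
            and indep_reads / total_indep > ratio_threshold
            and total_indep > absolute_threshold
        )

    assert indep_reads_dominate(indep_reads=90_000, indep_writes=10_000)
    assert not indep_reads_dominate(indep_reads=400, indep_writes=100)  # too few operations overall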
+ return detected_files + + @cached_property + def mpiio_nb_ops(self) -> MPIIONonBlockingStats: + if self._mpiio_nb_ops is None: + mpi_df = self.report.records[ModuleType.MPIIO].to_df() + mpi_nb_reads = mpi_df['counters']['MPIIO_NB_READS'].sum() + mpi_nb_writes = mpi_df['counters']['MPIIO_NB_WRITES'].sum() + self._mpiio_nb_ops = MPIIONonBlockingStats(read=mpi_nb_reads, write=mpi_nb_writes) + return self._mpiio_nb_ops + + @cached_property + def has_hdf5_extension(self) -> bool: + if self._has_hdf5_extension is None: + self._has_hdf5_extension = False + mpi_df = self.report.records[ModuleType.MPIIO].to_df() + # for index, row in mpi_df['counters'].iterrows(): # Implicitly converts all data to np.float64. Problematic for id (np.uint64) + for row in mpi_df['counters'].itertuples(index=False): + # if self.file_map[int(row['id'])].endswith('.h5') or self.file_map[int(row['id'])].endswith('.hdf5'): + if self.file_map[row.id].endswith('.h5') or self.file_map[row.id].endswith('.hdf5'): + self._has_hdf5_extension = True + break + return self._has_hdf5_extension + + @cached_property + def cb_nodes(self) -> int: + if self._cb_nodes is None: + assert ModuleType.MPIIO in self.modules, "Missing MPIIO module" + hints = "" + if 'h' in self.report.metadata['job']['metadata']: + hints = self.report.metadata['job']['metadata']['h'] + if hints: + hints = hints.split(';') + + cb_nodes = None + + for hint in hints: + if hint != 'no': + (key, value) = hint.split('=') + + if key == 'cb_nodes': + cb_nodes = value + return self._cb_nodes + + @cached_property + def number_of_compute_nodes(self) -> int: + if self._number_of_compute_nodes is None: + assert ModuleType.MPIIO in self.modules, "Missing MPIIO module" + command = 'sacct --job {} --format=JobID,JobIDRaw,NNodes,NCPUs --parsable2 --delimiter ","'.format( + self.report.metadata['job']['jobid'] + ) + arguments = shlex.split(command) + + try: + result = subprocess.run(arguments, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + if result.returncode == 0: + # We have successfully fetched the information from SLURM + db = csv.DictReader(io.StringIO(result.stdout.decode('utf-8'))) + + try: + first = next(db) + + if 'NNodes' in first: + self._number_of_compute_nodes = first['NNodes'] + + except StopIteration: + pass + except FileNotFoundError: + pass + return self._number_of_compute_nodes diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index ea690f3..86dcf6f 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -1,18 +1,66 @@ #!/usr/bin/env python3 +import csv +import datetime import io -import sys -import time +import os import shlex import shutil import subprocess -import pandas as pd -import darshan -import darshan.backend.cffi_backend as darshanll +import sys +import time -from rich import print +import darshan # type: ignore +import darshan.backend.cffi_backend as darshanll # type: ignore +import numpy as np +import pandas as pd from packaging import version -from drishti.includes.module import * +from rich import print +from rich.padding import Padding +from rich.panel import Panel + +from drishti.handlers.darshan_util import DarshanFile, ModuleType + +from drishti.includes.config import ( + HIGH, + RECOMMENDATIONS, + WARN, + init_console, + insights_total, + thresholds, +) + +# from drishti.includes.module import * +import drishti.includes.module as module + +# from drishti.includes.module import ( +# check_individual_read_imbalance, +# check_individual_write_imbalance, +# 
check_long_metadata, +# check_misaligned, +# check_mpi_aggregator, +# check_mpi_collective_read_operation, +# check_mpi_collective_write_operation, +# check_mpi_none_block_operation, +# check_mpiio, +# check_operation_intensive, +# check_random_operation, +# check_shared_data_imblance, +# check_shared_small_operation, +# check_shared_time_imbalance, +# check_size_intensive, +# check_small_operation, +# check_stdio, +# check_traffic, +# display_content, +# display_footer, +# display_thresholds, +# export_csv, +# export_html, +# export_svg, +# ) +import drishti.includes.parser as parser +# from drishti.includes.parser import args def is_available(name): @@ -75,7 +123,8 @@ def handler(): insights_start_time = time.time() - log = darshanll.log_open(args.log_path) + darshan_log_path = parser.args.log_paths[0] + log = darshanll.log_open(darshan_log_path) modules = darshanll.log_get_modules(log) @@ -88,8 +137,8 @@ def handler(): library_version = darshanll.get_lib_version() # Make sure log format is of the same version - filename = args.log_path - # check_log_version(console, args.log_path, log_version, library_version) + filename = darshan_log_path + # check_log_version(console, darshan_log_path, log_version, library_version) darshanll.log_close(log) @@ -99,6 +148,9 @@ def handler(): job = report.metadata + ######################################################################################################################################################################### + darshan_file_obj = DarshanFile(file_path=darshan_log_path) + ######################################################################################################################################################################### # Check usage of STDIO, POSIX, and MPI-IO per file @@ -156,13 +208,12 @@ def handler(): df_lustre = None if "LUSTRE" in report.records: df_lustre = report.records['LUSTRE'].to_df() - - if args.backtrace: + if parser.args.backtrace: if "DXT_POSIX" in report.records: dxt_posix = report.records["DXT_POSIX"].to_df() dxt_posix = pd.DataFrame(dxt_posix) if "address_line_mapping" not in dxt_posix: - args.backtrace = False + parser.args.backtrace = False else: read_id = [] read_rank = [] @@ -290,8 +341,10 @@ def handler(): 'mpiio': uses_mpiio } - check_stdio(total_size, total_size_stdio) - check_mpiio(modules) + # module.check_stdio(total_size, total_size_stdio) + module.check_stdio(total_size=darshan_file_obj.io_stats.total_bytes, total_size_stdio=darshan_file_obj.io_stats.stdio_size) + # module.check_mpiio(modules) + module.check_mpiio(modules=darshan_file_obj.modules) ######################################################################################################################################################################### @@ -305,17 +358,27 @@ def handler(): total_writes = df['counters']['POSIX_WRITES'].sum() # Get total number of I/O operations - total_operations = total_writes + total_reads - - # To check whether the application is write-intersive or read-intensive we only look at the POSIX level and check if the difference between reads and writes is larger than 10% (for more or less), otherwise we assume a balance - check_operation_intensive(total_operations, total_reads, total_writes) + total_operations = total_writes + total_reads + + # To check whether the application is write-intensive or read-intensive we only look at the POSIX level and check if the difference between reads and writes is larger than 10% (for more or less), otherwise we assume a balance + # 
module.check_operation_intensive(total_operations, total_reads, total_writes) + module.check_operation_intensive( + total_operations=darshan_file_obj.io_stats.posix_ops, + total_reads=darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX, "read"), + total_writes=darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX, "write"), + ) total_read_size = df['counters']['POSIX_BYTES_READ'].sum() total_written_size = df['counters']['POSIX_BYTES_WRITTEN'].sum() total_size = total_written_size + total_read_size - check_size_intensive(total_size, total_read_size, total_written_size) + # module.check_size_intensive(total_size, total_read_size, total_written_size) + module.check_size_intensive( + total_size=darshan_file_obj.io_stats.posix_size, + total_read_size=darshan_file_obj.io_stats.get_module_size(ModuleType.POSIX, "read"), + total_written_size=darshan_file_obj.io_stats.get_module_size(ModuleType.POSIX, "write"), + ) ######################################################################################################################################################################### @@ -359,7 +422,19 @@ def handler(): detected_files.columns = ['id', 'total_reads', 'total_writes'] detected_files.loc[:, 'id'] = detected_files.loc[:, 'id'].astype(str) - check_small_operation(total_reads, total_reads_small, total_writes, total_writes_small, detected_files, modules, file_map, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + + # module.check_small_operation(total_reads, total_reads_small, total_writes, total_writes_small, detected_files, modules, file_map, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + module.check_small_operation( + total_reads=darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX, "read"), + total_reads_small=darshan_file_obj.posix_small_io.read, + total_writes=darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX, "write"), + total_writes_small=darshan_file_obj.posix_small_io.write, + detected_files=darshan_file_obj.posix_detected_small_files, modules=darshan_file_obj.modules, + file_map=darshan_file_obj.file_map, + dxt_posix=darshan_file_obj.dxt_posix_df, + dxt_posix_read_data=darshan_file_obj.dxt_posix_read_df, + dxt_posix_write_data=darshan_file_obj.dxt_posix_write_df, + ) ######################################################################################################################################################################### @@ -368,7 +443,17 @@ def handler(): total_mem_not_aligned = df['counters']['POSIX_MEM_NOT_ALIGNED'].sum() total_file_not_aligned = df['counters']['POSIX_FILE_NOT_ALIGNED'].sum() - check_misaligned(total_operations, total_mem_not_aligned, total_file_not_aligned, modules, file_map, df_lustre, dxt_posix, dxt_posix_read_data) + # module.check_misaligned(total_operations, total_mem_not_aligned, total_file_not_aligned, modules, file_map, df_lustre, dxt_posix, dxt_posix_read_data) + module.check_misaligned( + total_operations=darshan_file_obj.io_stats.posix_ops, + total_mem_not_aligned=darshan_file_obj.mem_not_aligned, + total_file_not_aligned=darshan_file_obj.file_not_aligned, + modules=darshan_file_obj.modules, + file_map=darshan_file_obj.file_map, + df_lustre=darshan_file_obj.lustre_df, + dxt_posix=darshan_file_obj.dxt_posix_df, + dxt_posix_read_data=darshan_file_obj.dxt_posix_read_df, + ) ######################################################################################################################################################################### @@ -377,7 +462,16 @@ def handler(): max_read_offset = 
df['counters']['POSIX_MAX_BYTE_READ'].max() max_write_offset = df['counters']['POSIX_MAX_BYTE_WRITTEN'].max() - check_traffic(max_read_offset, total_read_size, max_write_offset, total_written_size, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + # module.check_traffic(max_read_offset, total_read_size, max_write_offset, total_written_size, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + module.check_traffic( + max_read_offset=darshan_file_obj.max_read_offset, + total_read_size=darshan_file_obj.io_stats.get_module_size(ModuleType.POSIX, "read"), + max_write_offset=darshan_file_obj.max_write_offset, + total_written_size=darshan_file_obj.io_stats.get_module_size(ModuleType.POSIX, "write"), + dxt_posix=darshan_file_obj.dxt_posix_df, + dxt_posix_read_data=darshan_file_obj.dxt_posix_read_df, + dxt_posix_write_data=darshan_file_obj.dxt_posix_write_df, + ) ######################################################################################################################################################################### @@ -402,7 +496,30 @@ def handler(): write_random = total_writes - write_consecutive - write_sequential #print('WRITE Random: {} ({:.2f}%)'.format(write_random, write_random / total_writes * 100)) - check_random_operation(read_consecutive, read_sequential, read_random, total_reads, write_consecutive, write_sequential, write_random, total_writes, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + + assert read_consecutive == darshan_file_obj.posix_read_consecutive + assert read_sequential == darshan_file_obj.posix_read_sequential + assert read_random == darshan_file_obj.posix_read_random, f"{read_random} != {darshan_file_obj.posix_read_random}" + assert total_reads == darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX,"read"), f"{total_reads} != {darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX, 'read')}" + assert write_consecutive == darshan_file_obj.posix_write_consecutive + assert write_sequential == darshan_file_obj.posix_write_sequential + assert write_random == darshan_file_obj.posix_write_random + assert total_writes == darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX,"write") + + # module.check_random_operation(read_consecutive, read_sequential, read_random, total_reads, write_consecutive, write_sequential, write_random, total_writes, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + module.check_random_operation( + read_consecutive=darshan_file_obj.posix_read_consecutive, + read_sequential=darshan_file_obj.posix_read_sequential, + read_random=darshan_file_obj.posix_read_random, + total_reads=darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX,"read"), + write_consecutive=darshan_file_obj.posix_write_consecutive, + write_sequential=darshan_file_obj.posix_write_sequential, + write_random=darshan_file_obj.posix_write_random, + total_writes=darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX,"write"), + dxt_posix=darshan_file_obj.dxt_posix_df, + dxt_posix_read_data=darshan_file_obj.dxt_posix_read_df, + dxt_posix_write_data=darshan_file_obj.dxt_posix_write_df, + ) ######################################################################################################################################################################### @@ -413,6 +530,7 @@ def handler(): shared_files = shared_files.assign(id=lambda d: d['id'].astype(str)) if not shared_files.empty: + # TODO: This entire conditional total_shared_reads = shared_files['POSIX_READS'].sum() total_shared_reads_small = ( 
shared_files['POSIX_SIZE_READ_0_100'].sum() + @@ -448,16 +566,22 @@ def handler(): shared_files['POSIX_SIZE_WRITE_100K_1M'] ) - check_shared_small_operation(total_shared_reads, total_shared_reads_small, total_shared_writes, total_shared_writes_small, shared_files, file_map) + # module.check_shared_small_operation(total_shared_reads, total_shared_reads_small, total_shared_writes, total_shared_writes_small, shared_files, file_map) + assert total_shared_reads == darshan_file_obj.posix_shared_reads + sys.exit(2) + module.check_shared_small_operation(total_shared_reads, total_shared_reads_small, total_shared_writes, total_shared_writes_small, shared_files, file_map) ######################################################################################################################################################################### count_long_metadata = len(df['fcounters'][(df['fcounters']['POSIX_F_META_TIME'] > thresholds['metadata_time_rank'][0])]) - check_long_metadata(count_long_metadata, modules) + assert darshan_file_obj.posix_long_metadata_count == count_long_metadata + assert darshan_file_obj.modules == modules.keys(), f"{darshan_file_obj.modules} != {modules.keys()}" + # module.check_long_metadata(count_long_metadata, modules) + module.check_long_metadata(count_long_metadata=darshan_file_obj.posix_long_metadata_count, modules=darshan_file_obj.modules) # We already have a single line for each shared-file access - # To check for stragglers, we can check the difference between the + # To check for stragglers, we can check the difference between the # POSIX_FASTEST_RANK_BYTES # POSIX_SLOWEST_RANK_BYTES @@ -482,7 +606,21 @@ def handler(): column_names = ['id', 'data_imbalance'] detected_files = pd.DataFrame(detected_files, columns=column_names) - check_shared_data_imblance(stragglers_count, detected_files, file_map, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + assert stragglers_count == darshan_file_obj.posix_data_stragglers_count, f"{stragglers_count} != {darshan_file_obj.posix_data_stragglers_count}" + assert detected_files.equals(darshan_file_obj.posix_data_stragglers_df), f"{detected_files} != {darshan_file_obj.posix_data_stragglers_df}" + assert file_map == darshan_file_obj.file_map, f"{file_map} != {darshan_file_obj.file_map}" + assert dxt_posix == darshan_file_obj.dxt_posix_df, f"{dxt_posix} != {darshan_file_obj.dxt_posix_df}" + assert dxt_posix_read_data == darshan_file_obj.dxt_posix_read_df, f"{dxt_posix_read_data} != {darshan_file_obj.dxt_posix_read_df}" + assert dxt_posix_write_data == darshan_file_obj.dxt_posix_write_df, f"{dxt_posix_write_data} != {darshan_file_obj.dxt_posix_write_df}" + # module.check_shared_data_imblance(stragglers_count, detected_files, file_map, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + module.check_shared_data_imblance( + stragglers_count=darshan_file_obj.posix_data_stragglers_count, + detected_files=darshan_file_obj.posix_data_stragglers_df, + file_map=darshan_file_obj.file_map, + dxt_posix=darshan_file_obj.dxt_posix_df, + dxt_posix_read_data = darshan_file_obj.dxt_posix_read_df, + dxt_posix_write_data = darshan_file_obj.dxt_posix_write_df + ) # POSIX_F_FASTEST_RANK_TIME # POSIX_F_SLOWEST_RANK_TIME @@ -494,7 +632,7 @@ def handler(): detected_files = [] stragglers_count = 0 - stragglers_imbalance = {} + # stragglers_imbalance = {} shared_files_times = shared_files_times.assign(id=lambda d: d['id'].astype(str)) @@ -510,7 +648,17 @@ def handler(): column_names = ['id', 'time_imbalance'] detected_files = pd.DataFrame(detected_files, 
columns=column_names) - check_shared_time_imbalance(stragglers_count, detected_files, file_map) + + assert stragglers_count == darshan_file_obj.posix_time_stragglers_count, f"{stragglers_count} != {darshan_file_obj.posix_time_stragglers_count}" + assert detected_files.equals(darshan_file_obj.posix_time_stragglers_df), f"{detected_files} != {darshan_file_obj.posix_time_stragglers_df}" + assert file_map == darshan_file_obj.file_map, f"{file_map} != {darshan_file_obj.file_map}" + + # module.check_shared_time_imbalance(stragglers_count, detected_files, file_map) + module.check_shared_time_imbalance( + stragglers_count=darshan_file_obj.posix_time_stragglers_count, + detected_files=darshan_file_obj.posix_time_stragglers_df, + file_map=darshan_file_obj.file_map, + ) aggregated = df['counters'].loc[(df['counters']['rank'] != -1)][ ['rank', 'id', 'POSIX_BYTES_WRITTEN', 'POSIX_BYTES_READ'] @@ -539,7 +687,22 @@ def handler(): column_names = ['id', 'write_imbalance'] detected_files = pd.DataFrame(detected_files, columns=column_names) - check_individual_write_imbalance(imbalance_count, detected_files, file_map, dxt_posix, dxt_posix_write_data) + + assert imbalance_count == darshan_file_obj.posix_write_imbalance_count, f"{imbalance_count} != {darshan_file_obj.posix_write_imbalance_count}" + assert detected_files.equals(darshan_file_obj.posix_write_imbalance_df), f"{detected_files} != {darshan_file_obj.posix_write_imbalance_df}" + assert file_map == darshan_file_obj.file_map, f"{file_map} != {darshan_file_obj.file_map}" + assert dxt_posix == darshan_file_obj.dxt_posix_df, f"{dxt_posix} != {darshan_file_obj.dxt_posix_df}" + assert dxt_posix_read_data == darshan_file_obj.dxt_posix_read_df, f"{dxt_posix_read_data} != {darshan_file_obj.dxt_posix_read_df}" + assert dxt_posix_write_data == darshan_file_obj.dxt_posix_write_df, f"{dxt_posix_write_data} != {darshan_file_obj.dxt_posix_write_df}" + + # module.check_individual_write_imbalance(imbalance_count, detected_files, file_map, dxt_posix, dxt_posix_write_data) + module.check_individual_write_imbalance( + imbalance_count=darshan_file_obj.posix_write_imbalance_count, + detected_files=darshan_file_obj.posix_write_imbalance_df, + file_map=darshan_file_obj.file_map, + dxt_posix=darshan_file_obj.dxt_posix_df, + dxt_posix_write_data=darshan_file_obj.dxt_posix_write_df + ) imbalance_count = 0 @@ -555,7 +718,21 @@ def handler(): column_names = ['id', 'read_imbalance'] detected_files = pd.DataFrame(detected_files, columns=column_names) - check_individual_read_imbalance(imbalance_count, detected_files, file_map, dxt_posix, dxt_posix_read_data) + + assert imbalance_count == darshan_file_obj.posix_read_imbalance_count, f"{imbalance_count} != {darshan_file_obj.posix_read_imbalance_count}" + assert detected_files.equals(darshan_file_obj.posix_read_imbalance_df), f"{detected_files} != {darshan_file_obj.posix_read_imbalance_df}" + assert file_map == darshan_file_obj.file_map, f"{file_map} != {darshan_file_obj.file_map}" + assert dxt_posix == darshan_file_obj.dxt_posix_df, f"{dxt_posix} != {darshan_file_obj.dxt_posix_df}" + assert dxt_posix_read_data == darshan_file_obj.dxt_posix_read_df, f"{dxt_posix_read_data} != {darshan_file_obj.dxt_posix_read_df}" + + # module.check_individual_read_imbalance(imbalance_count, detected_files, file_map, dxt_posix, dxt_posix_read_data) + module.check_individual_read_imbalance( + imbalance_count=darshan_file_obj.posix_read_imbalance_count, + detected_files=darshan_file_obj.posix_read_imbalance_df, + file_map=darshan_file_obj.file_map, + 
dxt_posix=darshan_file_obj.dxt_posix_df, + dxt_posix_read_data=darshan_file_obj.dxt_posix_read_df + ) ######################################################################################################################################################################### @@ -590,7 +767,30 @@ def handler(): column_names = ['id', 'absolute_indep_reads', 'percent_indep_reads'] detected_files = pd.DataFrame(detected_files, columns=column_names) - check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, total_mpiio_read_operations, detected_files, file_map, dxt_mpiio) + assert mpiio_coll_reads == darshan_file_obj.mpi_coll_ops.read, f"{mpiio_coll_reads} != {darshan_file_obj.mpi_coll_ops.read}" + assert mpiio_indep_reads == darshan_file_obj.mpi_indep_ops.read, f"{mpiio_indep_reads} != {darshan_file_obj.mpi_indep_ops.read}" + assert total_mpiio_read_operations == darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "read"), f"{total_mpiio_read_operations} != {darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, 'read')}" + if detected_files.empty: + assert detected_files.empty, f"{detected_files} != {darshan_file_obj.mpi_read_df}" + assert darshan_file_obj.mpi_read_df.empty, f"{darshan_file_obj.mpi_read_df} != {detected_files}" + else: + assert detected_files.equals(darshan_file_obj.mpi_read_df), f"{detected_files} != {darshan_file_obj.mpi_read_df}" + assert file_map == darshan_file_obj.file_map, f"{file_map} != {darshan_file_obj.file_map}" + if dxt_mpiio is None: + assert dxt_mpiio is None, f"{dxt_mpiio} != {darshan_file_obj.dxt_mpi_df}" + assert darshan_file_obj.dxt_mpi_df is None, f"{darshan_file_obj.dxt_mpi_df} != {dxt_mpiio}" + else: + assert dxt_mpiio.equals(darshan_file_obj.dxt_mpi_df), f"{dxt_mpiio} != {darshan_file_obj.dxt_mpi_df}" + + # module.check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, total_mpiio_read_operations, detected_files, file_map, dxt_mpiio) + module.check_mpi_collective_read_operation( + mpiio_coll_reads=darshan_file_obj.mpi_coll_ops.read, + mpiio_indep_reads=darshan_file_obj.mpi_indep_ops.read, + total_mpiio_read_operations=darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "read"), + detected_files=darshan_file_obj.mpi_read_df, + file_map=darshan_file_obj.file_map, + dxt_mpiio=darshan_file_obj.dxt_mpi_df + ) df_mpiio_collective_writes = df_mpiio['counters'] #.loc[(df_mpiio['counters']['MPIIO_COLL_WRITES'] > 0)] @@ -615,7 +815,30 @@ def handler(): column_names = ['id', 'absolute_indep_writes', 'percent_indep_writes'] detected_files = pd.DataFrame(detected_files, columns=column_names) - check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, total_mpiio_write_operations, detected_files, file_map, dxt_mpiio) + assert mpiio_indep_writes == darshan_file_obj.mpi_indep_ops.write, f"{mpiio_indep_writes} != {darshan_file_obj.mpi_indep_ops.write}" + assert mpiio_coll_writes == darshan_file_obj.mpi_coll_ops.write, f"{mpiio_coll_writes} != {darshan_file_obj.mpi_coll_ops.write}" + assert total_mpiio_write_operations == darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "write"), f"{total_mpiio_write_operations} != {darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, 'write')}" + if detected_files.empty: + assert detected_files.empty, f"{detected_files} !={darshan_file_obj.mpi_write_df}" + assert darshan_file_obj.mpi_write_df.empty, f"{darshan_file_obj.mpi_write_df} != {detected_files}" + else: + assert detected_files.equals(darshan_file_obj.mpi_write_df), f"{detected_files} != 
{darshan_file_obj.mpi_write_df}" + assert file_map == darshan_file_obj.file_map, f"{file_map} != {darshan_file_obj.file_map}" + if dxt_mpiio is None: + assert dxt_mpiio is None, f"{dxt_mpiio} != {darshan_file_obj.dxt_mpi_df}" + assert darshan_file_obj.dxt_mpi_df is None, f"{darshan_file_obj.dxt_mpi_df} != {dxt_mpiio}" + else: + assert dxt_mpiio.equals(darshan_file_obj.dxt_mpi_df), f"{dxt_mpiio} != {darshan_file_obj.dxt_mpi_df}" + + # module.check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, total_mpiio_write_operations, detected_files, file_map, dxt_mpiio) + module.check_mpi_collective_write_operation( + mpiio_coll_writes=darshan_file_obj.mpi_coll_ops.write, + mpiio_indep_writes=darshan_file_obj.mpi_indep_ops.write, + total_mpiio_write_operations=darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "write"), + detected_files=darshan_file_obj.mpi_write_df, + file_map=darshan_file_obj.file_map, + dxt_mpiio=darshan_file_obj.dxt_mpi_df, + ) ######################################################################################################################################################################### @@ -632,7 +855,18 @@ def handler(): mpiio_nb_reads = df_mpiio['counters']['MPIIO_NB_READS'].sum() mpiio_nb_writes = df_mpiio['counters']['MPIIO_NB_WRITES'].sum() - check_mpi_none_block_operation(mpiio_nb_reads, mpiio_nb_writes, has_hdf5_extension, modules) + assert mpiio_nb_reads == darshan_file_obj.mpiio_nb_ops.read + assert mpiio_nb_writes == darshan_file_obj.mpiio_nb_ops.write + assert modules.keys() == darshan_file_obj.modules, f"{modules.keys()} != {darshan_file_obj.modules}" + assert has_hdf5_extension == darshan_file_obj.has_hdf5_extension, f"{has_hdf5_extension} != {darshan_file_obj.has_hdf5_extension}" + + # module.check_mpi_none_block_operation(mpiio_nb_reads, mpiio_nb_writes, has_hdf5_extension, modules) + module.check_mpi_none_block_operation( + mpiio_nb_reads=darshan_file_obj.mpiio_nb_ops.read, + mpiio_nb_writes=darshan_file_obj.mpiio_nb_ops.write, + has_hdf5_extension=darshan_file_obj.has_hdf5_extension, + modules=darshan_file_obj.modules, + ) ######################################################################################################################################################################### @@ -680,8 +914,14 @@ def handler(): if 'NNodes' in first: NUMBER_OF_COMPUTE_NODES = first['NNodes'] + assert cb_nodes == darshan_file_obj.cb_nodes, f"{cb_nodes} != {darshan_file_obj.cb_nodes}" + assert NUMBER_OF_COMPUTE_NODES == darshan_file_obj.number_of_compute_nodes, f"{NUMBER_OF_COMPUTE_NODES} != {darshan_file_obj.number_of_compute_nodes}" # Do we have one MPI-IO aggregator per node? 
- check_mpi_aggregator(cb_nodes, NUMBER_OF_COMPUTE_NODES) + # module.check_mpi_aggregator(cb_nodes, NUMBER_OF_COMPUTE_NODES) + module.check_mpi_aggregator( + cb_nodes=darshan_file_obj.cb_nodes, + NUMBER_OF_COMPUTE_NODES=darshan_file_obj.number_of_compute_nodes + ) except StopIteration: pass except FileNotFoundError: @@ -711,7 +951,7 @@ def handler(): job['exe'].split()[0] ), ' [b]DARSHAN[/b]: [white]{}[/white]'.format( - os.path.basename(args.log_path) + os.path.basename(darshan_log_path) ), ' [b]EXECUTION TIME[/b]: [white]{} to {} ({:.2f} hours)[/white]'.format( job_start, @@ -748,14 +988,14 @@ def handler(): console.print() - display_content(console) - display_thresholds(console) - display_footer(console, insights_start_time, insights_end_time) + module.display_content(console) + module.display_thresholds(console) + module.display_footer(console, insights_start_time, insights_end_time) # Export to HTML, SVG, and CSV - trace_name = os.path.basename(args.log_path).replace('.darshan', '') - out_dir = args.export_dir if args.export_dir != "" else os.getcwd() + trace_name = os.path.basename(darshan_log_path).replace('.darshan', '') + out_dir = parser.args.export_dir if parser.args.export_dir != "" else os.getcwd() - export_html(console, out_dir, trace_name) - export_svg(console, out_dir, trace_name) - export_csv(out_dir, trace_name, job['job']['jobid']) + module.export_html(console, out_dir, trace_name) + module.export_svg(console, out_dir, trace_name) + module.export_csv(out_dir, trace_name, job['job']['jobid']) diff --git a/drishti/includes/module.py b/drishti/includes/module.py index 9c2df16..52fac10 100644 --- a/drishti/includes/module.py +++ b/drishti/includes/module.py @@ -1,137 +1,211 @@ #!/usr/bin/env python3 -import datetime import csv +import datetime +import os import time +import typing + import pandas as pd from rich import box +from rich.console import Group +from rich.padding import Padding +from rich.panel import Panel from rich.syntax import Syntax -from drishti.includes.config import * -''' +from drishti.includes.config import * +from drishti.includes.config import ( + HIGH, + INFO, + OK, + ROOT, + TARGET_DEVELOPER, + TARGET_USER, + WARN, + codes, + convert_bytes, + csv_report, + insights_dxt, + insights_metadata, + insights_operation, + message, + set_export_theme, + thresholds, +) +from drishti.includes.parser import args + +""" Before calling the functions below Make sure the variables passed are in the given structure: file_map: a dict of (id, path) pair modules: a set or a dict should be ok detected_files: A pandas dataframe -''' +""" # Basic usage check + def check_stdio(total_size, total_size_stdio): - ''' + """ Check whether the application has excessively utilized standard input/output operations Parameters: total_size: total I/O size total_size_stdio: total STDIO size - - ''' - - if total_size and total_size_stdio / total_size > thresholds['interface_stdio'][0]: - thresholds['interface_stdio'][1] = True - issue = 'Application is using STDIO, a low-performance interface, for {:.2f}% of its data transfers ({})'.format( - total_size_stdio / total_size * 100.0, - convert_bytes(total_size_stdio) + + """ + + if total_size and total_size_stdio / total_size > thresholds["interface_stdio"][0]: + thresholds["interface_stdio"][1] = True + issue = "Application is using STDIO, a low-performance interface, for {:.2f}% of its data transfers ({})".format( + total_size_stdio / total_size * 100.0, convert_bytes(total_size_stdio) ) recommendation = [ { - 'message': 'Consider 
switching to a high-performance I/O interface such as MPI-IO' + "message": "Consider switching to a high-performance I/O interface such as MPI-IO" } ] insights_operation.append( - message(INSIGHTS_STDIO_HIGH_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) + message( + INSIGHTS_STDIO_HIGH_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation + ) ) -def check_mpiio(modules): - ''' +def check_mpiio(modules: typing.Iterable[str]): + """ Check whether the application has used MPI-IO or not Parameter: modules: all different mudules been used in the application - ''' + """ - if 'MPI-IO' not in modules: - issue = 'Application is using low-performance interface' + if "MPI-IO" not in modules: + issue = "Application is using low-performance interface" recommendation = [ { - 'message': 'Consider switching to a high-performance I/O interface such as MPI-IO' + "message": "Consider switching to a high-performance I/O interface such as MPI-IO" } ] insights_operation.append( - message(INSIGHTS_MPI_IO_NO_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation) + message( + INSIGHTS_MPI_IO_NO_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation + ) ) - # POSIX level check def check_operation_intensive(total_operations, total_reads, total_writes): - ''' + """ Check whether the application is read or write intensive Parameters: total_operations: number of I/O operations been executed by the application total_reads: number of read operations been executed by the application total_writes: number of write operations been executed by the application - ''' - - if total_writes > total_reads and total_operations and abs(total_writes - total_reads) / total_operations > thresholds['imbalance_operations'][0]: - issue = 'Application is write operation intensive ({:.2f}% writes vs. {:.2f}% reads)'.format( - total_writes / total_operations * 100.0, total_reads / total_operations * 100.0 + """ + + if ( + total_writes > total_reads + and total_operations + and abs(total_writes - total_reads) / total_operations + > thresholds["imbalance_operations"][0] + ): + issue = "Application is write operation intensive ({:.2f}% writes vs. {:.2f}% reads)".format( + total_writes / total_operations * 100.0, + total_reads / total_operations * 100.0, ) insights_metadata.append( - message(INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) + message( + INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE, + TARGET_DEVELOPER, + INFO, + issue, + None, + ) ) - if total_reads > total_writes and total_operations and abs(total_writes - total_reads) / total_operations > thresholds['imbalance_operations'][0]: - issue = 'Application is read operation intensive ({:.2f}% writes vs. {:.2f}% reads)'.format( - total_writes / total_operations * 100.0, total_reads / total_operations * 100.0 + if ( + total_reads > total_writes + and total_operations + and abs(total_writes - total_reads) / total_operations + > thresholds["imbalance_operations"][0] + ): + issue = "Application is read operation intensive ({:.2f}% writes vs. 
{:.2f}% reads)".format( + total_writes / total_operations * 100.0, + total_reads / total_operations * 100.0, ) insights_metadata.append( - message(INSIGHTS_POSIX_READ_COUNT_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) + message( + INSIGHTS_POSIX_READ_COUNT_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None + ) ) def check_size_intensive(total_size, total_read_size, total_written_size): - ''' + """ Check whether the application is read size intensive or written size intensive Parameters: total_size: Total I/O size measured in byte total_read_size: Input I/O size measured in byte total_written_size: Output I/O size measured in byte - ''' - - if total_written_size > total_read_size and abs(total_written_size - total_read_size) / total_size > thresholds['imbalance_operations'][0]: - issue = 'Application is write size intensive ({:.2f}% write vs. {:.2f}% read)'.format( - total_written_size / total_size * 100.0, total_read_size / total_size * 100.0 + """ + + if ( + total_written_size > total_read_size + and abs(total_written_size - total_read_size) / total_size + > thresholds["imbalance_operations"][0] + ): + issue = "Application is write size intensive ({:.2f}% write vs. {:.2f}% read)".format( + total_written_size / total_size * 100.0, + total_read_size / total_size * 100.0, ) insights_metadata.append( - message(INSIGHTS_POSIX_WRITE_SIZE_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) + message( + INSIGHTS_POSIX_WRITE_SIZE_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None + ) ) - if total_read_size > total_written_size and abs(total_written_size - total_read_size) / total_size > thresholds['imbalance_operations'][0]: - issue = 'Application is read size intensive ({:.2f}% write vs. {:.2f}% read)'.format( - total_written_size / total_size * 100.0, total_read_size / total_size * 100.0 + if ( + total_read_size > total_written_size + and abs(total_written_size - total_read_size) / total_size + > thresholds["imbalance_operations"][0] + ): + issue = "Application is read size intensive ({:.2f}% write vs. 
{:.2f}% read)".format( + total_written_size / total_size * 100.0, + total_read_size / total_size * 100.0, ) insights_metadata.append( - message(INSIGHTS_POSIX_READ_SIZE_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None) + message( + INSIGHTS_POSIX_READ_SIZE_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None + ) ) -def check_small_operation(total_reads, total_reads_small, total_writes, total_writes_small, detected_files, modules, file_map, dxt_posix=None, dxt_posix_read_data=None, dxt_posix_write_data=None): - ''' +def check_small_operation( + total_reads, + total_reads_small, + total_writes, + total_writes_small, + detected_files, + modules, + file_map, + dxt_posix=None, + dxt_posix_read_data=None, + dxt_posix_write_data=None, +): + """ Check whether application has performed an excessive number of small operations Parameters: @@ -139,17 +213,21 @@ def check_small_operation(total_reads, total_reads_small, total_writes, total_wr total_reads_small: number of read operations that has small size total_writes: number of write operations been executed by the application total_writes_small: number of write operations that has small size - detected_files: + detected_files: total_reads and total_writes in each file required columns: ['id', 'total_reads', 'total_writes'] modules: all different mudules been used in the application file_map: file id and file name pairing df_posix: all POSIX records - ''' - - if total_reads_small and total_reads_small / total_reads > thresholds['small_requests'][0] and total_reads_small > thresholds['small_requests_absolute'][0]: - thresholds['small_requests_absolute'][1] = True - issue = 'Application issues a high number ({}) of small read requests (i.e., < 1MB) which represents {:.2f}% of all read requests'.format( + """ + + if ( + total_reads_small + and total_reads_small / total_reads > thresholds["small_requests"][0] + and total_reads_small > thresholds["small_requests_absolute"][0] + ): + thresholds["small_requests_absolute"][1] = True + issue = "Application issues a high number ({}) of small read requests (i.e., < 1MB) which represents {:.2f}% of all read requests".format( total_reads_small, total_reads_small / total_reads * 100.0 ) @@ -159,63 +237,93 @@ def check_small_operation(total_reads, total_reads_small, total_writes, total_wr dxt_trigger_time = 0 for index, row in detected_files.iterrows(): - if row['total_reads'] > (total_reads * thresholds['small_requests'][0] / 2): + if row["total_reads"] > (total_reads * thresholds["small_requests"][0] / 2): detail.append( { - 'message': '{} ({:.2f}%) small read requests are to "{}"'.format( - row['total_reads'], - row['total_reads'] / total_reads * 100.0, - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + "message": '{} ({:.2f}%) small read requests are to "{}"'.format( + row["total_reads"], + row["total_reads"] / total_reads * 100.0, + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) # DXT Analysis if args.backtrace: start = time.time() - if file_count < thresholds['backtrace'][0]: - temp = dxt_posix.loc[dxt_posix['id'] == int(row['id'])] - temp_df = dxt_posix_read_data.loc[dxt_posix_read_data['id'] == int(row['id'])] - - if not temp_df.empty: - temp_df = temp_df.loc[temp_df['length'] < thresholds['small_requests'][0]] - small_read_requests_ranks = temp_df['rank'].unique() - if len(small_read_requests_ranks) > 0: - if len(small_read_requests_ranks) > 1 and int(small_read_requests_ranks[0]) == 0: - rank_df = 
temp.loc[(temp['rank'] == int(small_read_requests_ranks[1]))] + if file_count < thresholds["backtrace"][0]: + temp = dxt_posix.loc[dxt_posix["id"] == int(row["id"])] + temp_df = dxt_posix_read_data.loc[ + dxt_posix_read_data["id"] == int(row["id"]) + ] + + if not temp_df.empty: + temp_df = temp_df.loc[ + temp_df["length"] < thresholds["small_requests"][0] + ] + small_read_requests_ranks = temp_df["rank"].unique() + if len(small_read_requests_ranks) > 0: + if ( + len(small_read_requests_ranks) > 1 + and int(small_read_requests_ranks[0]) == 0 + ): + rank_df = temp.loc[ + ( + temp["rank"] + == int(small_read_requests_ranks[1]) + ) + ] else: - rank_df = temp.loc[(temp['rank'] == int(small_read_requests_ranks[0]))] - - rank_df = rank_df['read_segments'].iloc[0] - rank_addresses = rank_df['stack_memory_addresses'].iloc[0] - address = dxt_posix.iloc[0]['address_line_mapping']['address'] + rank_df = temp.loc[ + ( + temp["rank"] + == int(small_read_requests_ranks[0]) + ) + ] + + rank_df = rank_df["read_segments"].iloc[0] + rank_addresses = rank_df["stack_memory_addresses"].iloc[ + 0 + ] + address = dxt_posix.iloc[0]["address_line_mapping"][ + "address" + ] res = set(list(address)) & set(rank_addresses) - backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)] - + backtrace = dxt_posix.iloc[0][ + "address_line_mapping" + ].loc[ + dxt_posix.iloc[0]["address_line_mapping"][ + "address" + ].isin(res) + ] + if len(small_read_requests_ranks) > 0: detail.append( { - 'message': '{} rank(s) made small read requests in "{}". Below is the backtrace information:'.format( + "message": '{} rank(s) made small read requests in "{}". Below is the backtrace information:'.format( len(small_read_requests_ranks), - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) - + for index, row in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row['function_name'], - row['line_number'] - ) + "message": "{}: {}".format( + row["function_name"], row["line_number"] + ) } ) file_count += 1 else: detail.append( { - 'message': 'The backtrace information for this file is similar to the previous files' + "message": "The backtrace information for this file is similar to the previous files" } ) @@ -223,40 +331,57 @@ def check_small_operation(total_reads, total_reads_small, total_writes, total_wr time_taken = end - start dxt_trigger_time += time_taken - if dxt_trigger_time > 0: + if dxt_trigger_time > 0: detail.append( { - 'message': 'Time taken to process this trigger: {}s'.format(round(dxt_trigger_time, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(dxt_trigger_time, 5) + ) } ) recommendation.append( { - 'message': 'Consider buffering read operations into larger more contiguous ones' + "message": "Consider buffering read operations into larger more contiguous ones" } ) - if 'MPI-IO' in modules: + if "MPI-IO" in modules: recommendation.append( { - 'message': 'Since the appplication already uses MPI-IO, consider using collective I/O calls (e.g. MPI_File_read_all() or MPI_File_read_at_all()) to aggregate requests into larger ones', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-read.c'), line_numbers=True, background_color='default') + "message": "Since the appplication already uses MPI-IO, consider using collective I/O calls (e.g. 
MPI_File_read_all() or MPI_File_read_at_all()) to aggregate requests into larger ones", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/mpi-io-collective-read.c"), + line_numbers=True, + background_color="default", + ), } ) else: recommendation.append( { - 'message': 'Application does not use MPI-IO for operations, consider use this interface instead to harness collective operations' + "message": "Application does not use MPI-IO for operations, consider use this interface instead to harness collective operations" } ) insights_operation.append( - message(INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + message( + INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_USAGE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + detail, + ) ) - if total_writes_small and total_writes_small / total_writes > thresholds['small_requests'][0] and total_writes_small > thresholds['small_requests_absolute'][0]: - thresholds['small_requests_absolute'][1] = True - issue = 'Application issues a high number ({}) of small write requests (i.e., < 1MB) which represents {:.2f}% of all write requests'.format( + if ( + total_writes_small + and total_writes_small / total_writes > thresholds["small_requests"][0] + and total_writes_small > thresholds["small_requests_absolute"][0] + ): + thresholds["small_requests_absolute"][1] = True + issue = "Application issues a high number ({}) of small write requests (i.e., < 1MB) which represents {:.2f}% of all write requests".format( total_writes_small, total_writes_small / total_writes * 100.0 ) @@ -266,106 +391,162 @@ def check_small_operation(total_reads, total_reads_small, total_writes, total_wr recommendation = [] file_count = 0 for index, row in detected_files.iterrows(): - if row['total_writes'] > (total_writes * thresholds['small_requests'][0] / 2): + if row["total_writes"] > ( + total_writes * thresholds["small_requests"][0] / 2 + ): detail.append( { - 'message': '{} ({:.2f}%) small write requests are to "{}"'.format( - row['total_writes'], - row['total_writes'] / total_writes * 100.0, - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + "message": '{} ({:.2f}%) small write requests are to "{}"'.format( + row["total_writes"], + row["total_writes"] / total_writes * 100.0, + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) # DXT Analysis if args.backtrace: start = time.time() - if file_count < thresholds['backtrace'][0]: - temp = dxt_posix.loc[dxt_posix['id'] == int(row['id'])] - temp_df = dxt_posix_write_data.loc[dxt_posix_write_data['id'] == int(row['id'])] - - if not temp_df.empty: - temp_df = temp_df.loc[temp_df['length'] < thresholds['small_requests'][0]] - small_write_requests_ranks = temp_df['rank'].unique() + if file_count < thresholds["backtrace"][0]: + temp = dxt_posix.loc[dxt_posix["id"] == int(row["id"])] + temp_df = dxt_posix_write_data.loc[ + dxt_posix_write_data["id"] == int(row["id"]) + ] + + if not temp_df.empty: + temp_df = temp_df.loc[ + temp_df["length"] < thresholds["small_requests"][0] + ] + small_write_requests_ranks = temp_df["rank"].unique() if len(small_write_requests_ranks) > 0: - if int(small_write_requests_ranks[0]) == 0 and len(small_write_requests_ranks) > 1: - rank_df = temp.loc[(temp['rank'] == int(small_write_requests_ranks[1]))] + if ( + int(small_write_requests_ranks[0]) == 0 + and len(small_write_requests_ranks) > 1 + ): + rank_df = temp.loc[ + ( + temp["rank"] + == 
int(small_write_requests_ranks[1]) + ) + ] else: - rank_df = temp.loc[(temp['rank'] == int(small_write_requests_ranks[0]))] - - rank_df = temp.loc[(temp['rank'] == int(small_write_requests_ranks[0]))] - rank_df = rank_df['write_segments'].iloc[0] - rank_addresses = rank_df['stack_memory_addresses'].iloc[0] - address = dxt_posix.iloc[0]['address_line_mapping']['address'] + rank_df = temp.loc[ + ( + temp["rank"] + == int(small_write_requests_ranks[0]) + ) + ] + + rank_df = temp.loc[ + (temp["rank"] == int(small_write_requests_ranks[0])) + ] + rank_df = rank_df["write_segments"].iloc[0] + rank_addresses = rank_df["stack_memory_addresses"].iloc[ + 0 + ] + address = dxt_posix.iloc[0]["address_line_mapping"][ + "address" + ] res = set(list(address)) & set(rank_addresses) - backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)] - + backtrace = dxt_posix.iloc[0][ + "address_line_mapping" + ].loc[ + dxt_posix.iloc[0]["address_line_mapping"][ + "address" + ].isin(res) + ] + if len(small_write_requests_ranks) > 0: detail.append( { - 'message': '{} rank(s) made small write requests in "{}". Below is the backtrace information:'.format( + "message": '{} rank(s) made small write requests in "{}". Below is the backtrace information:'.format( len(small_write_requests_ranks), - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) - + for index, row in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row['function_name'], - row['line_number'] - ) + "message": "{}: {}".format( + row["function_name"], row["line_number"] + ) } ) - + file_count += 1 else: detail.append( { - 'message': 'The backtrace information for this file is similar to previous files' + "message": "The backtrace information for this file is similar to previous files" } ) end = time.time() time_taken = end - start dxt_trigger_time += time_taken - + if dxt_trigger_time > 0: detail.append( { - 'message': 'Time taken to process this trigger: {}s'.format(round(dxt_trigger_time, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(dxt_trigger_time, 5) + ) } ) recommendation.append( { - 'message': 'Consider buffering write operations into larger more contiguous ones' + "message": "Consider buffering write operations into larger more contiguous ones" } ) - if 'MPI-IO' in modules: + if "MPI-IO" in modules: recommendation.append( { - 'message': 'Since the application already uses MPI-IO, consider using collective I/O calls (e.g. MPI_File_write_all() or MPI_File_write_at_all()) to aggregate requests into larger ones', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-write.c'), line_numbers=True, background_color='default') + "message": "Since the application already uses MPI-IO, consider using collective I/O calls (e.g. 
MPI_File_write_all() or MPI_File_write_at_all()) to aggregate requests into larger ones", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/mpi-io-collective-write.c"), + line_numbers=True, + background_color="default", + ), } ) else: recommendation.append( { - 'message': 'Application does not use MPI-IO for operations, consider use this interface instead to harness collective operations' + "message": "Application does not use MPI-IO for operations, consider use this interface instead to harness collective operations" } ) insights_operation.append( - message(INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + message( + INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_USAGE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + detail, + ) ) -def check_misaligned(total_operations, total_mem_not_aligned, total_file_not_aligned, modules, file_map=None, df_lustre=None, dxt_posix=None, dxt_posix_read_data=None): - ''' +def check_misaligned( + total_operations, + total_mem_not_aligned, + total_file_not_aligned, + modules, + file_map=None, + df_lustre=None, + dxt_posix=None, + dxt_posix_read_data=None, +): + """ Check whether application has excessive misaligned operations Parameters: @@ -373,62 +554,80 @@ def check_misaligned(total_operations, total_mem_not_aligned, total_file_not_ali total_mem_not_aligned: number of memory requests not aligned total_file_not_aligned: number of file requests not aligned modules: all different mudules been used in the application - ''' - - if total_operations and total_mem_not_aligned / total_operations > thresholds['misaligned_requests'][0]: - thresholds['misaligned_requests'][1] = True - issue = 'Application has a high number ({:.2f}%) of misaligned memory requests'.format( + """ + + if ( + total_operations + and total_mem_not_aligned / total_operations + > thresholds["misaligned_requests"][0] + ): + thresholds["misaligned_requests"][1] = True + issue = "Application has a high number ({:.2f}%) of misaligned memory requests".format( total_mem_not_aligned / total_operations * 100.0 ) insights_metadata.append( - message(INSIGHTS_POSIX_HIGH_MISALIGNED_MEMORY_USAGE, TARGET_DEVELOPER, HIGH, issue, None) + message( + INSIGHTS_POSIX_HIGH_MISALIGNED_MEMORY_USAGE, + TARGET_DEVELOPER, + HIGH, + issue, + None, + ) ) - if total_operations and total_file_not_aligned / total_operations > thresholds['misaligned_requests'][0]: - thresholds['misaligned_requests'][1] = True - issue = 'Application issues a high number ({:.2f}%) of misaligned file requests'.format( + if ( + total_operations + and total_file_not_aligned / total_operations + > thresholds["misaligned_requests"][0] + ): + thresholds["misaligned_requests"][1] = True + issue = "Application issues a high number ({:.2f}%) of misaligned file requests".format( total_file_not_aligned / total_operations * 100.0 ) recommendation = [ { - 'message': 'Consider aligning the requests to the file system block boundaries' + "message": "Consider aligning the requests to the file system block boundaries" } ] - if 'HF5' in modules: + if "HF5" in modules: recommendation.append( { - 'message': 'Since the appplication uses HDF5, consider using H5Pset_alignment() in a file access property list', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-alignment.c'), line_numbers=True, background_color='default') + "message": "Since the appplication uses HDF5, consider using H5Pset_alignment() in a file access property list", + "sample": Syntax.from_path( + os.path.join(ROOT, 
"snippets/hdf5-alignment.c"), + line_numbers=True, + background_color="default", + ), }, { - 'message': 'Any file object greater than or equal in size to threshold bytes will be aligned on an address which is a multiple of alignment' - } + "message": "Any file object greater than or equal in size to threshold bytes will be aligned on an address which is a multiple of alignment" + }, ) detail = [] - if 'LUSTRE' in modules: + if "LUSTRE" in modules: # DXT Analysis if args.backtrace: start = time.time() - - if not df_lustre['counters']['LUSTRE_STRIPE_SIZE'].empty: - stripe_size = df_lustre['counters']['LUSTRE_STRIPE_SIZE'].iloc[0] + + if not df_lustre["counters"]["LUSTRE_STRIPE_SIZE"].empty: + stripe_size = df_lustre["counters"]["LUSTRE_STRIPE_SIZE"].iloc[0] else: - stripe_size = df_lustre['counters']['POSIX_FILE_ALIGNMENT'].iloc[0] + stripe_size = df_lustre["counters"]["POSIX_FILE_ALIGNMENT"].iloc[0] file_count = 0 ids = dxt_posix.id.unique().tolist() for id in ids: - temp = dxt_posix.loc[dxt_posix['id'] == id] - temp_df = dxt_posix_read_data.loc[dxt_posix_read_data['id'] == id] + temp = dxt_posix.loc[dxt_posix["id"] == id] + temp_df = dxt_posix_read_data.loc[dxt_posix_read_data["id"] == id] misaligned_ranks = [] misaligned_ranks_opr = [] - + offsets = temp_df["offsets"].to_numpy().tolist() rank = temp_df["rank"].to_numpy().tolist() operation = temp_df["operation"].to_numpy().tolist() @@ -441,33 +640,46 @@ def check_misaligned(total_operations, total_mem_not_aligned, total_file_not_ali if misaligned_ranks: misaligned_rank_ind = misaligned_ranks[0] misaligned_rank_opr = misaligned_ranks_opr[0] - misaligned_rank_df = temp.loc[(temp['rank'] == int(misaligned_rank_ind))] - if misaligned_rank_opr == 'read': - misaligned_rank_df = misaligned_rank_df['read_segments'].iloc[0] + misaligned_rank_df = temp.loc[ + (temp["rank"] == int(misaligned_rank_ind)) + ] + if misaligned_rank_opr == "read": + misaligned_rank_df = misaligned_rank_df[ + "read_segments" + ].iloc[0] else: - misaligned_rank_df = misaligned_rank_df['write_segments'].iloc[0] - misaligned_rank_stack_addresses = misaligned_rank_df['stack_memory_addresses'].iloc[0] - - address = dxt_posix.iloc[0]['address_line_mapping']['address'] + misaligned_rank_df = misaligned_rank_df[ + "write_segments" + ].iloc[0] + misaligned_rank_stack_addresses = misaligned_rank_df[ + "stack_memory_addresses" + ].iloc[0] + + address = dxt_posix.iloc[0]["address_line_mapping"]["address"] res = set(list(address)) & set(misaligned_rank_stack_addresses) - backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)] + backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[ + dxt_posix.iloc[0]["address_line_mapping"]["address"].isin( + res + ) + ] detail.append( { - 'message': '{} rank(s) made misaligned requests in "{}". Below is the backtrace information:'.format( + "message": '{} rank(s) made misaligned requests in "{}". 
Below is the backtrace information:'.format( len(misaligned_ranks), - file_map[id] if args.full_path else os.path.basename(file_map[id]) - ) + file_map[id] + if args.full_path + else os.path.basename(file_map[id]), + ) } ) for index, row3 in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row3['function_name'], - row3['line_number'] - ) + "message": "{}: {}".format( + row3["function_name"], row3["line_number"] + ) } ) file_count += 1 @@ -476,23 +688,43 @@ def check_misaligned(total_operations, total_mem_not_aligned, total_file_not_ali time_taken = end - start detail.append( { - 'message': 'Time taken to process this trigger: {}s'.format(round(time_taken, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(time_taken, 5) + ) } ) recommendation.append( { - 'message': 'Consider using a Lustre alignment that matches the file system stripe configuration', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') + "message": "Consider using a Lustre alignment that matches the file system stripe configuration", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/lustre-striping.bash"), + line_numbers=True, + background_color="default", + ), } ) insights_metadata.append( - message(INSIGHTS_POSIX_HIGH_MISALIGNED_FILE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) + message( + INSIGHTS_POSIX_HIGH_MISALIGNED_FILE_USAGE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + ) ) -def check_traffic(max_read_offset, total_read_size, max_write_offset, total_written_size, dxt_posix=None, dxt_posix_read_data=None, dxt_posix_write_data=None): - ''' +def check_traffic( + max_read_offset, + total_read_size, + max_write_offset, + total_written_size, + dxt_posix=None, + dxt_posix_read_data=None, + dxt_posix_write_data=None, +): + """ Check whether application has redundant read or write traffic Parameters: @@ -500,10 +732,10 @@ def check_traffic(max_read_offset, total_read_size, max_write_offset, total_writ total_read_size: total size application has been read max_write_offset: max offset application is writing to total_written_size: total size application has been written - ''' + """ if max_read_offset > total_read_size: - issue = 'Application might have redundant read traffic (more data read than the highest offset)' + issue = "Application might have redundant read traffic (more data read than the highest offset)" detail = [] file_count = 0 @@ -513,67 +745,79 @@ def check_traffic(max_read_offset, total_read_size, max_write_offset, total_writ start = time.time() ids = dxt_posix.id.unique().tolist() for id in ids: - if file_count < thresholds['backtrace'][0]: - temp = dxt_posix.loc[dxt_posix['id'] == id] + if file_count < thresholds["backtrace"][0]: + temp = dxt_posix.loc[dxt_posix["id"] == id] random_ranks_ind = -1 - temp_df = dxt_posix_read_data.loc[dxt_posix_read_data['id'] == id] + temp_df = dxt_posix_read_data.loc[dxt_posix_read_data["id"] == id] updated_offsets = (temp_df["offsets"].to_numpy()).tolist() for i in range(len(updated_offsets)): - if updated_offsets.count(updated_offsets[i]) > 1: + if updated_offsets.count(updated_offsets[i]) > 1: redundant_ranks_ind = i break if random_ranks_ind != -1: - random_rank = temp_df.iloc[redundant_ranks_ind]['rank'] - random_offsets = temp_df.iloc[redundant_ranks_ind]['offsets'] - random_start_time = temp_df.iloc[random_ranks_ind]['start_time'] - - temp_random_rank = temp.loc[(temp['rank'] == int(random_rank))] - temp_random_rank = 
temp_random_rank['read_segments'].iloc[0] - random_stack_addresses = temp_random_rank.loc[(temp_random_rank['offset'] == random_offsets) & (temp_random_rank['start_time'] == random_start_time)] - random_stack_addresses = random_stack_addresses['stack_memory_addresses'].iloc[0] - - address = dxt_posix.iloc[0]['address_line_mapping']['address'] + random_rank = temp_df.iloc[redundant_ranks_ind]["rank"] + random_offsets = temp_df.iloc[redundant_ranks_ind]["offsets"] + random_start_time = temp_df.iloc[random_ranks_ind]["start_time"] + + temp_random_rank = temp.loc[(temp["rank"] == int(random_rank))] + temp_random_rank = temp_random_rank["read_segments"].iloc[0] + random_stack_addresses = temp_random_rank.loc[ + (temp_random_rank["offset"] == random_offsets) + & (temp_random_rank["start_time"] == random_start_time) + ] + random_stack_addresses = random_stack_addresses[ + "stack_memory_addresses" + ].iloc[0] + + address = dxt_posix.iloc[0]["address_line_mapping"]["address"] res = set(list(address)) & set(random_stack_addresses) - backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)] - + backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[ + dxt_posix.iloc[0]["address_line_mapping"]["address"].isin( + res + ) + ] + detail.append( { - 'message': 'The backtrace information for these redundant read call(s) is given below:' + "message": "The backtrace information for these redundant read call(s) is given below:" } ) for index, row3 in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row3['function_name'], - row3['line_number'] - ) + "message": "{}: {}".format( + row3["function_name"], row3["line_number"] + ) } ) file_count += 1 else: detail.append( { - 'message': 'The backtrace information for this file is similar to the previous files' + "message": "The backtrace information for this file is similar to the previous files" } ) end = time.time() time_taken = end - start detail.append( { - 'message': 'Time taken to process this trigger: {}s'.format(round(time_taken, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(time_taken, 5) + ) } ) insights_metadata.append( - message(INSIGHTS_POSIX_REDUNDANT_READ_USAGE, TARGET_DEVELOPER, WARN, issue, None) + message( + INSIGHTS_POSIX_REDUNDANT_READ_USAGE, TARGET_DEVELOPER, WARN, issue, None + ) ) if max_write_offset > total_written_size: - issue = 'Application might have redundant write traffic (more data written than the highest offset)' + issue = "Application might have redundant write traffic (more data written than the highest offset)" detail = [] file_count = 0 @@ -583,70 +827,105 @@ def check_traffic(max_read_offset, total_read_size, max_write_offset, total_writ start = time.time() ids = dxt_posix.id.unique().tolist() for id in ids: - if file_count < thresholds['backtrace'][0]: - temp = dxt_posix.loc[dxt_posix['id'] == id] + if file_count < thresholds["backtrace"][0]: + temp = dxt_posix.loc[dxt_posix["id"] == id] random_ranks_ind = -1 - temp_df = dxt_posix_write_data.loc[dxt_posix_write_data['id'] == id] + temp_df = dxt_posix_write_data.loc[dxt_posix_write_data["id"] == id] updated_offsets = (temp_df["offsets"].to_numpy()).tolist() for i in range(len(updated_offsets)): - if updated_offsets.count(updated_offsets[i]) > 1: + if updated_offsets.count(updated_offsets[i]) > 1: redundant_ranks_ind = i break if random_ranks_ind != -1: - random_rank = temp_df.iloc[redundant_ranks_ind]['rank'] - random_offsets = 
temp_df.iloc[redundant_ranks_ind]['offsets'] - random_start_time = temp_df.iloc[random_ranks_ind]['start_time'] - - temp_random_rank = temp.loc[(temp['rank'] == int(random_rank))] - temp_random_rank = temp_random_rank['write_segments'].iloc[0] - random_stack_addresses = temp_random_rank.loc[(temp_random_rank['offset'] == random_offsets) & (temp_random_rank['start_time'] == random_start_time)] - random_stack_addresses = random_stack_addresses['stack_memory_addresses'].iloc[0] - - address = dxt_posix.iloc[0]['address_line_mapping']['address'] + random_rank = temp_df.iloc[redundant_ranks_ind]["rank"] + random_offsets = temp_df.iloc[redundant_ranks_ind]["offsets"] + random_start_time = temp_df.iloc[random_ranks_ind]["start_time"] + + temp_random_rank = temp.loc[(temp["rank"] == int(random_rank))] + temp_random_rank = temp_random_rank["write_segments"].iloc[0] + random_stack_addresses = temp_random_rank.loc[ + (temp_random_rank["offset"] == random_offsets) + & (temp_random_rank["start_time"] == random_start_time) + ] + random_stack_addresses = random_stack_addresses[ + "stack_memory_addresses" + ].iloc[0] + + address = dxt_posix.iloc[0]["address_line_mapping"]["address"] res = set(list(address)) & set(random_stack_addresses) - backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)] - + backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[ + dxt_posix.iloc[0]["address_line_mapping"]["address"].isin( + res + ) + ] + detail.append( { - 'message': 'The backtrace information for these redundant write call(s) is given below:' + "message": "The backtrace information for these redundant write call(s) is given below:" } ) for index, row3 in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row3['function_name'], - row3['line_number'] - ) + "message": "{}: {}".format( + row3["function_name"], row3["line_number"] + ) } ) file_count += 1 else: detail.append( { - 'message': 'The backtrace information for this file is similar to the previous files' + "message": "The backtrace information for this file is similar to the previous files" } ) end = time.time() time_taken = end - start detail.append( { - 'message': 'Time taken to process this trigger: {}s'.format(round(time_taken, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(time_taken, 5) + ) } ) insights_metadata.append( - message(INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE, TARGET_DEVELOPER, WARN, issue, None, detail) + message( + INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE, + TARGET_DEVELOPER, + WARN, + issue, + None, + detail, + ) ) insights_metadata.append( - message(INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE, TARGET_DEVELOPER, WARN, issue, None) + message( + INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE, + TARGET_DEVELOPER, + WARN, + issue, + None, + ) ) -def check_random_operation(read_consecutive, read_sequential, read_random, total_reads, write_consecutive, write_sequential, write_random, total_writes, dxt_posix=None, dxt_posix_read_data=None, dxt_posix_write_data=None): - ''' +def check_random_operation( + read_consecutive, + read_sequential, + read_random, + total_reads, + write_consecutive, + write_sequential, + write_random, + total_writes, + dxt_posix=None, + dxt_posix_read_data=None, + dxt_posix_write_data=None, +): + """ Check whether application has performed excessive random operations Parameters: @@ -658,19 +937,23 @@ def check_random_operation(read_consecutive, read_sequential, read_random, total write_sequential: number of sequential write operations 
write_random: number of random write operations total_write: number of write operations been executed by the application - ''' + """ if total_reads: - if read_random and read_random / total_reads > thresholds['random_operations'][0] and read_random > thresholds['random_operations_absolute'][0]: - thresholds['random_operations'][1] = True - thresholds['random_operations_absolute'][1] = True - issue = 'Application is issuing a high number ({}) of random read operations ({:.2f}%)'.format( + if ( + read_random + and read_random / total_reads > thresholds["random_operations"][0] + and read_random > thresholds["random_operations_absolute"][0] + ): + thresholds["random_operations"][1] = True + thresholds["random_operations_absolute"][1] = True + issue = "Application is issuing a high number ({}) of random read operations ({:.2f}%)".format( read_random, read_random / total_reads * 100.0 ) recommendation = [ { - 'message': 'Consider changing your data model to have consecutive or sequential reads' + "message": "Consider changing your data model to have consecutive or sequential reads" } ] @@ -679,11 +962,11 @@ def check_random_operation(read_consecutive, read_sequential, read_random, total start = time.time() ids = dxt_posix.id.unique().tolist() for id in ids: - temp = dxt_posix.loc[dxt_posix['id'] == id] - temp_df = dxt_posix_read_data.loc[dxt_posix_read_data['id'] == id] - temp_df = temp_df.sort_values('start_time', ascending=True) + temp = dxt_posix.loc[dxt_posix["id"] == id] + temp_df = dxt_posix_read_data.loc[dxt_posix_read_data["id"] == id] + temp_df = temp_df.sort_values("start_time", ascending=True) random_ranks_ind = -1 - + if not temp_df["offsets"].is_monotonic_increasing: updated_offsets = (temp_df["offsets"].to_numpy()).tolist() cur = 0 @@ -694,64 +977,90 @@ def check_random_operation(read_consecutive, read_sequential, read_random, total cur = updated_offsets[i] if random_ranks_ind != -1: - random_rank = temp_df.iloc[random_ranks_ind]['rank'] - random_offsets = temp_df.iloc[random_ranks_ind]['offsets'] - random_start_time = temp_df.iloc[random_ranks_ind]['start_time'] - temp_random_rank = temp.loc[(temp['rank'] == int(random_rank))] - temp_random_rank = temp_random_rank['read_segments'].iloc[0] - random_stack_addresses = temp_random_rank.loc[(temp_random_rank['offset'] == random_offsets) & (temp_random_rank['start_time'] == random_start_time)] - random_stack_addresses = random_stack_addresses['stack_memory_addresses'].iloc[0] - - address = dxt_posix.iloc[0]['address_line_mapping']['address'] + random_rank = temp_df.iloc[random_ranks_ind]["rank"] + random_offsets = temp_df.iloc[random_ranks_ind]["offsets"] + random_start_time = temp_df.iloc[random_ranks_ind]["start_time"] + temp_random_rank = temp.loc[(temp["rank"] == int(random_rank))] + temp_random_rank = temp_random_rank["read_segments"].iloc[0] + random_stack_addresses = temp_random_rank.loc[ + (temp_random_rank["offset"] == random_offsets) + & (temp_random_rank["start_time"] == random_start_time) + ] + random_stack_addresses = random_stack_addresses[ + "stack_memory_addresses" + ].iloc[0] + + address = dxt_posix.iloc[0]["address_line_mapping"]["address"] res = set(list(address)) & set(random_stack_addresses) - backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)] + backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[ + dxt_posix.iloc[0]["address_line_mapping"]["address"].isin( + res + ) + ] detail = [] detail.append( { - 'message': 'The backtrace information 
for these random read call(s) is given below:' + "message": "The backtrace information for these random read call(s) is given below:" } ) for index, row3 in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row3['function_name'], - row3['line_number'] - ) + "message": "{}: {}".format( + row3["function_name"], row3["line_number"] + ) } ) end = time.time() time_taken = end - start detail.append( { - 'message': 'Time taken to process this trigger: {}s'.format(round(time_taken, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(time_taken, 5) + ) } ) insights_operation.append( - message(INSIGHTS_POSIX_HIGH_RANDOM_READ_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) + message( + INSIGHTS_POSIX_HIGH_RANDOM_READ_USAGE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + ) ) else: - issue = 'Application mostly uses consecutive ({:.2f}%) and sequential ({:.2f}%) read requests'.format( + issue = "Application mostly uses consecutive ({:.2f}%) and sequential ({:.2f}%) read requests".format( read_consecutive / total_reads * 100.0, - read_sequential / total_reads * 100.0 + read_sequential / total_reads * 100.0, ) insights_operation.append( - message(INSIGHTS_POSIX_HIGH_SEQUENTIAL_READ_USAGE, TARGET_DEVELOPER, OK, issue, None) + message( + INSIGHTS_POSIX_HIGH_SEQUENTIAL_READ_USAGE, + TARGET_DEVELOPER, + OK, + issue, + None, + ) ) if total_writes: - if write_random and write_random / total_writes > thresholds['random_operations'][0] and write_random > thresholds['random_operations_absolute'][0]: - thresholds['random_operations'][1] = True - thresholds['random_operations_absolute'][1] = True - issue = 'Application is issuing a high number ({}) of random write operations ({:.2f}%)'.format( + if ( + write_random + and write_random / total_writes > thresholds["random_operations"][0] + and write_random > thresholds["random_operations_absolute"][0] + ): + thresholds["random_operations"][1] = True + thresholds["random_operations_absolute"][1] = True + issue = "Application is issuing a high number ({}) of random write operations ({:.2f}%)".format( write_random, write_random / total_writes * 100.0 ) recommendation = [ { - 'message': 'Consider changing your data model to have consecutive or sequential writes' + "message": "Consider changing your data model to have consecutive or sequential writes" } ] @@ -760,10 +1069,10 @@ def check_random_operation(read_consecutive, read_sequential, read_random, total start = time.time() ids = dxt_posix.id.unique().tolist() for id in ids: - temp = dxt_posix.loc[dxt_posix['id'] == id] + temp = dxt_posix.loc[dxt_posix["id"] == id] - temp_df = dxt_posix_write_data.loc[dxt_posix_write_data['id'] == id] - temp_df.sort_values('start_time', ascending=True, inplace=True) + temp_df = dxt_posix_write_data.loc[dxt_posix_write_data["id"] == id] + temp_df.sort_values("start_time", ascending=True, inplace=True) random_ranks_ind = -1 if not temp_df["offsets"].is_monotonic_increasing: updated_offsets = (temp_df["offsets"].to_numpy()).tolist() @@ -775,58 +1084,87 @@ def check_random_operation(read_consecutive, read_sequential, read_random, total cur = updated_offsets[i] if random_ranks_ind != -1: - random_rank = temp_df.iloc[random_ranks_ind]['rank'] - random_offsets = temp_df.iloc[random_ranks_ind]['offsets'] - random_start_time = temp_df.iloc[random_ranks_ind]['start_time'] - - temp_random_rank = temp.loc[(temp['rank'] == int(random_rank))] - temp_random_rank = temp_random_rank['write_segments'].iloc[0] - random_stack_addresses = 
temp_random_rank.loc[(temp_random_rank['offset'] == random_offsets) & (temp_random_rank['start_time'] == random_start_time)] - random_stack_addresses = random_stack_addresses['stack_memory_addresses'].iloc[0] - - address = dxt_posix.iloc[0]['address_line_mapping']['address'] + random_rank = temp_df.iloc[random_ranks_ind]["rank"] + random_offsets = temp_df.iloc[random_ranks_ind]["offsets"] + random_start_time = temp_df.iloc[random_ranks_ind]["start_time"] + + temp_random_rank = temp.loc[(temp["rank"] == int(random_rank))] + temp_random_rank = temp_random_rank["write_segments"].iloc[0] + random_stack_addresses = temp_random_rank.loc[ + (temp_random_rank["offset"] == random_offsets) + & (temp_random_rank["start_time"] == random_start_time) + ] + random_stack_addresses = random_stack_addresses[ + "stack_memory_addresses" + ].iloc[0] + + address = dxt_posix.iloc[0]["address_line_mapping"]["address"] res = set(list(address)) & set(random_stack_addresses) - backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)] + backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[ + dxt_posix.iloc[0]["address_line_mapping"]["address"].isin( + res + ) + ] detail = [] detail.append( { - 'message': 'The backtrace information for these random write call(s) is given below:' + "message": "The backtrace information for these random write call(s) is given below:" } ) for index, row3 in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row3['function_name'], - row3['line_number'] - ) + "message": "{}: {}".format( + row3["function_name"], row3["line_number"] + ) } ) - + end = time.time() time_taken = end - start detail.append( { - 'message': 'Time taken to process this trigger: {}s'.format(round(time_taken, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(time_taken, 5) + ) } ) insights_operation.append( - message(INSIGHTS_POSIX_HIGH_RANDOM_WRITE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation) + message( + INSIGHTS_POSIX_HIGH_RANDOM_WRITE_USAGE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + ) ) else: - issue = 'Application mostly uses consecutive ({:.2f}%) and sequential ({:.2f}%) write requests'.format( + issue = "Application mostly uses consecutive ({:.2f}%) and sequential ({:.2f}%) write requests".format( write_consecutive / total_writes * 100.0, - write_sequential / total_writes * 100.0 + write_sequential / total_writes * 100.0, ) insights_operation.append( - message(INSIGHTS_POSIX_HIGH_SEQUENTIAL_WRITE_USAGE, TARGET_DEVELOPER, OK, issue, None) + message( + INSIGHTS_POSIX_HIGH_SEQUENTIAL_WRITE_USAGE, + TARGET_DEVELOPER, + OK, + issue, + None, + ) ) -def check_shared_small_operation(total_shared_reads, total_shared_reads_small, total_shared_writes, total_shared_writes_small, shared_files, file_map): - ''' +def check_shared_small_operation( + total_shared_reads, + total_shared_reads_small, + total_shared_writes, + total_shared_writes_small, + shared_files, + file_map, +): + """ Check whether there are excessive small requests in shared files Parameters: @@ -838,113 +1176,182 @@ def check_shared_small_operation(total_shared_reads, total_shared_reads_small, t small reads an small writes in each shared file required columns: ['id', 'INSIGHTS_POSIX_SMALL_READS', 'INSIGHTS_POSIX_SMALL_WRITES'] file_map: file id and file name pairing - ''' - - if total_shared_reads and total_shared_reads_small / total_shared_reads > thresholds['small_requests'][0] and total_shared_reads_small > 
thresholds['small_requests_absolute'][0]: - thresholds['small_requests'][1] = True - thresholds['small_requests_absolute'][1] = True - issue = 'Application issues a high number ({}) of small read requests to a shared file (i.e., < 1MB) which represents {:.2f}% of all shared file read requests'.format( - total_shared_reads_small, total_shared_reads_small / total_shared_reads * 100.0 + """ + + if ( + total_shared_reads + and total_shared_reads_small / total_shared_reads + > thresholds["small_requests"][0] + and total_shared_reads_small > thresholds["small_requests_absolute"][0] + ): + thresholds["small_requests"][1] = True + thresholds["small_requests_absolute"][1] = True + issue = "Application issues a high number ({}) of small read requests to a shared file (i.e., < 1MB) which represents {:.2f}% of all shared file read requests".format( + total_shared_reads_small, + total_shared_reads_small / total_shared_reads * 100.0, ) detail = [] for index, row in shared_files.iterrows(): - if row['INSIGHTS_POSIX_SMALL_READS'] > (total_shared_reads * thresholds['small_requests'][0] / 2): + if row["INSIGHTS_POSIX_SMALL_READS"] > ( + total_shared_reads * thresholds["small_requests"][0] / 2 + ): detail.append( { - 'message': '{} ({:.2f}%) small read requests are to "{}"'.format( - row['INSIGHTS_POSIX_SMALL_READS'], - row['INSIGHTS_POSIX_SMALL_READS'] / total_shared_reads * 100.0, - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + "message": '{} ({:.2f}%) small read requests are to "{}"'.format( + row["INSIGHTS_POSIX_SMALL_READS"], + row["INSIGHTS_POSIX_SMALL_READS"] + / total_shared_reads + * 100.0, + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) recommendation = [ { - 'message': 'Consider coalesceing read requests into larger more contiguous ones using MPI-IO collective operations', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-read.c'), line_numbers=True, background_color='default') + "message": "Consider coalesceing read requests into larger more contiguous ones using MPI-IO collective operations", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/mpi-io-collective-read.c"), + line_numbers=True, + background_color="default", + ), } ] insights_operation.append( - message(INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_SHARED_FILE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + message( + INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_SHARED_FILE_USAGE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + detail, + ) ) - if total_shared_writes and total_shared_writes_small / total_shared_writes > thresholds['small_requests'][0] and total_shared_writes_small > thresholds['small_requests_absolute'][0]: - thresholds['small_requests'][1] = True - thresholds['small_requests_absolute'][1] = True - issue = 'Application issues a high number ({}) of small write requests to a shared file (i.e., < 1MB) which represents {:.2f}% of all shared file write requests'.format( - total_shared_writes_small, total_shared_writes_small / total_shared_writes * 100.0 + if ( + total_shared_writes + and total_shared_writes_small / total_shared_writes + > thresholds["small_requests"][0] + and total_shared_writes_small > thresholds["small_requests_absolute"][0] + ): + thresholds["small_requests"][1] = True + thresholds["small_requests_absolute"][1] = True + issue = "Application issues a high number ({}) of small write requests to a shared file (i.e., < 1MB) which represents {:.2f}% of 
all shared file write requests".format( + total_shared_writes_small, + total_shared_writes_small / total_shared_writes * 100.0, ) detail = [] for index, row in shared_files.iterrows(): - if row['INSIGHTS_POSIX_SMALL_WRITES'] > (total_shared_writes * thresholds['small_requests'][0] / 2): + if row["INSIGHTS_POSIX_SMALL_WRITES"] > ( + total_shared_writes * thresholds["small_requests"][0] / 2 + ): detail.append( { - 'message': '{} ({:.2f}%) small writes requests are to "{}"'.format( - row['INSIGHTS_POSIX_SMALL_WRITES'], - row['INSIGHTS_POSIX_SMALL_WRITES'] / total_shared_writes * 100.0, - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + "message": '{} ({:.2f}%) small writes requests are to "{}"'.format( + row["INSIGHTS_POSIX_SMALL_WRITES"], + row["INSIGHTS_POSIX_SMALL_WRITES"] + / total_shared_writes + * 100.0, + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) recommendation = [ { - 'message': 'Consider coalescing write requests into larger more contiguous ones using MPI-IO collective operations', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-write.c'), line_numbers=True, background_color='default') + "message": "Consider coalescing write requests into larger more contiguous ones using MPI-IO collective operations", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/mpi-io-collective-write.c"), + line_numbers=True, + background_color="default", + ), } ] insights_operation.append( - message(INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_SHARED_FILE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + message( + INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_SHARED_FILE_USAGE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + detail, + ) ) def check_long_metadata(count_long_metadata, modules): - ''' + """ Check how many ranks have metadata operations taking too long Parameters: count_long_metadata: number of ranks that have metadata operations taking too long modules: all different mudules been used in the application - ''' + """ if count_long_metadata > 0: - thresholds['metadata_time_rank'][1] = True - issue = 'There are {} ranks where metadata operations take over {} seconds'.format( - count_long_metadata, thresholds['metadata_time_rank'][0] + thresholds["metadata_time_rank"][1] = True + issue = ( + "There are {} ranks where metadata operations take over {} seconds".format( + count_long_metadata, thresholds["metadata_time_rank"][0] + ) ) recommendation = [ { - 'message': 'Attempt to combine files, reduce, or cache metadata operations' + "message": "Attempt to combine files, reduce, or cache metadata operations" } ] - if 'HF5' in modules: + if "HF5" in modules: recommendation.append( { - 'message': 'Since your appplication uses HDF5, try enabling collective metadata calls with H5Pset_coll_metadata_write() and H5Pset_all_coll_metadata_ops()', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-collective-metadata.c'), line_numbers=True, background_color='default') + "message": "Since your appplication uses HDF5, try enabling collective metadata calls with H5Pset_coll_metadata_write() and H5Pset_all_coll_metadata_ops()", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/hdf5-collective-metadata.c"), + line_numbers=True, + background_color="default", + ), }, { - 'message': 'Since your appplication uses HDF5, try using metadata cache to defer metadata operations', - 'sample': Syntax.from_path(os.path.join(ROOT, 
'snippets/hdf5-cache.c'), line_numbers=True, background_color='default') - } + "message": "Since your application uses HDF5, try using metadata cache to defer metadata operations", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/hdf5-cache.c"), + line_numbers=True, + background_color="default", + ), + }, ) insights_metadata.append( - message(INSIGHTS_POSIX_HIGH_METADATA_TIME, TARGET_DEVELOPER, HIGH, issue, recommendation) + message( + INSIGHTS_POSIX_HIGH_METADATA_TIME, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + ) ) -def check_shared_data_imblance(stragglers_count, detected_files, file_map, dxt_posix=None, dxt_posix_read_data=None, dxt_posix_write_data=None): - ''' +def check_shared_data_imblance( + stragglers_count, + detected_files, + file_map, + dxt_posix=None, + dxt_posix_read_data=None, + dxt_posix_write_data=None, +): + """ Check how many shared files containing data transfer imbalance Parameters: @@ -953,11 +1360,11 @@ def check_shared_data_imblance(stragglers_count, detected_files, file_map, dxt_p data imbalance per file required columns: ['id', 'data_imbalance'] file_map: file id and file name pairing - ''' + """ if stragglers_count: - thresholds['imbalance_stragglers'][1] = True - issue = 'Detected data transfer imbalance caused by stragglers when accessing {} shared file.'.format( + thresholds["imbalance_stragglers"][1] = True + issue = "Detected data transfer imbalance caused by stragglers when accessing {} shared file.".format( stragglers_count ) @@ -968,52 +1375,73 @@ def check_shared_data_imblance(stragglers_count, detected_files, file_map, dxt_p for index, row in detected_files.iterrows(): detail.append( { - 'message': 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( - row['data_imbalance'], - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + "message": 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( + row["data_imbalance"], + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) # DXT Analysis if args.backtrace: start = time.time() - if file_count < thresholds['backtrace'][0]: - temp = dxt_posix.loc[dxt_posix['id'] == int(row['id'])] - temp_df_1 = dxt_posix_write_data.loc[dxt_posix_write_data['id'] == int(row['id'])] - temp_df_2 = dxt_posix_read_data.loc[dxt_posix_read_data['id'] == int(row['id'])] - - df_merged = pd.concat([temp_df_1, temp_df_2], ignore_index=True, sort=False) - df_merged['duration'] = df_merged['end_time'] - df_merged['start_time'] - df_merged.sort_values('duration', ascending=True, inplace=True) + if file_count < thresholds["backtrace"][0]: + temp = dxt_posix.loc[dxt_posix["id"] == int(row["id"])] + temp_df_1 = dxt_posix_write_data.loc[ + dxt_posix_write_data["id"] == int(row["id"]) + ] + temp_df_2 = dxt_posix_read_data.loc[ + dxt_posix_read_data["id"] == int(row["id"]) + ] + + df_merged = pd.concat( + [temp_df_1, temp_df_2], ignore_index=True, sort=False + ) + df_merged["duration"] = ( + df_merged["end_time"] - df_merged["start_time"] + ) + df_merged.sort_values("duration", ascending=True, inplace=True) df_merged = df_merged.iloc[0] - rank_df = temp.loc[(temp['rank'] == int(df_merged['rank']))] - - if df_merged['operation'] == 'write': - rank_df = rank_df['write_segments'].iloc[0] - stack_memory_addresses = rank_df['stack_memory_addresses'].iloc[0] - address = dxt_posix.iloc[0]['address_line_mapping']['address'] + rank_df = temp.loc[(temp["rank"] == int(df_merged["rank"]))] + + if 
df_merged["operation"] == "write": + rank_df = rank_df["write_segments"].iloc[0] + stack_memory_addresses = rank_df["stack_memory_addresses"].iloc[ + 0 + ] + address = dxt_posix.iloc[0]["address_line_mapping"]["address"] res = set(list(address)) & set(stack_memory_addresses) - backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)] + backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[ + dxt_posix.iloc[0]["address_line_mapping"]["address"].isin( + res + ) + ] else: - rank_df = rank_df['read_segments'].iloc[0] - stack_memory_addresses = rank_df['stack_memory_addresses'].iloc[0] - address = dxt_posix.iloc[0]['address_line_mapping']['address'] + rank_df = rank_df["read_segments"].iloc[0] + stack_memory_addresses = rank_df["stack_memory_addresses"].iloc[ + 0 + ] + address = dxt_posix.iloc[0]["address_line_mapping"]["address"] res = set(list(address)) & set(stack_memory_addresses) - backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)] + backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[ + dxt_posix.iloc[0]["address_line_mapping"]["address"].isin( + res + ) + ] detail.append( { - 'message': 'The backtrace information for these imbalanced call(s) is given below:' + "message": "The backtrace information for these imbalanced call(s) is given below:" } ) for index, row3 in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row3['function_name'], - row3['line_number'] - ) + "message": "{}: {}".format( + row3["function_name"], row3["line_number"] + ) } ) @@ -1021,69 +1449,94 @@ def check_shared_data_imblance(stragglers_count, detected_files, file_map, dxt_p else: detail.append( { - 'message': 'The backtrace information for this file is similar to the previous files' + "message": "The backtrace information for this file is similar to the previous files" } ) - + end = time.time() time_taken = end - start dxt_trigger_time += time_taken - - if dxt_trigger_time > 0: + + if dxt_trigger_time > 0: detail.append( { - 'message': 'Time taken to process this trigger: {}s'.format(round(dxt_trigger_time, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(dxt_trigger_time, 5) + ) } ) recommendation = [ { - 'message': 'Consider better balancing the data transfer between the application ranks' + "message": "Consider better balancing the data transfer between the application ranks" }, { - 'message': 'Consider tuning how your data is distributed in the file system by changing the stripe size and count', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') - } + "message": "Consider tuning how your data is distributed in the file system by changing the stripe size and count", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/lustre-striping.bash"), + line_numbers=True, + background_color="default", + ), + }, ] insights_operation.append( - message(INSIGHTS_POSIX_SIZE_IMBALANCE, TARGET_USER, HIGH, issue, recommendation, detail) + message( + INSIGHTS_POSIX_SIZE_IMBALANCE, + TARGET_USER, + HIGH, + issue, + recommendation, + detail, + ) ) -def check_shared_data_imblance_split(slowest_rank_bytes, fastest_rank_bytes, total_transfer_size): - ''' +def check_shared_data_imblance_split( + slowest_rank_bytes, fastest_rank_bytes, total_transfer_size +): + """ Check whether the specific shared file contains data imbalance Parameters: slowest_rank_bytes: the 
total request size of the rank that takes the longest data operation time fastest_rank_bytes: the total request size of the rank that takes the shortest data operation time total_transfer_size: total request size of that specific shared file - ''' - - if total_transfer_size and abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size > thresholds['imbalance_stragglers'][0]: - thresholds['imbalance_stragglers'][1] = True - issue = 'Load imbalance of {:.2f}% detected'.format( + """ + + if ( + total_transfer_size + and abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size + > thresholds["imbalance_stragglers"][0] + ): + thresholds["imbalance_stragglers"][1] = True + issue = "Load imbalance of {:.2f}% detected".format( abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size * 100 ) recommendation = [ { - 'message': 'Consider better balancing the data transfer between the application ranks' + "message": "Consider better balancing the data transfer between the application ranks" }, { - 'message': 'Consider tuning how your data is distributed in the file system by changing the stripe size and count', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') - } + "message": "Consider tuning how your data is distributed in the file system by changing the stripe size and count", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/lustre-striping.bash"), + line_numbers=True, + background_color="default", + ), + }, ] insights_operation.append( - message(INSIGHTS_POSIX_SIZE_IMBALANCE, TARGET_USER, HIGH, issue, recommendation) + message( + INSIGHTS_POSIX_SIZE_IMBALANCE, TARGET_USER, HIGH, issue, recommendation + ) ) def check_shared_time_imbalance(stragglers_count, detected_files, file_map): - ''' + """ Check how many shared files containing time transfer imbalance Parameters: @@ -1092,74 +1545,101 @@ def check_shared_time_imbalance(stragglers_count, detected_files, file_map): data imbalance per file required columns: ['id', 'time_imbalance'] file_map: file id and file name pairing - ''' + """ if stragglers_count: - thresholds['imbalance_stragglers'][1] = True - issue = 'Detected time imbalance caused by stragglers when accessing {} shared file.'.format( + thresholds["imbalance_stragglers"][1] = True + issue = "Detected time imbalance caused by stragglers when accessing {} shared file.".format( stragglers_count ) detail = [] - + for index, row in detected_files.iterrows(): detail.append( { - 'message': 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( - row['time_imbalance'], - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + "message": 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( + row["time_imbalance"], + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) recommendation = [ { - 'message': 'Consider better distributing the data in the parallel file system' # needs to review what suggestion to give + "message": "Consider better distributing the data in the parallel file system" # needs to review what suggestion to give }, { - 'message': 'Consider tuning how your data is distributed in the file system by changing the stripe size and count', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') - } + "message": "Consider tuning how your data is distributed in the file system by 
changing the stripe size and count", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/lustre-striping.bash"), + line_numbers=True, + background_color="default", + ), + }, ] insights_operation.append( - message(INSIGHTS_POSIX_TIME_IMBALANCE, TARGET_USER, HIGH, issue, recommendation, detail) + message( + INSIGHTS_POSIX_TIME_IMBALANCE, + TARGET_USER, + HIGH, + issue, + recommendation, + detail, + ) ) -def check_shared_time_imbalance_split(slowest_rank_time, fastest_rank_time, total_transfer_time): - ''' +def check_shared_time_imbalance_split( + slowest_rank_time, fastest_rank_time, total_transfer_time +): + """ Check whether the specific shared file contains time imbalance Parameters: slowest_rank_bytes: the total request time of the rank that takes the longest data operation time fastest_rank_bytes: the total request time of the rank that takes the shortest data operation time total_transfer_size: total request time of that specific shared file - ''' - - if total_transfer_time and abs(slowest_rank_time - fastest_rank_time) / total_transfer_time > thresholds['imbalance_stragglers'][0]: - thresholds['imbalance_stragglers'][1] = True - issue = 'Load imbalance of {:.2f}% detected'.format( + """ + + if ( + total_transfer_time + and abs(slowest_rank_time - fastest_rank_time) / total_transfer_time + > thresholds["imbalance_stragglers"][0] + ): + thresholds["imbalance_stragglers"][1] = True + issue = "Load imbalance of {:.2f}% detected".format( abs(slowest_rank_time - fastest_rank_time) / total_transfer_time * 100 ) recommendation = [ { - 'message': 'Consider better distributing the data in the parallel file system' # needs to review what suggestion to give + "message": "Consider better distributing the data in the parallel file system" # needs to review what suggestion to give }, { - 'message': 'Consider tuning how your data is distributed in the file system by changing the stripe size and count', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') - } + "message": "Consider tuning how your data is distributed in the file system by changing the stripe size and count", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/lustre-striping.bash"), + line_numbers=True, + background_color="default", + ), + }, ] insights_operation.append( - message(INSIGHTS_POSIX_TIME_IMBALANCE, TARGET_USER, HIGH, issue, recommendation) + message( + INSIGHTS_POSIX_TIME_IMBALANCE, TARGET_USER, HIGH, issue, recommendation + ) ) -def check_individual_write_imbalance(imbalance_count, detected_files, file_map, dxt_posix=None, dxt_posix_write_data=None): - ''' +def check_individual_write_imbalance( + imbalance_count, detected_files, file_map, dxt_posix=None, dxt_posix_write_data=None +): + """ Check how many write imbalance when accessing individual files Parameters: @@ -1167,57 +1647,62 @@ def check_individual_write_imbalance(imbalance_count, detected_files, file_map, detected_files: write imbalance per file required columns: ['id', 'write_imbalance'] - ''' + """ if imbalance_count: - thresholds['imbalance_size'][1] = True - issue = 'Detected write imbalance when accessing {} individual files'.format( + thresholds["imbalance_size"][1] = True + issue = "Detected write imbalance when accessing {} individual files".format( imbalance_count ) detail = [] file_count = 0 dxt_trigger_time = 0 - + for index, row in detected_files.iterrows(): detail.append( { - 'message': 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( - 
row['write_imbalance'], - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + "message": 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( + row["write_imbalance"], + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) # DXT Analysis if args.backtrace: start = time.time() - if file_count < thresholds['backtrace'][0]: - temp = dxt_posix.loc[dxt_posix['id'] == int(row['id'])] - temp_df = dxt_posix_write_data.loc[dxt_posix_write_data['id'] == int(row['id'])] - - maxClm = temp_df['length'].max() - temp_df = temp_df.loc[(temp_df['length'] == maxClm)] - rank_df = temp.loc[(temp['rank'] == int(temp_df['rank'].iloc[0]))] - - rank_df = rank_df['write_segments'].iloc[0] - stack_memory_addresses = rank_df['stack_memory_addresses'].iloc[0] - address = dxt_posix.iloc[0]['address_line_mapping']['address'] + if file_count < thresholds["backtrace"][0]: + temp = dxt_posix.loc[dxt_posix["id"] == int(row["id"])] + temp_df = dxt_posix_write_data.loc[ + dxt_posix_write_data["id"] == int(row["id"]) + ] + + maxClm = temp_df["length"].max() + temp_df = temp_df.loc[(temp_df["length"] == maxClm)] + rank_df = temp.loc[(temp["rank"] == int(temp_df["rank"].iloc[0]))] + + rank_df = rank_df["write_segments"].iloc[0] + stack_memory_addresses = rank_df["stack_memory_addresses"].iloc[0] + address = dxt_posix.iloc[0]["address_line_mapping"]["address"] res = set(list(address)) & set(stack_memory_addresses) - backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)] + backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[ + dxt_posix.iloc[0]["address_line_mapping"]["address"].isin(res) + ] detail.append( { - 'message': 'The backtrace information for these imbalanced write call(s) is given below:' + "message": "The backtrace information for these imbalanced write call(s) is given below:" } ) for index, row3 in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row3['function_name'], - row3['line_number'] - ) + "message": "{}: {}".format( + row3["function_name"], row3["line_number"] + ) } ) @@ -1225,82 +1710,119 @@ def check_individual_write_imbalance(imbalance_count, detected_files, file_map, else: detail.append( { - 'message': 'The backtrace information for this file is similar to the previous files' + "message": "The backtrace information for this file is similar to the previous files" } - ) - + ) + end = time.time() time_taken = end - start - dxt_trigger_time += time_taken - - if dxt_trigger_time > 0: + dxt_trigger_time += time_taken + + if dxt_trigger_time > 0: detail.append( { - 'message': 'Time taken to process this trigger: {}s'.format(round(dxt_trigger_time, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(dxt_trigger_time, 5) + ) } ) recommendation = [ { - 'message': 'Consider better balancing the data transfer between the application ranks' + "message": "Consider better balancing the data transfer between the application ranks" }, { - 'message': 'Consider tuning the stripe size and count to better distribute the data', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') + "message": "Consider tuning the stripe size and count to better distribute the data", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/lustre-striping.bash"), + line_numbers=True, + background_color="default", + ), }, { - 'message': 'If 
the application uses netCDF and HDF5 double-check the need to set NO_FILL values', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/pnetcdf-hdf5-no-fill.c'), line_numbers=True, background_color='default') + "message": "If the application uses netCDF and HDF5 double-check the need to set NO_FILL values", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/pnetcdf-hdf5-no-fill.c"), + line_numbers=True, + background_color="default", + ), }, { - 'message': 'If rank 0 is the only one opening the file, consider using MPI-IO collectives' - } + "message": "If rank 0 is the only one opening the file, consider using MPI-IO collectives" + }, ] insights_operation.append( - message(INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + message( + INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + detail, + ) ) def check_individual_write_imbalance_split(max_bytes_written, min_bytes_written): - ''' + """ Check whether there is write imbalance in the specific individual file Parameters: max_bytes_written: max byte written in the file min_bytes_written: minimum byte written in the file - ''' - - if max_bytes_written and abs(max_bytes_written - min_bytes_written) / max_bytes_written > thresholds['imbalance_size'][0]: - thresholds['imbalance_size'][1] = True - issue = 'Load imbalance of {:.2f}% detected'.format( - abs(max_bytes_written - min_bytes_written) / max_bytes_written * 100 + """ + + if ( + max_bytes_written + and abs(max_bytes_written - min_bytes_written) / max_bytes_written + > thresholds["imbalance_size"][0] + ): + thresholds["imbalance_size"][1] = True + issue = "Load imbalance of {:.2f}% detected".format( + abs(max_bytes_written - min_bytes_written) / max_bytes_written * 100 ) recommendation = [ { - 'message': 'Consider better balancing the data transfer between the application ranks' + "message": "Consider better balancing the data transfer between the application ranks" }, { - 'message': 'Consider tuning the stripe size and count to better distribute the data', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') + "message": "Consider tuning the stripe size and count to better distribute the data", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/lustre-striping.bash"), + line_numbers=True, + background_color="default", + ), }, { - 'message': 'If the application uses netCDF and HDF5 double-check the need to set NO_FILL values', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/pnetcdf-hdf5-no-fill.c'), line_numbers=True, background_color='default') + "message": "If the application uses netCDF and HDF5 double-check the need to set NO_FILL values", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/pnetcdf-hdf5-no-fill.c"), + line_numbers=True, + background_color="default", + ), }, { - 'message': 'If rank 0 is the only one opening the file, consider using MPI-IO collectives' - } + "message": "If rank 0 is the only one opening the file, consider using MPI-IO collectives" + }, ] insights_operation.append( - message(INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation) + message( + INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + ) ) -def check_individual_read_imbalance(imbalance_count, detected_files, file_map, dxt_posix=None, dxt_posix_read_data=None): - ''' +def 
check_individual_read_imbalance( + imbalance_count, detected_files, file_map, dxt_posix=None, dxt_posix_read_data=None +): + """ Check how many read imbalance when accessing individual files Parameters: @@ -1308,57 +1830,62 @@ def check_individual_read_imbalance(imbalance_count, detected_files, file_map, d detected_files: read imbalance per file required columns: ['id', 'read_imbalance'] - ''' + """ if imbalance_count: - thresholds['imbalance_size'][1] = True - issue = 'Detected read imbalance when accessing {} individual files.'.format( + thresholds["imbalance_size"][1] = True + issue = "Detected read imbalance when accessing {} individual files.".format( imbalance_count ) detail = [] file_count = 0 dxt_trigger_time = 0 - + for index, row in detected_files.iterrows(): detail.append( { - 'message': 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( - row['read_imbalance'], - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + "message": 'Load imbalance of {:.2f}% detected while accessing "{}"'.format( + row["read_imbalance"], + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) # DXT Analysis if args.backtrace: start = time.time() - if file_count < thresholds['backtrace'][0]: - temp = dxt_posix.loc[dxt_posix['id'] == int(row['id'])] - temp_df = dxt_posix_read_data.loc[dxt_posix_read_data['id'] == int(row['id'])] - - maxClm = temp_df['length'].max() - temp_df = temp_df.loc[(temp_df['length'] == maxClm)] - rank_df = temp.loc[(temp['rank'] == int(temp_df['rank'].iloc[0]))] - - rank_df = rank_df['read_segments'].iloc[0] - stack_memory_addresses = rank_df['stack_memory_addresses'].iloc[0] - address = dxt_posix.iloc[0]['address_line_mapping']['address'] + if file_count < thresholds["backtrace"][0]: + temp = dxt_posix.loc[dxt_posix["id"] == int(row["id"])] + temp_df = dxt_posix_read_data.loc[ + dxt_posix_read_data["id"] == int(row["id"]) + ] + + maxClm = temp_df["length"].max() + temp_df = temp_df.loc[(temp_df["length"] == maxClm)] + rank_df = temp.loc[(temp["rank"] == int(temp_df["rank"].iloc[0]))] + + rank_df = rank_df["read_segments"].iloc[0] + stack_memory_addresses = rank_df["stack_memory_addresses"].iloc[0] + address = dxt_posix.iloc[0]["address_line_mapping"]["address"] res = set(list(address)) & set(stack_memory_addresses) - backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)] + backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[ + dxt_posix.iloc[0]["address_line_mapping"]["address"].isin(res) + ] detail.append( { - 'message': 'The backtrace information for these imbalanced read call(s) is given below:' + "message": "The backtrace information for these imbalanced read call(s) is given below:" } ) for index, row3 in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row3['function_name'], - row3['line_number'] - ) + "message": "{}: {}".format( + row3["function_name"], row3["line_number"] + ) } ) @@ -1366,84 +1893,126 @@ def check_individual_read_imbalance(imbalance_count, detected_files, file_map, d else: detail.append( { - 'message': 'The backtrace information for this file is similar to the previous files' + "message": "The backtrace information for this file is similar to the previous files" } ) end = time.time() time_taken = end - start dxt_trigger_time += time_taken - if dxt_trigger_time > 0: + if dxt_trigger_time > 0: detail.append( { - 'message': 'Time taken to process this 
trigger: {}s'.format(round(dxt_trigger_time, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(dxt_trigger_time, 5) + ) } ) recommendation = [ { - 'message': 'Consider better balancing the data transfer between the application ranks' + "message": "Consider better balancing the data transfer between the application ranks" }, { - 'message': 'Consider tuning the stripe size and count to better distribute the data', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') + "message": "Consider tuning the stripe size and count to better distribute the data", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/lustre-striping.bash"), + line_numbers=True, + background_color="default", + ), }, { - 'message': 'If the application uses netCDF and HDF5 double-check the need to set NO_FILL values', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/pnetcdf-hdf5-no-fill.c'), line_numbers=True, background_color='default') + "message": "If the application uses netCDF and HDF5 double-check the need to set NO_FILL values", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/pnetcdf-hdf5-no-fill.c"), + line_numbers=True, + background_color="default", + ), }, { - 'message': 'If rank 0 is the only one opening the file, consider using MPI-IO collectives' - } + "message": "If rank 0 is the only one opening the file, consider using MPI-IO collectives" + }, ] insights_operation.append( - message(INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + message( + INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + detail, + ) ) def check_individual_read_imbalance_split(max_bytes_read, min_bytes_read): - ''' + """ Check whether there is read imbalance in the specific individual file Parameters: max_bytes_written: max byte read in the file min_bytes_written: minimum byte read in the file - ''' - - if max_bytes_read and abs(max_bytes_read - min_bytes_read) / max_bytes_read > thresholds['imbalance_size'][0]: - thresholds['imbalance_size'][1] = True - issue = 'Load imbalance of {:.2f}% detected'.format( - abs(max_bytes_read - min_bytes_read) / max_bytes_read * 100 + """ + + if ( + max_bytes_read + and abs(max_bytes_read - min_bytes_read) / max_bytes_read + > thresholds["imbalance_size"][0] + ): + thresholds["imbalance_size"][1] = True + issue = "Load imbalance of {:.2f}% detected".format( + abs(max_bytes_read - min_bytes_read) / max_bytes_read * 100 ) recommendation = [ { - 'message': 'Consider better balancing the data transfer between the application ranks' + "message": "Consider better balancing the data transfer between the application ranks" }, { - 'message': 'Consider tuning the stripe size and count to better distribute the data', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default') + "message": "Consider tuning the stripe size and count to better distribute the data", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/lustre-striping.bash"), + line_numbers=True, + background_color="default", + ), }, { - 'message': 'If the application uses netCDF and HDF5 double-check the need to set NO_FILL values', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/pnetcdf-hdf5-no-fill.c'), line_numbers=True, background_color='default') + "message": "If the application uses netCDF and HDF5 double-check the need to set 
NO_FILL values", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/pnetcdf-hdf5-no-fill.c"), + line_numbers=True, + background_color="default", + ), }, { - 'message': 'If rank 0 is the only one opening the file, consider using MPI-IO collectives' - } + "message": "If rank 0 is the only one opening the file, consider using MPI-IO collectives" + }, ] insights_operation.append( - message(INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation) + message( + INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + ) ) # MPIIO level check -def check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, total_mpiio_read_operations, detected_files, file_map, dxt_mpiio=None): - ''' +def check_mpi_collective_read_operation( + mpiio_coll_reads, + mpiio_indep_reads, + total_mpiio_read_operations, + detected_files, + file_map, + dxt_mpiio=None, +): + """ Check whether application uses collective mpi read calls Parameters: @@ -1454,14 +2023,17 @@ def check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, tot independent read operations and percentage per file required columns: ['id', 'absolute_indep_reads', 'percent_indep_reads'] file_map: file id and file name pairing - ''' + """ if mpiio_coll_reads == 0: - if total_mpiio_read_operations and total_mpiio_read_operations > thresholds['collective_operations_absolute'][0]: - thresholds['collective_operations_absolute'][1] = True - issue = 'Application uses MPI-IO but it does not use collective read operations, instead it issues {} ({:.2f}%) independent read calls'.format( - mpiio_indep_reads, - mpiio_indep_reads / total_mpiio_read_operations * 100 + if ( + total_mpiio_read_operations + and total_mpiio_read_operations + > thresholds["collective_operations_absolute"][0] + ): + thresholds["collective_operations_absolute"][1] = True + issue = "Application uses MPI-IO but it does not use collective read operations, instead it issues {} ({:.2f}%) independent read calls".format( + mpiio_indep_reads, mpiio_indep_reads / total_mpiio_read_operations * 100 ) detail = [] @@ -1471,63 +2043,80 @@ def check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, tot for index, row in detected_files.iterrows(): detail.append( { - 'message': '{} ({}%) of independent reads to "{}"'.format( - row['absolute_indep_reads'], - row['percent_indep_reads'], - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + "message": '{} ({}%) of independent reads to "{}"'.format( + row["absolute_indep_reads"], + row["percent_indep_reads"], + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) # DXT Analysis if args.backtrace: start = time.time() - temp = dxt_mpiio.loc[(dxt_mpiio['id'] == int(row['id'])) & (dxt_mpiio['rank'] == 1)] - temp = temp['read_segments'].iloc[0] - stack_memory_addresses = temp['stack_memory_addresses'].iloc[0] - address = dxt_mpiio.iloc[0]['address_line_mapping']['address'] + temp = dxt_mpiio.loc[ + (dxt_mpiio["id"] == int(row["id"])) & (dxt_mpiio["rank"] == 1) + ] + temp = temp["read_segments"].iloc[0] + stack_memory_addresses = temp["stack_memory_addresses"].iloc[0] + address = dxt_mpiio.iloc[0]["address_line_mapping"]["address"] res = set(list(address)) & set(stack_memory_addresses) - backtrace = dxt_mpiio.iloc[0]['address_line_mapping'].loc[dxt_mpiio.iloc[0]['address_line_mapping']['address'].isin(res)] + backtrace = 
dxt_mpiio.iloc[0]["address_line_mapping"].loc[ + dxt_mpiio.iloc[0]["address_line_mapping"]["address"].isin(res) + ] detail.append( { - 'message': 'The backtrace information for these read call(s) is given below:' + "message": "The backtrace information for these read call(s) is given below:" } ) for index, row3 in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row3['function_name'], - row3['line_number'] - ) + "message": "{}: {}".format( + row3["function_name"], row3["line_number"] + ) } ) - + end = time.time() time_taken = end - start dxt_trigger_time += time_taken - - if dxt_trigger_time > 0: + + if dxt_trigger_time > 0: detail.append( { - 'message': 'Time taken to process this trigger: {}s'.format(round(dxt_trigger_time, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(dxt_trigger_time, 5) + ) } ) recommendation = [ { - 'message': 'Use collective read operations (e.g. MPI_File_read_all() or MPI_File_read_at_all()) and set one aggregator per compute node', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-read.c'), line_numbers=True, background_color='default') + "message": "Use collective read operations (e.g. MPI_File_read_all() or MPI_File_read_at_all()) and set one aggregator per compute node", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/mpi-io-collective-read.c"), + line_numbers=True, + background_color="default", + ), } ] insights_operation.append( - message(INSIGHTS_MPI_IO_NO_COLLECTIVE_READ_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + message( + INSIGHTS_MPI_IO_NO_COLLECTIVE_READ_USAGE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + detail, + ) ) else: - issue = 'Application uses MPI-IO and read data using {} ({:.2f}%) collective operations'.format( - mpiio_coll_reads, - mpiio_coll_reads / total_mpiio_read_operations * 100 + issue = "Application uses MPI-IO and read data using {} ({:.2f}%) collective operations".format( + mpiio_coll_reads, mpiio_coll_reads / total_mpiio_read_operations * 100 ) insights_operation.append( @@ -1535,8 +2124,15 @@ def check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, tot ) -def check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, total_mpiio_write_operations, detected_files, file_map, dxt_mpiio=None): - ''' +def check_mpi_collective_write_operation( + mpiio_coll_writes, + mpiio_indep_writes, + total_mpiio_write_operations, + detected_files, + file_map, + dxt_mpiio=None, +): + """ Check whether application uses collective mpi write calls Parameters: @@ -1547,14 +2143,18 @@ def check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, independent write operations and percentage per file required columns: ['id', 'absolute_indep_writes', 'percent_indep_writes'] file_map: file id and file name pairing - ''' + """ if mpiio_coll_writes == 0: - if total_mpiio_write_operations and total_mpiio_write_operations > thresholds['collective_operations_absolute'][0]: - thresholds['collective_operations_absolute'][1] = True - issue = 'Application uses MPI-IO but it does not use collective write operations, instead it issues {} ({:.2f}%) independent write calls'.format( + if ( + total_mpiio_write_operations + and total_mpiio_write_operations + > thresholds["collective_operations_absolute"][0] + ): + thresholds["collective_operations_absolute"][1] = True + issue = "Application uses MPI-IO but it does not use collective write operations, instead it issues {} ({:.2f}%) independent write 
calls".format( mpiio_indep_writes, - mpiio_indep_writes / total_mpiio_write_operations * 100 + mpiio_indep_writes / total_mpiio_write_operations * 100, ) detail = [] @@ -1564,62 +2164,79 @@ def check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, for index, row in detected_files.iterrows(): detail.append( { - 'message': '{} ({}%) independent writes to "{}"'.format( - row['absolute_indep_writes'], - row['percent_indep_writes'], - file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])]) - ) + "message": '{} ({}%) independent writes to "{}"'.format( + row["absolute_indep_writes"], + row["percent_indep_writes"], + file_map[int(row["id"])] + if args.full_path + else os.path.basename(file_map[int(row["id"])]), + ) } ) # DXT Analysis if args.backtrace: start = time.time() - temp = dxt_mpiio.loc[(dxt_mpiio['id'] == int(row['id'])) & (dxt_mpiio['rank'] == 1)] - temp = temp['write_segments'].iloc[0] - stack_memory_addresses = temp['stack_memory_addresses'].iloc[0] - address = dxt_mpiio.iloc[0]['address_line_mapping']['address'] + temp = dxt_mpiio.loc[ + (dxt_mpiio["id"] == int(row["id"])) & (dxt_mpiio["rank"] == 1) + ] + temp = temp["write_segments"].iloc[0] + stack_memory_addresses = temp["stack_memory_addresses"].iloc[0] + address = dxt_mpiio.iloc[0]["address_line_mapping"]["address"] res = set(list(address)) & set(stack_memory_addresses) - backtrace = dxt_mpiio.iloc[0]['address_line_mapping'].loc[dxt_mpiio.iloc[0]['address_line_mapping']['address'].isin(res)] + backtrace = dxt_mpiio.iloc[0]["address_line_mapping"].loc[ + dxt_mpiio.iloc[0]["address_line_mapping"]["address"].isin(res) + ] detail.append( { - 'message': 'The backtrace information for these write call(s) is given below:' + "message": "The backtrace information for these write call(s) is given below:" } ) for index, row3 in backtrace.iterrows(): detail.append( { - 'message': '{}: {}'.format( - row3['function_name'], - row3['line_number'] - ) + "message": "{}: {}".format( + row3["function_name"], row3["line_number"] + ) } ) end = time.time() time_taken = end - start dxt_trigger_time += time_taken - + if dxt_trigger_time > 0: detail.append( { - 'message': 'Time taken to process this trigger: {}s'.format(round(dxt_trigger_time, 5)) + "message": "Time taken to process this trigger: {}s".format( + round(dxt_trigger_time, 5) + ) } ) recommendation = [ { - 'message': 'Use collective write operations (e.g. MPI_File_write_all() or MPI_File_write_at_all()) and set one aggregator per compute node', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-write.c'), line_numbers=True, background_color='default') + "message": "Use collective write operations (e.g. 
MPI_File_write_all() or MPI_File_write_at_all()) and set one aggregator per compute node", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/mpi-io-collective-write.c"), + line_numbers=True, + background_color="default", + ), } ] insights_operation.append( - message(INSIGHTS_MPI_IO_NO_COLLECTIVE_WRITE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail) + message( + INSIGHTS_MPI_IO_NO_COLLECTIVE_WRITE_USAGE, + TARGET_DEVELOPER, + HIGH, + issue, + recommendation, + detail, + ) ) else: - issue = 'Application uses MPI-IO and write data using {} ({:.2f}%) collective operations'.format( - mpiio_coll_writes, - mpiio_coll_writes / total_mpiio_write_operations * 100 + issue = "Application uses MPI-IO and write data using {} ({:.2f}%) collective operations".format( + mpiio_coll_writes, mpiio_coll_writes / total_mpiio_write_operations * 100 ) insights_operation.append( @@ -1627,8 +2244,10 @@ def check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, ) -def check_mpi_none_block_operation(mpiio_nb_reads, mpiio_nb_writes, has_hdf5_extension, modules): - ''' +def check_mpi_none_block_operation( + mpiio_nb_reads, mpiio_nb_writes, has_hdf5_extension, modules +): + """ Check whether application can benefit from non-blocking requests Parameters: @@ -1636,93 +2255,131 @@ def check_mpi_none_block_operation(mpiio_nb_reads, mpiio_nb_writes, has_hdf5_ext mpiio_nb_writes: number of non-blocking mpi write operations has_hdf5_extension: boolean value of whether the file in in hdf5 extension modules: all different mudules been used in the application - ''' + """ if mpiio_nb_reads == 0: - issue = 'Application could benefit from non-blocking (asynchronous) reads' + issue = "Application could benefit from non-blocking (asynchronous) reads" recommendation = [] - if 'H5F' in modules or has_hdf5_extension: + if "H5F" in modules or has_hdf5_extension: recommendation.append( { - 'message': 'Since you use HDF5, consider using the ASYNC I/O VOL connector (https://github.com/hpc-io/vol-async)', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-vol-async-read.c'), line_numbers=True, background_color='default') + "message": "Since you use HDF5, consider using the ASYNC I/O VOL connector (https://github.com/hpc-io/vol-async)", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/hdf5-vol-async-read.c"), + line_numbers=True, + background_color="default", + ), } ) - if 'MPI-IO' in modules: + if "MPI-IO" in modules: recommendation.append( { - 'message': 'Since you use MPI-IO, consider non-blocking/asynchronous I/O operations', # (e.g., MPI_File_iread(), MPI_File_read_all_begin/end(), or MPI_File_read_at_all_begin/end())', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-iread.c'), line_numbers=True, background_color='default') + "message": "Since you use MPI-IO, consider non-blocking/asynchronous I/O operations", # (e.g., MPI_File_iread(), MPI_File_read_all_begin/end(), or MPI_File_read_at_all_begin/end())', + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/mpi-io-iread.c"), + line_numbers=True, + background_color="default", + ), } ) insights_operation.append( - message(INSIGHTS_MPI_IO_BLOCKING_READ_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation) + message( + INSIGHTS_MPI_IO_BLOCKING_READ_USAGE, + TARGET_DEVELOPER, + WARN, + issue, + recommendation, + ) ) if mpiio_nb_writes == 0: - issue = 'Application could benefit from non-blocking (asynchronous) writes' + issue = "Application could benefit from non-blocking (asynchronous) writes" 
recommendation = [] - if 'H5F' in modules or has_hdf5_extension: + if "H5F" in modules or has_hdf5_extension: recommendation.append( { - 'message': 'Since you use HDF5, consider using the ASYNC I/O VOL connector (https://github.com/hpc-io/vol-async)', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-vol-async-write.c'), line_numbers=True, background_color='default') + "message": "Since you use HDF5, consider using the ASYNC I/O VOL connector (https://github.com/hpc-io/vol-async)", + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/hdf5-vol-async-write.c"), + line_numbers=True, + background_color="default", + ), } ) - if 'MPI-IO' in modules: + if "MPI-IO" in modules: recommendation.append( { - 'message': 'Since you use MPI-IO, consider non-blocking/asynchronous I/O operations', # (e.g., MPI_File_iwrite(), MPI_File_write_all_begin/end(), or MPI_File_write_at_all_begin/end())', - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-iwrite.c'), line_numbers=True, background_color='default') + "message": "Since you use MPI-IO, consider non-blocking/asynchronous I/O operations", # (e.g., MPI_File_iwrite(), MPI_File_write_all_begin/end(), or MPI_File_write_at_all_begin/end())', + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/mpi-io-iwrite.c"), + line_numbers=True, + background_color="default", + ), } ) insights_operation.append( - message(INSIGHTS_MPI_IO_BLOCKING_WRITE_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation) + message( + INSIGHTS_MPI_IO_BLOCKING_WRITE_USAGE, + TARGET_DEVELOPER, + WARN, + issue, + recommendation, + ) ) def check_mpi_aggregator(cb_nodes, NUMBER_OF_COMPUTE_NODES): - ''' + """ Check whether application has used inter-node aggregators Parameters: - cb_nodes: + cb_nodes: NUMBER_OF_COMPUTE_NODES: - ''' + """ if cb_nodes > NUMBER_OF_COMPUTE_NODES: - issue = 'Application is using inter-node aggregators (which require network communication)' + issue = "Application is using inter-node aggregators (which require network communication)" recommendation = [ { - 'message': 'Set the MPI hints for the number of aggregators as one per compute node (e.g., cb_nodes={})'.format( + "message": "Set the MPI hints for the number of aggregators as one per compute node (e.g., cb_nodes={})".format( NUMBER_OF_COMPUTE_NODES ), - 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-hints.bash'), line_numbers=True, background_color='default') + "sample": Syntax.from_path( + os.path.join(ROOT, "snippets/mpi-io-hints.bash"), + line_numbers=True, + background_color="default", + ), } ] insights_operation.append( - message(INSIGHTS_MPI_IO_AGGREGATORS_INTER, TARGET_USER, HIGH, issue, recommendation) + message( + INSIGHTS_MPI_IO_AGGREGATORS_INTER, + TARGET_USER, + HIGH, + issue, + recommendation, + ) ) if cb_nodes < NUMBER_OF_COMPUTE_NODES: - issue = 'Application is using intra-node aggregators' + issue = "Application is using intra-node aggregators" insights_operation.append( message(INSIGHTS_MPI_IO_AGGREGATORS_INTRA, TARGET_USER, OK, issue) ) if cb_nodes == NUMBER_OF_COMPUTE_NODES: - issue = 'Application is using one aggregator per compute node' + issue = "Application is using one aggregator per compute node" insights_operation.append( message(INSIGHTS_MPI_IO_AGGREGATORS_OK, TARGET_USER, OK, issue) @@ -1731,65 +2388,75 @@ def check_mpi_aggregator(cb_nodes, NUMBER_OF_COMPUTE_NODES): # Layout and export + def display_content(console): if insights_metadata: console.print( Panel( - Padding( - Group( - *insights_metadata - ), - (1, 1) - ), - 
title='METADATA', - title_align='left' + Padding(Group(*insights_metadata), (1, 1)), + title="METADATA", + title_align="left", ) ) if insights_operation: console.print( Panel( - Padding( - Group( - *insights_operation - ), - (1, 1) - ), - title='OPERATIONS', - title_align='left' + Padding(Group(*insights_operation), (1, 1)), + title="OPERATIONS", + title_align="left", ) ) if insights_dxt: console.print( Panel( - Padding( - Group( - *insights_dxt - ), - (1, 1) - ), - title='DXT', - title_align='left' + Padding(Group(*insights_dxt), (1, 1)), title="DXT", title_align="left" ) ) def display_thresholds(console): tholdMessage = { - 'imbalance_operations': 'Minimum imbalance requests ratio: [white]{}%[/white]'.format(thresholds['imbalance_operations'][0] * 100), - 'small_bytes': 'Minimum size of a small request: [white]{} bytes[/white]'.format(thresholds['small_bytes'][0]), - 'small_requests': 'Maximum small requests ratio: [white]{}%[/white]'.format(thresholds['small_requests'][0] * 100), - 'small_requests_absolute': 'Maximum small requests: [white]{}[/white]'.format(thresholds['small_requests_absolute'][0]), - 'misaligned_requests': 'Maximum misaligned requests ratio: [white]{}%[/white]'.format(thresholds['misaligned_requests'][0] * 100), - 'random_operations': 'Maximum random request ratio: [white]{}%[/white]'.format(thresholds['random_operations'][0] * 100), - 'random_operations_absolute': 'Maximum random requests: [white]{}[/white]'.format(thresholds['random_operations_absolute'][0]), - 'metadata_time_rank': 'Maximum metadata process time per rank: [white]{} seconds[/white]'.format(thresholds['metadata_time_rank'][0]), - 'imbalance_size': 'Maximum read/write size difference ratio: [white]{}%[/white]'.format(thresholds['imbalance_size'][0] * 100), - 'imbalance_stragglers': 'Maximum ratio difference among ranks: [white]{}%[/white]'.format(thresholds['imbalance_stragglers'][0] * 100), - 'interface_stdio': 'Maximum STDIO usage ratio: [white]{}%[/white]'.format(thresholds['interface_stdio'][0] * 100), - 'collective_operations': 'Minimum MPI collective operation usage ratio: [white]{}%[/white]'.format(thresholds['collective_operations'][0] * 100), - 'collective_operations_absolute': 'Minimum MPI collective operations: [white]{}[/white]'.format(thresholds['collective_operations_absolute'][0]), + "imbalance_operations": "Minimum imbalance requests ratio: [white]{}%[/white]".format( + thresholds["imbalance_operations"][0] * 100 + ), + "small_bytes": "Minimum size of a small request: [white]{} bytes[/white]".format( + thresholds["small_bytes"][0] + ), + "small_requests": "Maximum small requests ratio: [white]{}%[/white]".format( + thresholds["small_requests"][0] * 100 + ), + "small_requests_absolute": "Maximum small requests: [white]{}[/white]".format( + thresholds["small_requests_absolute"][0] + ), + "misaligned_requests": "Maximum misaligned requests ratio: [white]{}%[/white]".format( + thresholds["misaligned_requests"][0] * 100 + ), + "random_operations": "Maximum random request ratio: [white]{}%[/white]".format( + thresholds["random_operations"][0] * 100 + ), + "random_operations_absolute": "Maximum random requests: [white]{}[/white]".format( + thresholds["random_operations_absolute"][0] + ), + "metadata_time_rank": "Maximum metadata process time per rank: [white]{} seconds[/white]".format( + thresholds["metadata_time_rank"][0] + ), + "imbalance_size": "Maximum read/write size difference ratio: [white]{}%[/white]".format( + thresholds["imbalance_size"][0] * 100 + ), + "imbalance_stragglers": 
"Maximum ratio difference among ranks: [white]{}%[/white]".format( + thresholds["imbalance_stragglers"][0] * 100 + ), + "interface_stdio": "Maximum STDIO usage ratio: [white]{}%[/white]".format( + thresholds["interface_stdio"][0] * 100 + ), + "collective_operations": "Minimum MPI collective operation usage ratio: [white]{}%[/white]".format( + thresholds["collective_operations"][0] * 100 + ), + "collective_operations_absolute": "Minimum MPI collective operations: [white]{}[/white]".format( + thresholds["collective_operations_absolute"][0] + ), } toBeAppend = [] @@ -1802,24 +2469,19 @@ def display_thresholds(console): toBeAppend.append(message) console.print( - Panel( - '\n'.join(toBeAppend), - title='THRESHOLDS', - title_align='left', - padding=1 - ) + Panel("\n".join(toBeAppend), title="THRESHOLDS", title_align="left", padding=1) ) def display_footer(console, insights_start_time, insights_end_time): console.print( Panel( - ' {} | [white]LBNL[/white] | [white]Drishti report generated at {} in[/white] {:.3f} seconds'.format( + " {} | [white]LBNL[/white] | [white]Drishti report generated at {} in[/white] {:.3f} seconds".format( datetime.datetime.now().year, datetime.datetime.now(), - insights_end_time - insights_start_time + insights_end_time - insights_start_time, ), - box=box.SIMPLE + box=box.SIMPLE, ) ) @@ -1828,37 +2490,28 @@ def export_html(console, export_dir, trace_name): if not args.export_html: return - os.makedirs(export_dir, exist_ok=True) # Ensure export directory exists + os.makedirs(export_dir, exist_ok=True) # Ensure export directory exists filepath = os.path.join(export_dir, f"{trace_name}.html") - console.save_html( - filepath, - theme=set_export_theme(), - clear=False - ) + console.save_html(filepath, theme=set_export_theme(), clear=False) def export_svg(console, export_dir, trace_name): if not args.export_svg: return - - os.makedirs(export_dir, exist_ok=True) # Ensure export directory exists + + os.makedirs(export_dir, exist_ok=True) # Ensure export directory exists filepath = os.path.join(export_dir, f"{trace_name}.svg") - console.save_svg( - filepath, - title='Drishti', - theme=set_export_theme(), - clear=False - ) + console.save_svg(filepath, title="Drishti", theme=set_export_theme(), clear=False) def export_csv(export_dir, trace_name, jobid=None): if not args.export_csv: return - + issues = [ - 'JOB', + "JOB", INSIGHTS_STDIO_HIGH_USAGE, INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE, INSIGHTS_POSIX_READ_COUNT_INTENSIVE, @@ -1890,23 +2543,21 @@ def export_csv(export_dir, trace_name, jobid=None): INSIGHTS_MPI_IO_BLOCKING_WRITE_USAGE, INSIGHTS_MPI_IO_AGGREGATORS_INTRA, INSIGHTS_MPI_IO_AGGREGATORS_INTER, - INSIGHTS_MPI_IO_AGGREGATORS_OK + INSIGHTS_MPI_IO_AGGREGATORS_OK, ] if codes: issues.extend(codes) detected_issues = dict.fromkeys(issues, False) - detected_issues['JOB'] = jobid + detected_issues["JOB"] = jobid for report in csv_report: detected_issues[report] = True - - os.makedirs(export_dir, exist_ok=True) # Ensure export directory exists + os.makedirs(export_dir, exist_ok=True) # Ensure export directory exists filepath = os.path.join(export_dir, f"{trace_name}.csv") - with open(filepath, 'w') as f: + with open(filepath, "w") as f: w = csv.writer(f) w.writerow(detected_issues.keys()) w.writerow(detected_issues.values()) - diff --git a/drishti/includes/parser.py b/drishti/includes/parser.py index 28dcd63..ed58b1d 100644 --- a/drishti/includes/parser.py +++ b/drishti/includes/parser.py @@ -1,128 +1,120 @@ import argparse -parser = argparse.ArgumentParser( - description='Drishti: ' 
-) +parser = argparse.ArgumentParser(description="Drishti: ") parser.add_argument( - 'log_path', - help='Input .darshan file or recorder folder' + "log_paths", nargs="+", help="Input .darshan file or recorder folder" ) parser.add_argument( - '--issues', + "--issues", default=False, - action='store_true', - dest='only_issues', - help='Only displays the detected issues and hides the recommendations' + action="store_true", + dest="only_issues", + help="Only displays the detected issues and hides the recommendations", ) parser.add_argument( - '--html', + "--html", default=False, - action='store_true', - dest='export_html', - help='Export the report as an HTML page' + action="store_true", + dest="export_html", + help="Export the report as an HTML page", ) parser.add_argument( - '--svg', + "--svg", default=False, - action='store_true', - dest='export_svg', - help='Export the report as an SVG image' + action="store_true", + dest="export_svg", + help="Export the report as an SVG image", ) parser.add_argument( - '--light', + "--light", default=False, - action='store_true', - dest='export_theme_light', - help='Use a light theme for the report when generating files' + action="store_true", + dest="export_theme_light", + help="Use a light theme for the report when generating files", ) parser.add_argument( - '--size', + "--size", default=False, - dest='export_size', - help='Console width used for the report and generated files' + dest="export_size", + help="Console width used for the report and generated files", ) parser.add_argument( - '--verbose', + "--verbose", default=False, - action='store_true', - dest='verbose', - help='Display extended details for the recommendations' + action="store_true", + dest="verbose", + help="Display extended details for the recommendations", ) parser.add_argument( - '--threshold', + "--threshold", default=False, - action='store_true', - dest='thold', - help='Display all thresholds used for the report' + action="store_true", + dest="thold", + help="Display all thresholds used for the report", ) parser.add_argument( - '--code', + "--code", default=False, - action='store_true', - dest='code', - help='Display insights identification code' + action="store_true", + dest="code", + help="Display insights identification code", ) parser.add_argument( - '--backtrace', + "--backtrace", default=False, - action='store_true', - dest='backtrace', - help='Enable DXT insights and backtrace' + action="store_true", + dest="backtrace", + help="Enable DXT insights and backtrace", ) parser.add_argument( - '--path', + "--path", default=False, - action='store_true', - dest='full_path', - help='Display the full file path for the files that triggered the issue' + action="store_true", + dest="full_path", + help="Display the full file path for the files that triggered the issue", ) parser.add_argument( - '--csv', + "--csv", default=False, - action='store_true', - dest='export_csv', - help='Export a CSV with the code of all issues that were triggered' + action="store_true", + dest="export_csv", + help="Export a CSV with the code of all issues that were triggered", ) parser.add_argument( - '--export_dir', + "--export_dir", default="", - dest='export_dir', - help='Specify the directory prefix for the output files (if any)' + dest="export_dir", + help="Specify the directory prefix for the output files (if any)", ) -parser.add_argument( - '--json', - default=False, - dest='json', - help=argparse.SUPPRESS -) +parser.add_argument("--json", default=False, dest="json", help=argparse.SUPPRESS) 
parser.add_argument( - '--split', + "--split", default=False, - action='store_true', - dest='split_files', - help='Split the files and generate report for each file' + action="store_true", + dest="split_files", + help="Split the files and generate report for each file", ) parser.add_argument( - '--config', + "--config", default=False, - dest='config', - help='Enable thresholds read from json file' + dest="config", + help="Enable thresholds read from json file", ) args = parser.parse_args() diff --git a/drishti/reporter.py b/drishti/reporter.py index 8455040..a6a8401 100644 --- a/drishti/reporter.py +++ b/drishti/reporter.py @@ -3,10 +3,12 @@ import os import sys from subprocess import call -from drishti.includes.parser import * +from typing import List, Optional +# from includes.parser import * # imports {'parser', 'args', 'argparse'} +from drishti.includes.parser import args -''' +""" |- handler_darshan -| | | reporter -> /handlers -> |- handler_recorder -| -| @@ -15,8 +17,7 @@ ________________________________________________| | |-----> /includes -> module -> config -> parser -''' - +""" LOG_TYPE_DARSHAN = 0 LOG_TYPE_RECORDER = 1 @@ -26,30 +27,57 @@ def clear(): """ Clear the screen with the comment call based on the operating system. """ - _ = call('clear' if os.name == 'posix' else 'cls') + _ = call("clear" if os.name == "posix" else "cls") + + +def check_log_type(paths: List[str]) -> Optional[int]: + is_darshan = True + is_recorder = True + multiple_logs = len(paths) > 1 + for path in paths: + if path.endswith(".darshan"): + if not os.path.isfile(path): + print("Unable to open .darshan file.") + sys.exit(os.EX_NOINPUT) + else: + is_darshan = True and is_darshan + is_recorder = False and is_recorder + else: # check whether is a valid recorder log + if not os.path.isdir(path): + print("Unable to open recorder folder.") + sys.exit(os.EX_NOINPUT) + else: + is_recorder = True and is_recorder + is_darshan = False and is_darshan -def check_log_type(path): - if path.endswith('.darshan'): - if not os.path.isfile(path): - print('Unable to open .darshan file.') + if multiple_logs: + if is_darshan: + return LOG_TYPE_DARSHAN + else: + print("Only .darshan files are supported for multiple logs.") sys.exit(os.EX_NOINPUT) - else: return LOG_TYPE_DARSHAN - else: # check whether is a valid recorder log - if not os.path.isdir(path): - print('Unable to open recorder folder.') + else: + if is_darshan and not is_recorder: + return LOG_TYPE_DARSHAN + elif is_recorder and not is_darshan: + return LOG_TYPE_RECORDER + else: + print("Unable to reliably determine the log type.") sys.exit(os.EX_NOINPUT) - else: return LOG_TYPE_RECORDER def main(): - log_type = check_log_type(args.log_path) - + log_type = check_log_type(args.log_paths) + if log_type == LOG_TYPE_DARSHAN: from drishti.handlers.handle_darshan import handler elif log_type == LOG_TYPE_RECORDER: from drishti.handlers.handle_recorder import handler - + handler() + +if __name__ == "__main__": + main() diff --git a/setup.py b/setup.py index a93a8ce..c3b9d6c 100644 --- a/setup.py +++ b/setup.py @@ -23,6 +23,13 @@ 'rich==12.5.1', 'recorder-utils', ], + extras_require={ + 'dev': [ + 'ruff', + 'isort', + 'mypy' + ], + }, packages=find_packages(), package_data={ 'drishti.includes': [
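
Reviewer note: a minimal usage sketch, not part of the patch, showing how the multi-log support introduced above is expected to be invoked. It relies on the new `log_paths` positional argument (`nargs="+"`) in drishti/includes/parser.py, the existing `--csv` and `--export_dir` flags, and the `if __name__ == "__main__"` guard added to drishti/reporter.py; the .darshan file names and the `reports` directory are placeholders.

import subprocess

# Analyze several Darshan logs in one invocation and write the exported files
# into ./reports. Per the new check_log_type(), multiple logs are only
# supported when every path is a .darshan file.
subprocess.run(
    [
        "python", "-m", "drishti.reporter",
        "--csv",
        "--export_dir", "reports",
        "run1.darshan", "run2.darshan",  # placeholder trace files
    ],
    check=True,
)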