diff --git a/.gitignore b/.gitignore
index d3c0162..74cfd33 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
+sample/tensorflow_unet3d_darshan_per_rank_workload
+
# Created by https://www.toptal.com/developers/gitignore/api/python,linux,macos,windows,pycharm,visualstudiocode
# Edit at https://www.toptal.com/developers/gitignore?templates=python,linux,macos,windows,pycharm,visualstudiocode
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..13566b8
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/.idea/MypyPlugin.xml b/.idea/MypyPlugin.xml
new file mode 100644
index 0000000..ac4cd76
--- /dev/null
+++ b/.idea/MypyPlugin.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/drishti-io.iml b/.idea/drishti-io.iml
new file mode 100644
index 0000000..883789c
--- /dev/null
+++ b/.idea/drishti-io.iml
@@ -0,0 +1,16 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..90404e0
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..c4fcf4c
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/runConfigurations/Sample_1.xml b/.idea/runConfigurations/Sample_1.xml
new file mode 100644
index 0000000..0bc3377
--- /dev/null
+++ b/.idea/runConfigurations/Sample_1.xml
@@ -0,0 +1,26 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/runConfigurations/Sample_2.xml b/.idea/runConfigurations/Sample_2.xml
new file mode 100644
index 0000000..3c03139
--- /dev/null
+++ b/.idea/runConfigurations/Sample_2.xml
@@ -0,0 +1,26 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..35eb1dd
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.python-version b/.python-version
new file mode 100644
index 0000000..cc1923a
--- /dev/null
+++ b/.python-version
@@ -0,0 +1 @@
+3.8
diff --git a/drishti/handlers/darshan_util.py b/drishti/handlers/darshan_util.py
new file mode 100755
index 0000000..43d9cd3
--- /dev/null
+++ b/drishti/handlers/darshan_util.py
@@ -0,0 +1,1094 @@
+import csv
+import datetime
+import io
+import subprocess
+import sys
+import typing
+from dataclasses import dataclass, field
+from enum import Enum
+from functools import cached_property
+import shlex
+from typing import Dict, Final, Optional, Union, List, Tuple, Iterable
+
+import numpy as np
+import pandas as pd
+from darshan import DarshanReport # type: ignore
+import drishti.includes.parser as parser
+import drishti.includes.config as config
+
+
+class ModuleType(str, Enum):
+ """Enum for standard I/O module types"""
+
+ POSIX = "POSIX"
+ STDIO = "STDIO"
+ MPIIO = "MPI-IO"
+
+ def __str__(self) -> str:
+ return self.value
+
+
+@dataclass
+class TimeSpan:
+ start: datetime.datetime
+ end: datetime.datetime
+
+ def __post_init__(self):
+ if self.start > self.end:
+ raise ValueError(
+ f"TimeSpan start ({self.start}) must be <= end ({self.end})"
+ )
+
+
+@dataclass
+class IOCounter:
+ """Base class for I/O metrics with read/write counts"""
+
+ read: Final[int] = field(init=True)
+ write: Final[int] = field(init=True)
+ _total: Optional[int] = None
+
+ @cached_property
+ def total(self) -> int:
+ """Total count, calculated once on first access"""
+ if self._total is not None:
+ return self._total
+ return self.read + self.write
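+
+    # Note: `total` is derived lazily; a pre-computed `_total` takes precedence,
+    # e.g. IOCounter(read=10, write=5).total == 15 but
+    # IOCounter(read=10, write=5, _total=20).total == 20.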
+
+
+@dataclass
+class IOSize(IOCounter):
+ """Represents I/O size statistics in bytes"""
+
+ pass
+
+
+@dataclass
+class IOOperation(IOCounter):
+ """Represents I/O operation count statistics"""
+
+ pass
+
+
+@dataclass
+class IOStatistics:
+ """Tracks both I/O sizes and operations by module with aggregated metrics"""
+
+ # Use dicts to store module-specific data
+ sizes: Dict[ModuleType, IOSize] = field(init=True)
+ operations: Dict[ModuleType, IOOperation] = field(init=True)
+
+ def __post_init__(self):
+ # Initialize standard modules if not present
+ for module in ModuleType:
+ # Ensure that the module is either in both sizes and operations or in neither
+ assert (module in self.sizes) == (module in self.operations), (
+ f"Module {module} should be in both sizes and operations or in neither"
+ )
+
+ if module not in self.sizes:
+ self.sizes[module] = IOSize(read=0, write=0)
+ if module not in self.operations:
+ self.operations[module] = IOOperation(read=0, write=0)
+
+ # Convenience properties for standard modules
+ @cached_property
+ def posix_size(self) -> int:
+ return self.sizes[ModuleType.POSIX].total
+
+ @cached_property
+ def stdio_size(self) -> int:
+ return self.sizes[ModuleType.STDIO].total
+
+ @cached_property
+ def mpiio_size(self) -> int:
+ return self.sizes[ModuleType.MPIIO].total
+
+ @cached_property
+ def posix_ops(self) -> int:
+ return self.operations[ModuleType.POSIX].total
+
+ @cached_property
+ def stdio_ops(self) -> int:
+ return self.operations[ModuleType.STDIO].total
+
+ @cached_property
+ def mpiio_ops(self) -> int:
+ return self.operations[ModuleType.MPIIO].total
+
+ # Aggregated size properties
+ @cached_property
+ def read_bytes(self) -> int:
+ """Total bytes read across all modules."""
+ return sum(size.read for size in self.sizes.values())
+
+ @cached_property
+ def written_bytes(self) -> int:
+ """Total bytes written across all modules."""
+ return sum(size.write for size in self.sizes.values())
+
+ @cached_property
+ def total_bytes(self) -> int:
+ """Total bytes transferred across all modules."""
+ return self.read_bytes + self.written_bytes
+
+ # Aggregated operation properties
+ @cached_property
+ def reads(self) -> int:
+ """Total read operations across all modules."""
+ return sum(op.read for op in self.operations.values())
+
+ @cached_property
+ def writes(self) -> int:
+ """Total write operations across all modules."""
+ return sum(op.write for op in self.operations.values())
+
+ @cached_property
+ def total_ops(self) -> int:
+ """Total operations across all modules."""
+ return self.reads + self.writes
+
+ # Methods to get stats for specific modules
+ def get_module_size(
+ self,
+ module: Optional[Union[ModuleType, str]] = None,
+ data_type: Optional[str] = "total",
+ ) -> int:
+ """Get size statistics for a specific module or all modules if not specified."""
+ if module is None and data_type is None:
+ raise ValueError("Both module and data_type cannot be None")
+
+ if module:
+ if module not in self.sizes:
+ raise ValueError(f"Module {module} not found in sizes")
+ size = self.sizes[module]
+ if data_type == "read":
+ return size.read
+ elif data_type == "write":
+ return size.write
+ else: # data_type is None or "total"
+ return size.total
+ else:
+ if data_type == "read":
+ return self.read_bytes
+ elif data_type == "write":
+ return self.written_bytes
+ else: # data_type is None or "total"
+ return self.total_bytes
+
+ def get_module_ops(
+ self,
+ module: Optional[Union[ModuleType, str]] = None,
+ data_type: Optional[str] = "total",
+ ) -> int:
+ """Get operation statistics for a specific module or all modules if not specified."""
+ if module is None and data_type is None:
+ raise ValueError("Both module and data_type cannot be None")
+
+ if module:
+ if module not in self.operations:
+ raise ValueError(f"Module {module} not found in operations")
+ ops = self.operations[module]
+ if data_type == "read":
+ return ops.read
+ elif data_type == "write":
+ return ops.write
+ else: # data_type is None or "total"
+ return ops.total
+ else:
+ if data_type == "read":
+ return self.reads
+ elif data_type == "write":
+ return self.writes
+ else: # data_type is None or "total"
+ return self.total_ops
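+
+    # Usage sketch: get_module_size(ModuleType.POSIX, "read") yields the POSIX read
+    # volume in bytes, while get_module_ops(data_type="total") falls back to the
+    # aggregate operation count across all modules.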
+
+
+@dataclass
+class SmallIOStats(IOCounter):
+ """Statistics for small I/O operations"""
+
+ pass # Inherits read/write/total from IOCounter
+
+
+@dataclass
+class SharedOpsStats(IOCounter):
+ """Statistics for shared file operations"""
+
+ pass # Inherits read/write/total from IOCounter
+
+
+@dataclass
+class SharedSmallOpsStats(IOCounter):
+ """Statistics for small shared file operations"""
+
+ pass # Inherits read/write/total from IOCounter
+
+
+@dataclass
+class ConsecutiveIOStats(IOCounter):
+ """Statistics for consecutive I/O operations"""
+
+ pass # Inherits read/write/total from IOCounter
+
+
+@dataclass
+class SequentialIOStats(IOCounter):
+ """Statistics for sequential I/O operations"""
+
+ pass # Inherits read/write/total from IOCounter
+
+
+@dataclass
+class RandomIOStats(IOCounter):
+ """Statistics for random I/O operations"""
+
+ pass # Inherits read/write/total from IOCounter
+
+
+@dataclass
+class MPIIONonBlockingStats(IOCounter):
+ """Statistics for non-blocking MPI I/O operations"""
+
+ pass
+
+
+@dataclass
+class MPICollectiveIOStats(IOCounter):
+ """Statistics for collective MPI I/O operations"""
+
+ pass
+
+
+@dataclass
+class MPIIndependentIOStats(IOCounter):
+ """Statistics for independent MPI I/O operations"""
+
+ pass
+
+
+@dataclass
+class AccessPatternStats:
+ """Statistics for I/O access patterns by pattern type"""
+
+ consecutive: ConsecutiveIOStats = field(
+ default_factory=lambda: ConsecutiveIOStats(read=0, write=0), init=True
+ )
+ sequential: SequentialIOStats = field(
+ default_factory=lambda: SequentialIOStats(read=0, write=0), init=True
+ )
+ random: RandomIOStats = field(
+ default_factory=lambda: RandomIOStats(read=0, write=0), init=True
+ )
+
+
+@dataclass
+class DarshanFile:
+    # TODO: All fields that are not lazily computed should be populated eagerly and not be Optional
+ # TODO: Explore using typeddicts instead of dicts
+ file_path: str
+ _darshan_report: Optional[DarshanReport] = None
+ job_id: Optional[str] = None
+ log_ver: Optional[str] = None
+ time: Optional[TimeSpan] = None
+ exe: Optional[str] = None
+ _modules: Optional[Iterable[str]] = None
+ _name_records: Optional[Dict[int, str]] = None # Keys are uint64
+ _max_read_offset: Optional[int] = None
+ _max_write_offset: Optional[int] = None
+ total_files_stdio: Optional[int] = None
+ total_files_posix: Optional[int] = None
+ total_files_mpiio: Optional[int] = None
+ files: Optional[Dict[str, str]] = None
+
+ # Replace individual I/O stats with IOStatistics class
+ _io_stats: Optional[IOStatistics] = None
+
+ # File counts
+ total_files: Optional[int] = 0
+
+ # Additional I/O statistics organized by category
+ _posix_small_io: Optional[SmallIOStats] = None
+
+ _posix_detected_small_files: Optional[pd.DataFrame] = None
+
+ # Direct alignment fields instead of a class
+ _mem_not_aligned: Optional[int] = None
+ _file_not_aligned: Optional[int] = None
+
+ _posix_read_consecutive: Optional[int] = None
+ _posix_write_consecutive: Optional[int] = None
+ _posix_read_sequential: Optional[int] = None
+ _posix_write_sequential: Optional[int] = None
+ _posix_read_random: Optional[int] = None
+ _posix_write_random: Optional[int] = None
+
+ _posix_long_metadata_count: Optional[int] = None
+ _posix_data_stragglers_count: Optional[int] = None
+ _posix_time_stragglers_count: Optional[int] = None
+ _posix_write_imbalance_count: Optional[int] = None
+ _posix_read_imbalance_count: Optional[int] = None
+
+ access_pattern: Optional[AccessPatternStats] = None
+
+ # Use separate classes for shared operations
+ _shared_ops: Optional[SharedOpsStats] = None
+ shared_small_ops: Optional[SharedSmallOpsStats] = None
+
+ count_long_metadata: Optional[int] = None
+ posix_shared_data_imbalance_stragglers_count: Optional[int] = None
+
+ _has_hdf5_extension: Optional[bool] = None
+
+ _mpiio_nb_ops: Optional[MPIIONonBlockingStats] = None
+
+ _cb_nodes: Optional[int] = None
+ _number_of_compute_nodes: Optional[int] = None
+ hints: Optional[List[str]] = None
+
+ timestamp: Optional[TimeSpan] = None
+
+ aggregated: Optional[pd.DataFrame] = None
+
+ _mpi_coll_ops: Optional[MPICollectiveIOStats] = None
+ _mpi_indep_ops: Optional[MPIIndependentIOStats] = None
+
+ detected_files_mpi_coll_reads: Optional[pd.DataFrame] = None
+ detected_files_mpi_coll_writes: Optional[pd.DataFrame] = None
+
+ imbalance_count_posix_shared_time: Optional[int] = None
+ posix_shared_time_imbalance_detected_files: Optional[
+ Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]
+ ] = None
+
+ @cached_property
+ def report(self) -> DarshanReport:
+ if self._darshan_report is None:
+ self._darshan_report = DarshanReport(self.file_path)
+ return self._darshan_report
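+
+    # The underlying DarshanReport is parsed lazily so constructing a DarshanFile
+    # stays cheap; every derived property below goes through this cached accessor.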
+
+ @cached_property
+ def modules(self) -> Iterable[str]:
+ if self._modules is None:
+ self._modules = set(self.report.records.keys())
+ return self._modules
+
+ @cached_property
+ def io_stats(self) -> IOStatistics:
+ if self._io_stats is None:
+ # Calculate I/O sizes
+ sizes: Dict[ModuleType, IOSize] = {}
+ ops: Dict[ModuleType, IOOperation] = {}
+ if ModuleType.STDIO in self.modules:
+ df = self.report.records[ModuleType.STDIO].to_df()
+ counters = df["counters"]
+ assert df, "STDIO module data frame is empty"
+
+ stdio_read_size = counters["STDIO_BYTES_READ"].sum()
+ stdio_write_size = counters["STDIO_BYTES_WRITTEN"].sum()
+ sizes[ModuleType.STDIO] = IOSize(
+ read=stdio_read_size, write=stdio_write_size
+ )
+
+ stdio_read_ops = counters["STDIO_READS"].sum()
+ stdio_write_ops = counters["STDIO_WRITES"].sum()
+ ops[ModuleType.STDIO] = IOOperation(
+ read=stdio_read_ops, write=stdio_write_ops
+ )
+
+ if ModuleType.POSIX in self.modules:
+ df = self.report.records[ModuleType.POSIX].to_df()
+ counters = df["counters"]
+ assert df, "POSIX module data frame is empty"
+
+ posix_write_size = counters["POSIX_BYTES_WRITTEN"].sum()
+ posix_read_size = counters["POSIX_BYTES_READ"].sum()
+ sizes[ModuleType.POSIX] = IOSize(
+ read=posix_read_size, write=posix_write_size
+ )
+
+ posix_read_ops = counters["POSIX_READS"].sum()
+ posix_write_ops = counters["POSIX_WRITES"].sum()
+ ops[ModuleType.POSIX] = IOOperation(
+ read=posix_read_ops, write=posix_write_ops
+ )
+
+ if ModuleType.MPIIO in self.modules:
+ df = self.report.records[ModuleType.MPIIO].to_df()
+ counters = df["counters"]
+ assert df, "MPIIO module data frame is empty"
+
+ mpiio_write_size = counters["MPIIO_BYTES_WRITTEN"].sum()
+ mpiio_read_size = counters["MPIIO_BYTES_READ"].sum()
+ sizes[ModuleType.MPIIO] = IOSize(
+ read=mpiio_read_size, write=mpiio_write_size
+ )
+
+ mpiio_read_ops = counters['MPIIO_INDEP_READS'].sum() + counters['MPIIO_COLL_READS'].sum()
+ mpiio_write_ops = counters['MPIIO_INDEP_WRITES'].sum() + counters['MPIIO_COLL_WRITES'].sum()
+ ops[ModuleType.MPIIO] = IOOperation(
+ read=mpiio_read_ops, write=mpiio_write_ops
+ )
+
+ self._io_stats = IOStatistics(sizes=sizes, operations=ops)
+ return self._io_stats
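+
+    # MPI-IO operation counts above combine the independent and collective counters
+    # (MPIIO_INDEP_* + MPIIO_COLL_*), matching the totals the legacy handler derived.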
+
+ @cached_property
+ def posix_small_io(self) -> SmallIOStats:
+ if self._posix_small_io is None:
+ posix_df = self.report.records[ModuleType.POSIX].to_df()
+ posix_counters = posix_df["counters"]
+ posix_reads_small = (
+ posix_counters["POSIX_SIZE_READ_0_100"].sum()
+ + posix_counters["POSIX_SIZE_READ_100_1K"].sum()
+ + posix_counters["POSIX_SIZE_READ_1K_10K"].sum()
+ + posix_counters["POSIX_SIZE_READ_10K_100K"].sum()
+ + posix_counters["POSIX_SIZE_READ_100K_1M"].sum()
+ )
+ posix_writes_small = (
+ posix_counters["POSIX_SIZE_WRITE_0_100"].sum()
+ + posix_counters["POSIX_SIZE_WRITE_100_1K"].sum()
+ + posix_counters["POSIX_SIZE_WRITE_1K_10K"].sum()
+ + posix_counters["POSIX_SIZE_WRITE_10K_100K"].sum()
+ + posix_counters["POSIX_SIZE_WRITE_100K_1M"].sum()
+ )
+ self._posix_small_io = SmallIOStats(
+ read=posix_reads_small, write=posix_writes_small
+ )
+ return self._posix_small_io
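+
+    # "Small" covers Darshan's request-size histogram bins below 1 MB
+    # (POSIX_SIZE_*_0_100 through POSIX_SIZE_*_100K_1M).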
+
+ @property
+ def posix_detected_small_files(self) -> pd.DataFrame:
+ if self._posix_detected_small_files is None:
+ posix_df = self.report.records[ModuleType.POSIX].to_df()
+ posix_counters = posix_df["counters"]
+ posix_counters["INSIGHTS_POSIX_SMALL_READ"] = (
+ posix_counters["POSIX_SIZE_READ_0_100"]
+ + posix_counters["POSIX_SIZE_READ_100_1K"]
+ + posix_counters["POSIX_SIZE_READ_1K_10K"]
+ + posix_counters["POSIX_SIZE_READ_10K_100K"]
+ + posix_counters["POSIX_SIZE_READ_100K_1M"]
+ )
+ posix_counters["INSIGHTS_POSIX_SMALL_WRITE"] = (
+ posix_counters["POSIX_SIZE_WRITE_0_100"]
+ + posix_counters["POSIX_SIZE_WRITE_100_1K"]
+ + posix_counters["POSIX_SIZE_WRITE_1K_10K"]
+ + posix_counters["POSIX_SIZE_WRITE_10K_100K"]
+ + posix_counters["POSIX_SIZE_WRITE_100K_1M"]
+ )
+ detected_files = pd.DataFrame(
+ posix_counters.groupby("id")[
+ ["INSIGHTS_POSIX_SMALL_READ", "INSIGHTS_POSIX_SMALL_WRITE"]
+ ].sum()
+ ).reset_index()
+ detected_files.columns = pd.Index(["id", "total_reads", "total_writes"])
+ detected_files.loc[:, "id"] = detected_files.loc[:, "id"].astype(str)
+ self._posix_detected_small_files = detected_files
+ return self._posix_detected_small_files
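+
+    # Per-file totals of the small-request bins, grouped by Darshan record id;
+    # ids are cast to str to match how the legacy handler shaped this frame.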
+
+ @property
+ def file_map(self) -> Dict[int, str]:
+ return self.name_records
+
+ @cached_property
+ def name_records(self) -> Dict[int, str]:
+ if self._name_records is None:
+ self._name_records = self.report.name_records
+ return self._name_records
+
+ @property
+ def dxt_posix_df(self) -> Optional[pd.DataFrame]:
+ if parser.args.backtrace is False:
+ return None
+ assert "DXT_POSIX" in self.modules, "Missing DXT_POSIX module"
+ dxt_posix_df = pd.DataFrame(self.report.records["DXT_POSIX"].to_df())
+ return dxt_posix_df
+
+ @property
+ def dxt_posix_read_df(self) -> Optional[pd.DataFrame]:
+ if parser.args.backtrace is False:
+ return None
+ assert "DXT_POSIX" in self.modules, "Missing DXT_POSIX module"
+ df = self.dxt_posix_df
+ assert df is not None, "Should be handled by parser.args.backtrace check"
+
+ if "address_line_mapping" not in df:
+ parser.args.backtrace = False
+ return None
+
+ read_id = []
+ read_rank = []
+ read_length = []
+ read_offsets = []
+ read_end_time = []
+ read_start_time = []
+ read_operation = []
+
+ for r in zip(df["rank"], df["read_segments"], df["write_segments"], df["id"]):
+ if not r[1].empty:
+ read_id.append([r[3]] * len((r[1]["length"].to_list())))
+ read_rank.append([r[0]] * len((r[1]["length"].to_list())))
+ read_length.append(r[1]["length"].to_list())
+ read_end_time.append(r[1]["end_time"].to_list())
+ read_start_time.append(r[1]["start_time"].to_list())
+ read_operation.append(["read"] * len((r[1]["length"].to_list())))
+ read_offsets.append(r[1]["offset"].to_list())
+
+ read_id = [element for nestedlist in read_id for element in nestedlist]
+ read_rank = [element for nestedlist in read_rank for element in nestedlist]
+ read_length = [element for nestedlist in read_length for element in nestedlist]
+ read_offsets = [
+ element for nestedlist in read_offsets for element in nestedlist
+ ]
+ read_end_time = [
+ element for nestedlist in read_end_time for element in nestedlist
+ ]
+ read_operation = [
+ element for nestedlist in read_operation for element in nestedlist
+ ]
+ read_start_time = [
+ element for nestedlist in read_start_time for element in nestedlist
+ ]
+
+ dxt_posix_read_data = {
+ "id": read_id,
+ "rank": read_rank,
+ "length": read_length,
+ "end_time": read_end_time,
+ "start_time": read_start_time,
+ "operation": read_operation,
+ "offsets": read_offsets,
+ }
+
+ return pd.DataFrame(dxt_posix_read_data)
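+
+    # Flattens the per-file DXT read segments into one row per read access
+    # (id, rank, length, offset, start/end time), mirroring what the legacy
+    # handler built inline.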
+
+ @property
+ def dxt_posix_write_df(self) -> Optional[pd.DataFrame]:
+ if parser.args.backtrace is False:
+ return None
+ assert "DXT_POSIX" in self.modules, "Missing DXT_POSIX module"
+ df = self.dxt_posix_df
+ assert df is not None, "Should be handled by parser.args.backtrace check"
+
+ if "address_line_mapping" not in df:
+ parser.args.backtrace = False
+ return None
+
+ write_id = []
+ write_rank = []
+ write_length = []
+ write_offsets = []
+ write_end_time = []
+ write_start_time = []
+ write_operation = []
+
+ for r in zip(df["rank"], df["read_segments"], df["write_segments"], df["id"]):
+ if not r[2].empty:
+ write_id.append([r[3]] * len((r[2]["length"].to_list())))
+ write_rank.append([r[0]] * len((r[2]["length"].to_list())))
+ write_length.append(r[2]["length"].to_list())
+ write_end_time.append(r[2]["end_time"].to_list())
+ write_start_time.append(r[2]["start_time"].to_list())
+ write_operation.append(["write"] * len((r[2]["length"].to_list())))
+ write_offsets.append(r[2]["offset"].to_list())
+
+ write_id = [element for nestedlist in write_id for element in nestedlist]
+ write_rank = [element for nestedlist in write_rank for element in nestedlist]
+ write_length = [
+ element for nestedlist in write_length for element in nestedlist
+ ]
+ write_offsets = [
+ element for nestedlist in write_offsets for element in nestedlist
+ ]
+ write_end_time = [
+ element for nestedlist in write_end_time for element in nestedlist
+ ]
+ write_operation = [
+ element for nestedlist in write_operation for element in nestedlist
+ ]
+ write_start_time = [
+ element for nestedlist in write_start_time for element in nestedlist
+ ]
+
+ dxt_posix_write_data = pd.DataFrame(
+ {
+ "id": write_id,
+ "rank": write_rank,
+ "length": write_length,
+ "end_time": write_end_time,
+ "start_time": write_start_time,
+ "operation": write_operation,
+ "offsets": write_offsets,
+ }
+ )
+
+        return dxt_posix_write_data
+
+ @cached_property
+ def mem_not_aligned(self) -> int:
+ if self._mem_not_aligned is None:
+ posix_df = self.report.records[ModuleType.POSIX].to_df()
+ posix_counters = posix_df["counters"]
+ self._mem_not_aligned = posix_counters["POSIX_MEM_NOT_ALIGNED"].sum()
+ return self._mem_not_aligned
+
+ @cached_property
+ def file_not_aligned(self) -> int:
+ if self._file_not_aligned is None:
+ posix_df = self.report.records[ModuleType.POSIX].to_df()
+ posix_counters = posix_df["counters"]
+ self._file_not_aligned = posix_counters["POSIX_FILE_NOT_ALIGNED"].sum()
+ return self._file_not_aligned
+
+ @property
+ def lustre_df(self) -> Optional[pd.DataFrame]:
+ if "LUSTRE" not in self.modules:
+ return None
+ lustre_dict = self.report.records["LUSTRE"].to_df()
+        assert len(lustre_dict) == 1, f"Expected 1 data frame for LUSTRE, got {len(lustre_dict)}"
+ try:
+ lustre_df = lustre_dict["components"]
+ except KeyError:
+ # Using an older PyDarshan version
+ lustre_df = lustre_dict["counters"]
+ return lustre_df
+
+ @cached_property
+ def max_read_offset(self) -> int:
+ if self._max_read_offset is None:
+ posix_df = self.report.records[ModuleType.POSIX].to_df()
+ posix_counters = posix_df["counters"]
+ self._max_read_offset = posix_counters["POSIX_MAX_BYTE_READ"].max()
+ return self._max_read_offset
+
+ @cached_property
+ def max_write_offset(self) -> int:
+ if self._max_write_offset is None:
+ posix_df = self.report.records[ModuleType.POSIX].to_df()
+ posix_counters = posix_df["counters"]
+ self._max_write_offset = posix_counters["POSIX_MAX_BYTE_WRITTEN"].max()
+ return self._max_write_offset
+
+ @cached_property
+ def posix_read_consecutive(self) -> int:
+ if self._posix_read_consecutive is None:
+ posix_df = self.report.records[ModuleType.POSIX].to_df()
+ posix_counters = posix_df["counters"]
+ self._posix_read_consecutive = posix_counters["POSIX_CONSEC_READS"].sum()
+ return self._posix_read_consecutive
+
+ @cached_property
+ def posix_write_consecutive(self) -> int:
+ if self._posix_write_consecutive is None:
+ posix_df = self.report.records[ModuleType.POSIX].to_df()
+ posix_counters = posix_df["counters"]
+ self._posix_write_consecutive = posix_counters["POSIX_CONSEC_WRITES"].sum()
+ return self._posix_write_consecutive
+
+ @cached_property
+ def posix_read_sequential(self) -> int:
+ if self._posix_read_sequential is None:
+ posix_df = self.report.records[ModuleType.POSIX].to_df()
+ posix_counters = posix_df["counters"]
+ self._posix_read_sequential = (
+ posix_counters["POSIX_SEQ_READS"].sum() - self.posix_read_consecutive
+ )
+ return self._posix_read_sequential
+
+ @cached_property
+ def posix_write_sequential(self) -> int:
+ if self._posix_write_sequential is None:
+ posix_df = self.report.records[ModuleType.POSIX].to_df()
+ posix_counters = posix_df["counters"]
+ self._posix_write_sequential = (
+ posix_counters["POSIX_SEQ_WRITES"].sum() - self.posix_write_consecutive
+ )
+ return self._posix_write_sequential
+
+ @cached_property
+ def posix_read_random(self) -> int:
+ if self._posix_read_random is None:
+ posix_df = self.report.records[ModuleType.POSIX].to_df()
+ posix_counters = posix_df["counters"]
+ self._posix_read_random = (
+ self.io_stats.get_module_ops(ModuleType.POSIX, "read")
+ - self.posix_read_consecutive
+ - self.posix_read_sequential
+ )
+ return self._posix_read_random
+
+ @cached_property
+ def posix_write_random(self) -> int:
+ if self._posix_write_random is None:
+ posix_df = self.report.records[ModuleType.POSIX].to_df()
+ posix_counters = posix_df["counters"]
+ self._posix_write_random = (
+ self.io_stats.get_module_ops(ModuleType.POSIX, "write")
+ - self.posix_write_consecutive
+ - self.posix_write_sequential
+ )
+ return self._posix_write_random
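+
+    # Access-pattern breakdown: POSIX_CONSEC_* is a subset of POSIX_SEQ_*, so
+    # "sequential" above subtracts the consecutive count and "random" is whatever
+    # remains of the total read/write operations.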
+
+ @property
+ def posix_shared_files_df(self) -> pd.DataFrame:
+ assert "POSIX" in self.modules, "Missing POSIX module"
+ posix_df = self.report.records[ModuleType.POSIX].to_df()
+ shared_files_df = posix_df["counters"].loc[(posix_df["counters"]["rank"] == -1)]
+ shared_files_df = shared_files_df.assign(id=lambda d: d["id"].astype(str))
+ return shared_files_df
+
+ @cached_property
+ def posix_shared_reads(self) -> int:
+ if self._shared_ops is None:
+ posix_df = self.report.records[ModuleType.POSIX].to_df()
+ posix_counters = posix_df["counters"]
+ self._shared_ops = SharedOpsStats(
+ read=posix_counters["POSIX_SHARED_READS"].sum(),
+ write=posix_counters["POSIX_SHARED_WRITES"].sum(),
+ )
+ return self._shared_ops.read
+
+ @cached_property
+ def posix_shared_writes(self) -> int:
+ if self._shared_ops is None:
+ posix_df = self.report.records[ModuleType.POSIX].to_df()
+ posix_counters = posix_df["counters"]
+ self._shared_ops = SharedOpsStats(
+ read=posix_counters["POSIX_SHARED_READS"].sum(),
+ write=posix_counters["POSIX_SHARED_WRITES"].sum(),
+ )
+ return self._shared_ops.write
+
+ @cached_property
+ def posix_long_metadata_count(self) -> int:
+ if self._posix_long_metadata_count is None:
+ posix_df = self.report.records[ModuleType.POSIX].to_df()
+ posix_long_metadata_rows = posix_df["fcounters"][
+ (
+ posix_df["fcounters"]["POSIX_F_META_TIME"]
+ > config.thresholds["metadata_time_rank"][0]
+ )
+ ]
+ self._posix_long_metadata_count = len(posix_long_metadata_rows)
+ return self._posix_long_metadata_count
+
+ @property
+ def posix_data_stragglers_df(self) -> pd.DataFrame:
+ shared_files = self.posix_shared_files_df
+
+ detected_files = []
+
+ for index, row in shared_files.iterrows():
+ total_transfer_size = row["POSIX_BYTES_WRITTEN"] + row["POSIX_BYTES_READ"]
+
+ if (
+ total_transfer_size
+ and abs(
+ row["POSIX_SLOWEST_RANK_BYTES"] - row["POSIX_FASTEST_RANK_BYTES"]
+ )
+ / total_transfer_size
+ > config.thresholds["imbalance_stragglers"][0]
+ ):
+ # stragglers_count += 1
+
+ detected_files.append(
+ [
+ row["id"],
+ abs(
+ row["POSIX_SLOWEST_RANK_BYTES"]
+ - row["POSIX_FASTEST_RANK_BYTES"]
+ )
+ / total_transfer_size
+ * 100,
+ ]
+ )
+
+ column_names = ["id", "data_imbalance"]
+ detected_files = pd.DataFrame(detected_files, columns=column_names)
+ return detected_files
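+
+    # A shared file is flagged when the byte gap between its slowest and fastest
+    # rank exceeds the imbalance_stragglers threshold; data_imbalance stores that
+    # gap as a percentage of the file's total transfer size.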
+
+ @cached_property
+ def posix_data_stragglers_count(self) -> int:
+ if self._posix_data_stragglers_count is None:
+ self._posix_data_stragglers_count = len(self.posix_data_stragglers_df)
+ return self._posix_data_stragglers_count
+
+ @property
+ def posix_time_stragglers_df(self) -> pd.DataFrame:
+ df = self.report.records[ModuleType.POSIX].to_df()
+
+ shared_files_times = df['fcounters'].loc[(df['fcounters']['rank'] == -1)]
+
+ # Get the files responsible
+ detected_files = []
+
+ # stragglers_count = 0
+ # stragglers_imbalance = {}
+
+ shared_files_times = shared_files_times.assign(id=lambda d: d['id'].astype(str))
+
+ for index, row in shared_files_times.iterrows():
+ total_transfer_time = row['POSIX_F_WRITE_TIME'] + row['POSIX_F_READ_TIME'] + row['POSIX_F_META_TIME']
+
+ if total_transfer_time and abs(
+ row['POSIX_F_SLOWEST_RANK_TIME'] - row['POSIX_F_FASTEST_RANK_TIME']) / total_transfer_time > \
+ config.thresholds['imbalance_stragglers'][0]:
+ # stragglers_count += 1
+
+ detected_files.append([
+ row['id'],
+ abs(row['POSIX_F_SLOWEST_RANK_TIME'] - row['POSIX_F_FASTEST_RANK_TIME']) / total_transfer_time * 100
+ ])
+
+ column_names = ['id', 'time_imbalance']
+ detected_files = pd.DataFrame(detected_files, columns=column_names)
+
+ return detected_files
+
+ @cached_property
+ def posix_time_stragglers_count(self) -> int:
+ if self._posix_time_stragglers_count is None:
+ self._posix_time_stragglers_count = len(self.posix_time_stragglers_df)
+ return self._posix_time_stragglers_count
+
+ @property
+ def posix_write_imbalance_df(self) -> pd.DataFrame:
+ df = self.report.records[ModuleType.POSIX].to_df()
+
+ aggregated = df['counters'].loc[(df['counters']['rank'] != -1)][
+ ['rank', 'id', 'POSIX_BYTES_WRITTEN', 'POSIX_BYTES_READ']
+ ].groupby('id', as_index=False).agg({
+ 'rank': 'nunique',
+ 'POSIX_BYTES_WRITTEN': ['sum', 'min', 'max'],
+ 'POSIX_BYTES_READ': ['sum', 'min', 'max']
+ })
+
+ aggregated.columns = list(map('_'.join, aggregated.columns.values))
+
+ aggregated = aggregated.assign(id=lambda d: d['id_'].astype(str))
+
+ # Get the files responsible
+ imbalance_count = 0
+
+ detected_files = []
+
+ for index, row in aggregated.iterrows():
+ if row['POSIX_BYTES_WRITTEN_max'] and abs(row['POSIX_BYTES_WRITTEN_max'] - row['POSIX_BYTES_WRITTEN_min']) / \
+ row['POSIX_BYTES_WRITTEN_max'] > config.thresholds['imbalance_size'][0]:
+ imbalance_count += 1
+
+ detected_files.append([
+ row['id'], abs(row['POSIX_BYTES_WRITTEN_max'] - row['POSIX_BYTES_WRITTEN_min']) / row[
+ 'POSIX_BYTES_WRITTEN_max'] * 100
+ ])
+
+ column_names = ['id', 'write_imbalance']
+ detected_files = pd.DataFrame(detected_files, columns=column_names)
+
+ return detected_files
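+
+    # Rank-level write imbalance per file: (max - min) / max bytes written across
+    # ranks, compared against the imbalance_size threshold and reported in percent.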
+
+ @cached_property
+ def posix_write_imbalance_count(self) -> int:
+ if self._posix_write_imbalance_count is None:
+ self._posix_write_imbalance_count = len(self.posix_write_imbalance_df)
+ return self._posix_write_imbalance_count
+
+ @property
+ def posix_read_imbalance_df(self) -> pd.DataFrame:
+ df = self.report.records[ModuleType.POSIX].to_df()
+
+ aggregated = df['counters'].loc[(df['counters']['rank'] != -1)][
+ ['rank', 'id', 'POSIX_BYTES_WRITTEN', 'POSIX_BYTES_READ']
+ ].groupby('id', as_index=False).agg({
+ 'rank': 'nunique',
+ 'POSIX_BYTES_WRITTEN': ['sum', 'min', 'max'],
+ 'POSIX_BYTES_READ': ['sum', 'min', 'max']
+ })
+
+ aggregated.columns = list(map('_'.join, aggregated.columns.values))
+
+ aggregated = aggregated.assign(id=lambda d: d['id_'].astype(str))
+
+ imbalance_count = 0
+
+ detected_files = []
+
+ for index, row in aggregated.iterrows():
+ if row['POSIX_BYTES_READ_max'] and abs(row['POSIX_BYTES_READ_max'] - row['POSIX_BYTES_READ_min']) / row[
+ 'POSIX_BYTES_READ_max'] > config.thresholds['imbalance_size'][0]:
+ imbalance_count += 1
+
+ detected_files.append([
+ row['id'],
+ abs(row['POSIX_BYTES_READ_max'] - row['POSIX_BYTES_READ_min']) / row['POSIX_BYTES_READ_max'] * 100
+ ])
+
+ column_names = ['id', 'read_imbalance']
+ detected_files = pd.DataFrame(detected_files, columns=column_names)
+
+ return detected_files
+
+ @cached_property
+ def posix_read_imbalance_count(self) -> int:
+ if self._posix_read_imbalance_count is None:
+ self._posix_read_imbalance_count = len(self.posix_read_imbalance_df)
+ return self._posix_read_imbalance_count
+
+ @cached_property
+ def mpi_coll_ops(self) -> MPICollectiveIOStats:
+ if self._mpi_coll_ops is None:
+ mpi_df = self.report.records[ModuleType.MPIIO].to_df()
+ mpi_coll_reads = mpi_df['counters']['MPIIO_COLL_READS'].sum()
+ mpiio_coll_writes = mpi_df['counters']['MPIIO_COLL_WRITES'].sum()
+ self._mpi_coll_ops = MPICollectiveIOStats(read=mpi_coll_reads, write=mpiio_coll_writes)
+ return self._mpi_coll_ops
+
+ @cached_property
+ def mpi_indep_ops(self) -> MPIIndependentIOStats:
+ if self._mpi_indep_ops is None:
+ mpi_df = self.report.records[ModuleType.MPIIO].to_df()
+ mpi_indep_reads = mpi_df['counters']['MPIIO_INDEP_READS'].sum()
+ mpi_indep_writes = mpi_df['counters']['MPIIO_INDEP_WRITES'].sum()
+ self._mpi_indep_ops = MPIIndependentIOStats(read=mpi_indep_reads, write=mpi_indep_writes)
+ return self._mpi_indep_ops
+
+ @property
+ def mpi_read_df(self) -> pd.DataFrame:
+ mpi_df = self.report.records[ModuleType.MPIIO].to_df()
+ counters = mpi_df['counters']
+ mpi_coll_reads = self.mpi_coll_ops.read
+ mpi_total_reads = self.io_stats.get_module_ops(ModuleType.MPIIO, "read")
+
+ detected_files = []
+
+ if mpi_coll_reads == 0 and mpi_total_reads and mpi_total_reads > \
+ config.thresholds['collective_operations_absolute'][0]:
+ files = pd.DataFrame(counters.groupby('id').sum()).reset_index()
+ for index, row in counters.iterrows():
+ if ((row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) and
+ row['MPIIO_INDEP_READS'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) >
+ config.thresholds['collective_operations'][0] and
+ (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) >
+ config.thresholds['collective_operations_absolute'][0]):
+ detected_files.append([
+ row['id'], row['MPIIO_INDEP_READS'],
+ row['MPIIO_INDEP_READS'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) * 100
+ ])
+
+ column_names = ['id', 'absolute_indep_reads', 'percent_indep_reads']
+ detected_files = pd.DataFrame(detected_files, columns=column_names)
+
+ return detected_files
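+
+    # Only populated when no collective reads were issued at all yet the total
+    # MPI-IO read count exceeds the absolute threshold; lists the files dominated
+    # by independent reads.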
+
+ @property
+ def dxt_mpi_df(self) -> Optional[pd.DataFrame]:
+ if not parser.args.backtrace:
+ return None
+ if "DXT_MPIIO" not in self.modules:
+ return None
+
+ dxt_mpiio = self.report.records["DXT_MPIIO"].to_df()
+ dxt_mpiio = pd.DataFrame(dxt_mpiio)
+ return dxt_mpiio
+
+ @property
+ def mpi_write_df(self) -> pd.DataFrame:
+ mpi_df = self.report.records[ModuleType.MPIIO].to_df()
+ counters = mpi_df['counters']
+
+ mpi_coll_writes = self.mpi_coll_ops.write
+ total_mpiio_write_operations = self.io_stats.get_module_ops(ModuleType.MPIIO, "write")
+
+ detected_files = []
+ if mpi_coll_writes == 0 and total_mpiio_write_operations and total_mpiio_write_operations > \
+ config.thresholds['collective_operations_absolute'][0]:
+ files = pd.DataFrame(counters.groupby('id').sum()).reset_index()
+
+ for index, row in counters.iterrows():
+ if ((row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) and
+ row['MPIIO_INDEP_WRITES'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) >
+ config.thresholds['collective_operations'][0] and
+ (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) >
+ config.thresholds['collective_operations_absolute'][0]):
+ detected_files.append([
+ row['id'], row['MPIIO_INDEP_WRITES'],
+ row['MPIIO_INDEP_WRITES'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) * 100
+ ])
+
+ column_names = ['id', 'absolute_indep_writes', 'percent_indep_writes']
+ detected_files = pd.DataFrame(detected_files, columns=column_names)
+
+ return detected_files
+
+ @cached_property
+ def mpiio_nb_ops(self) -> MPIIONonBlockingStats:
+ if self._mpiio_nb_ops is None:
+ mpi_df = self.report.records[ModuleType.MPIIO].to_df()
+ mpi_nb_reads = mpi_df['counters']['MPIIO_NB_READS'].sum()
+ mpi_nb_writes = mpi_df['counters']['MPIIO_NB_WRITES'].sum()
+ self._mpiio_nb_ops = MPIIONonBlockingStats(read=mpi_nb_reads, write=mpi_nb_writes)
+ return self._mpiio_nb_ops
+
+ @cached_property
+ def has_hdf5_extension(self) -> bool:
+ if self._has_hdf5_extension is None:
+ self._has_hdf5_extension = False
+ mpi_df = self.report.records[ModuleType.MPIIO].to_df()
+ # for index, row in mpi_df['counters'].iterrows(): # Implicitly converts all data to np.float64. Problematic for id (np.uint64)
+ for row in mpi_df['counters'].itertuples(index=False):
+ # if self.file_map[int(row['id'])].endswith('.h5') or self.file_map[int(row['id'])].endswith('.hdf5'):
+ if self.file_map[row.id].endswith('.h5') or self.file_map[row.id].endswith('.hdf5'):
+ self._has_hdf5_extension = True
+ break
+ return self._has_hdf5_extension
+
+ @cached_property
+    def cb_nodes(self) -> Optional[int]:
+ if self._cb_nodes is None:
+ assert ModuleType.MPIIO in self.modules, "Missing MPIIO module"
+ hints = ""
+ if 'h' in self.report.metadata['job']['metadata']:
+ hints = self.report.metadata['job']['metadata']['h']
+ if hints:
+ hints = hints.split(';')
+
+ cb_nodes = None
+
+ for hint in hints:
+ if hint != 'no':
+ (key, value) = hint.split('=')
+
+ if key == 'cb_nodes':
+                        cb_nodes = int(value)
+
+            self._cb_nodes = cb_nodes
+        return self._cb_nodes
+
+ @cached_property
+    def number_of_compute_nodes(self) -> Optional[int]:
+ if self._number_of_compute_nodes is None:
+ assert ModuleType.MPIIO in self.modules, "Missing MPIIO module"
+ command = 'sacct --job {} --format=JobID,JobIDRaw,NNodes,NCPUs --parsable2 --delimiter ","'.format(
+ self.report.metadata['job']['jobid']
+ )
+ arguments = shlex.split(command)
+
+ try:
+ result = subprocess.run(arguments, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+ if result.returncode == 0:
+ # We have successfully fetched the information from SLURM
+ db = csv.DictReader(io.StringIO(result.stdout.decode('utf-8')))
+
+ try:
+ first = next(db)
+
+ if 'NNodes' in first:
+                        self._number_of_compute_nodes = int(first['NNodes'])
+
+ except StopIteration:
+ pass
+ except FileNotFoundError:
+ pass
+ return self._number_of_compute_nodes
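+
+    # Node count is best-effort: it shells out to SLURM's sacct using the job id
+    # recorded in the Darshan header and returns None when sacct is unavailable.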
diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py
index ea690f3..86dcf6f 100644
--- a/drishti/handlers/handle_darshan.py
+++ b/drishti/handlers/handle_darshan.py
@@ -1,18 +1,66 @@
#!/usr/bin/env python3
+import csv
+import datetime
import io
-import sys
-import time
+import os
import shlex
import shutil
import subprocess
-import pandas as pd
-import darshan
-import darshan.backend.cffi_backend as darshanll
+import sys
+import time
-from rich import print
+import darshan # type: ignore
+import darshan.backend.cffi_backend as darshanll # type: ignore
+import numpy as np
+import pandas as pd
from packaging import version
-from drishti.includes.module import *
+from rich import print
+from rich.padding import Padding
+from rich.panel import Panel
+
+from drishti.handlers.darshan_util import DarshanFile, ModuleType
+
+from drishti.includes.config import (
+ HIGH,
+ RECOMMENDATIONS,
+ WARN,
+ init_console,
+ insights_total,
+ thresholds,
+)
+
+# from drishti.includes.module import *
+import drishti.includes.module as module
+
+# from drishti.includes.module import (
+# check_individual_read_imbalance,
+# check_individual_write_imbalance,
+# check_long_metadata,
+# check_misaligned,
+# check_mpi_aggregator,
+# check_mpi_collective_read_operation,
+# check_mpi_collective_write_operation,
+# check_mpi_none_block_operation,
+# check_mpiio,
+# check_operation_intensive,
+# check_random_operation,
+# check_shared_data_imblance,
+# check_shared_small_operation,
+# check_shared_time_imbalance,
+# check_size_intensive,
+# check_small_operation,
+# check_stdio,
+# check_traffic,
+# display_content,
+# display_footer,
+# display_thresholds,
+# export_csv,
+# export_html,
+# export_svg,
+# )
+import drishti.includes.parser as parser
+# from drishti.includes.parser import args
def is_available(name):
@@ -75,7 +123,8 @@ def handler():
insights_start_time = time.time()
- log = darshanll.log_open(args.log_path)
+ darshan_log_path = parser.args.log_paths[0]
+ log = darshanll.log_open(darshan_log_path)
modules = darshanll.log_get_modules(log)
@@ -88,8 +137,8 @@ def handler():
library_version = darshanll.get_lib_version()
# Make sure log format is of the same version
- filename = args.log_path
- # check_log_version(console, args.log_path, log_version, library_version)
+ filename = darshan_log_path
+ # check_log_version(console, darshan_log_path, log_version, library_version)
darshanll.log_close(log)
@@ -99,6 +148,9 @@ def handler():
job = report.metadata
+ #########################################################################################################################################################################
+ darshan_file_obj = DarshanFile(file_path=darshan_log_path)
+
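+    # Migration note: the legacy per-metric computations below are kept for now;
+    # converted call sites typically assert that the matching DarshanFile property
+    # agrees with the legacy value before module.* is invoked with the new object.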
#########################################################################################################################################################################
# Check usage of STDIO, POSIX, and MPI-IO per file
@@ -156,13 +208,12 @@ def handler():
df_lustre = None
if "LUSTRE" in report.records:
df_lustre = report.records['LUSTRE'].to_df()
-
- if args.backtrace:
+ if parser.args.backtrace:
if "DXT_POSIX" in report.records:
dxt_posix = report.records["DXT_POSIX"].to_df()
dxt_posix = pd.DataFrame(dxt_posix)
if "address_line_mapping" not in dxt_posix:
- args.backtrace = False
+ parser.args.backtrace = False
else:
read_id = []
read_rank = []
@@ -290,8 +341,10 @@ def handler():
'mpiio': uses_mpiio
}
- check_stdio(total_size, total_size_stdio)
- check_mpiio(modules)
+ # module.check_stdio(total_size, total_size_stdio)
+ module.check_stdio(total_size=darshan_file_obj.io_stats.total_bytes, total_size_stdio=darshan_file_obj.io_stats.stdio_size)
+ # module.check_mpiio(modules)
+ module.check_mpiio(modules=darshan_file_obj.modules)
#########################################################################################################################################################################
@@ -305,17 +358,27 @@ def handler():
total_writes = df['counters']['POSIX_WRITES'].sum()
# Get total number of I/O operations
- total_operations = total_writes + total_reads
-
- # To check whether the application is write-intersive or read-intensive we only look at the POSIX level and check if the difference between reads and writes is larger than 10% (for more or less), otherwise we assume a balance
- check_operation_intensive(total_operations, total_reads, total_writes)
+ total_operations = total_writes + total_reads
+
+ # To check whether the application is write-intensive or read-intensive we only look at the POSIX level and check if the difference between reads and writes is larger than 10% (for more or less), otherwise we assume a balance
+ # module.check_operation_intensive(total_operations, total_reads, total_writes)
+ module.check_operation_intensive(
+ total_operations=darshan_file_obj.io_stats.posix_ops,
+ total_reads=darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX, "read"),
+ total_writes=darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX, "write"),
+ )
total_read_size = df['counters']['POSIX_BYTES_READ'].sum()
total_written_size = df['counters']['POSIX_BYTES_WRITTEN'].sum()
total_size = total_written_size + total_read_size
- check_size_intensive(total_size, total_read_size, total_written_size)
+ # module.check_size_intensive(total_size, total_read_size, total_written_size)
+ module.check_size_intensive(
+ total_size=darshan_file_obj.io_stats.posix_size,
+ total_read_size=darshan_file_obj.io_stats.get_module_size(ModuleType.POSIX, "read"),
+ total_written_size=darshan_file_obj.io_stats.get_module_size(ModuleType.POSIX, "write"),
+ )
#########################################################################################################################################################################
@@ -359,7 +422,19 @@ def handler():
detected_files.columns = ['id', 'total_reads', 'total_writes']
detected_files.loc[:, 'id'] = detected_files.loc[:, 'id'].astype(str)
- check_small_operation(total_reads, total_reads_small, total_writes, total_writes_small, detected_files, modules, file_map, dxt_posix, dxt_posix_read_data, dxt_posix_write_data)
+
+ # module.check_small_operation(total_reads, total_reads_small, total_writes, total_writes_small, detected_files, modules, file_map, dxt_posix, dxt_posix_read_data, dxt_posix_write_data)
+ module.check_small_operation(
+ total_reads=darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX, "read"),
+ total_reads_small=darshan_file_obj.posix_small_io.read,
+ total_writes=darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX, "write"),
+ total_writes_small=darshan_file_obj.posix_small_io.write,
+ detected_files=darshan_file_obj.posix_detected_small_files, modules=darshan_file_obj.modules,
+ file_map=darshan_file_obj.file_map,
+ dxt_posix=darshan_file_obj.dxt_posix_df,
+ dxt_posix_read_data=darshan_file_obj.dxt_posix_read_df,
+ dxt_posix_write_data=darshan_file_obj.dxt_posix_write_df,
+ )
#########################################################################################################################################################################
@@ -368,7 +443,17 @@ def handler():
total_mem_not_aligned = df['counters']['POSIX_MEM_NOT_ALIGNED'].sum()
total_file_not_aligned = df['counters']['POSIX_FILE_NOT_ALIGNED'].sum()
- check_misaligned(total_operations, total_mem_not_aligned, total_file_not_aligned, modules, file_map, df_lustre, dxt_posix, dxt_posix_read_data)
+ # module.check_misaligned(total_operations, total_mem_not_aligned, total_file_not_aligned, modules, file_map, df_lustre, dxt_posix, dxt_posix_read_data)
+ module.check_misaligned(
+ total_operations=darshan_file_obj.io_stats.posix_ops,
+ total_mem_not_aligned=darshan_file_obj.mem_not_aligned,
+ total_file_not_aligned=darshan_file_obj.file_not_aligned,
+ modules=darshan_file_obj.modules,
+ file_map=darshan_file_obj.file_map,
+ df_lustre=darshan_file_obj.lustre_df,
+ dxt_posix=darshan_file_obj.dxt_posix_df,
+ dxt_posix_read_data=darshan_file_obj.dxt_posix_read_df,
+ )
#########################################################################################################################################################################
@@ -377,7 +462,16 @@ def handler():
max_read_offset = df['counters']['POSIX_MAX_BYTE_READ'].max()
max_write_offset = df['counters']['POSIX_MAX_BYTE_WRITTEN'].max()
- check_traffic(max_read_offset, total_read_size, max_write_offset, total_written_size, dxt_posix, dxt_posix_read_data, dxt_posix_write_data)
+ # module.check_traffic(max_read_offset, total_read_size, max_write_offset, total_written_size, dxt_posix, dxt_posix_read_data, dxt_posix_write_data)
+ module.check_traffic(
+ max_read_offset=darshan_file_obj.max_read_offset,
+ total_read_size=darshan_file_obj.io_stats.get_module_size(ModuleType.POSIX, "read"),
+ max_write_offset=darshan_file_obj.max_write_offset,
+ total_written_size=darshan_file_obj.io_stats.get_module_size(ModuleType.POSIX, "write"),
+ dxt_posix=darshan_file_obj.dxt_posix_df,
+ dxt_posix_read_data=darshan_file_obj.dxt_posix_read_df,
+ dxt_posix_write_data=darshan_file_obj.dxt_posix_write_df,
+ )
#########################################################################################################################################################################
@@ -402,7 +496,30 @@ def handler():
write_random = total_writes - write_consecutive - write_sequential
#print('WRITE Random: {} ({:.2f}%)'.format(write_random, write_random / total_writes * 100))
- check_random_operation(read_consecutive, read_sequential, read_random, total_reads, write_consecutive, write_sequential, write_random, total_writes, dxt_posix, dxt_posix_read_data, dxt_posix_write_data)
+
+ assert read_consecutive == darshan_file_obj.posix_read_consecutive
+ assert read_sequential == darshan_file_obj.posix_read_sequential
+ assert read_random == darshan_file_obj.posix_read_random, f"{read_random} != {darshan_file_obj.posix_read_random}"
+ assert total_reads == darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX,"read"), f"{total_reads} != {darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX, 'read')}"
+ assert write_consecutive == darshan_file_obj.posix_write_consecutive
+ assert write_sequential == darshan_file_obj.posix_write_sequential
+ assert write_random == darshan_file_obj.posix_write_random
+ assert total_writes == darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX,"write")
+
+ # module.check_random_operation(read_consecutive, read_sequential, read_random, total_reads, write_consecutive, write_sequential, write_random, total_writes, dxt_posix, dxt_posix_read_data, dxt_posix_write_data)
+ module.check_random_operation(
+ read_consecutive=darshan_file_obj.posix_read_consecutive,
+ read_sequential=darshan_file_obj.posix_read_sequential,
+ read_random=darshan_file_obj.posix_read_random,
+ total_reads=darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX,"read"),
+ write_consecutive=darshan_file_obj.posix_write_consecutive,
+ write_sequential=darshan_file_obj.posix_write_sequential,
+ write_random=darshan_file_obj.posix_write_random,
+ total_writes=darshan_file_obj.io_stats.get_module_ops(ModuleType.POSIX,"write"),
+ dxt_posix=darshan_file_obj.dxt_posix_df,
+ dxt_posix_read_data=darshan_file_obj.dxt_posix_read_df,
+ dxt_posix_write_data=darshan_file_obj.dxt_posix_write_df,
+ )
#########################################################################################################################################################################
@@ -413,6 +530,7 @@ def handler():
shared_files = shared_files.assign(id=lambda d: d['id'].astype(str))
if not shared_files.empty:
+ # TODO: This entire conditional
total_shared_reads = shared_files['POSIX_READS'].sum()
total_shared_reads_small = (
shared_files['POSIX_SIZE_READ_0_100'].sum() +
@@ -448,16 +566,22 @@ def handler():
shared_files['POSIX_SIZE_WRITE_100K_1M']
)
- check_shared_small_operation(total_shared_reads, total_shared_reads_small, total_shared_writes, total_shared_writes_small, shared_files, file_map)
+ # module.check_shared_small_operation(total_shared_reads, total_shared_reads_small, total_shared_writes, total_shared_writes_small, shared_files, file_map)
+ assert total_shared_reads == darshan_file_obj.posix_shared_reads
+ sys.exit(2)
+ module.check_shared_small_operation(total_shared_reads, total_shared_reads_small, total_shared_writes, total_shared_writes_small, shared_files, file_map)
#########################################################################################################################################################################
count_long_metadata = len(df['fcounters'][(df['fcounters']['POSIX_F_META_TIME'] > thresholds['metadata_time_rank'][0])])
- check_long_metadata(count_long_metadata, modules)
+ assert darshan_file_obj.posix_long_metadata_count == count_long_metadata
+ assert darshan_file_obj.modules == modules.keys(), f"{darshan_file_obj.modules} != {modules.keys()}"
+ # module.check_long_metadata(count_long_metadata, modules)
+ module.check_long_metadata(count_long_metadata=darshan_file_obj.posix_long_metadata_count, modules=darshan_file_obj.modules)
# We already have a single line for each shared-file access
- # To check for stragglers, we can check the difference between the
+ # To check for stragglers, we can check the difference between the
# POSIX_FASTEST_RANK_BYTES
# POSIX_SLOWEST_RANK_BYTES
@@ -482,7 +606,21 @@ def handler():
column_names = ['id', 'data_imbalance']
detected_files = pd.DataFrame(detected_files, columns=column_names)
- check_shared_data_imblance(stragglers_count, detected_files, file_map, dxt_posix, dxt_posix_read_data, dxt_posix_write_data)
+ assert stragglers_count == darshan_file_obj.posix_data_stragglers_count, f"{stragglers_count} != {darshan_file_obj.posix_data_stragglers_count}"
+ assert detected_files.equals(darshan_file_obj.posix_data_stragglers_df), f"{detected_files} != {darshan_file_obj.posix_data_stragglers_df}"
+ assert file_map == darshan_file_obj.file_map, f"{file_map} != {darshan_file_obj.file_map}"
+    assert (dxt_posix is None and darshan_file_obj.dxt_posix_df is None) or dxt_posix.equals(darshan_file_obj.dxt_posix_df), f"{dxt_posix} != {darshan_file_obj.dxt_posix_df}"
+    assert (dxt_posix_read_data is None and darshan_file_obj.dxt_posix_read_df is None) or dxt_posix_read_data.equals(darshan_file_obj.dxt_posix_read_df), f"{dxt_posix_read_data} != {darshan_file_obj.dxt_posix_read_df}"
+    assert (dxt_posix_write_data is None and darshan_file_obj.dxt_posix_write_df is None) or dxt_posix_write_data.equals(darshan_file_obj.dxt_posix_write_df), f"{dxt_posix_write_data} != {darshan_file_obj.dxt_posix_write_df}"
+ # module.check_shared_data_imblance(stragglers_count, detected_files, file_map, dxt_posix, dxt_posix_read_data, dxt_posix_write_data)
+ module.check_shared_data_imblance(
+ stragglers_count=darshan_file_obj.posix_data_stragglers_count,
+ detected_files=darshan_file_obj.posix_data_stragglers_df,
+ file_map=darshan_file_obj.file_map,
+ dxt_posix=darshan_file_obj.dxt_posix_df,
+ dxt_posix_read_data = darshan_file_obj.dxt_posix_read_df,
+ dxt_posix_write_data = darshan_file_obj.dxt_posix_write_df
+ )
# POSIX_F_FASTEST_RANK_TIME
# POSIX_F_SLOWEST_RANK_TIME
@@ -494,7 +632,7 @@ def handler():
detected_files = []
stragglers_count = 0
- stragglers_imbalance = {}
+ # stragglers_imbalance = {}
shared_files_times = shared_files_times.assign(id=lambda d: d['id'].astype(str))
@@ -510,7 +648,17 @@ def handler():
column_names = ['id', 'time_imbalance']
detected_files = pd.DataFrame(detected_files, columns=column_names)
- check_shared_time_imbalance(stragglers_count, detected_files, file_map)
+
+ assert stragglers_count == darshan_file_obj.posix_time_stragglers_count, f"{stragglers_count} != {darshan_file_obj.posix_time_stragglers_count}"
+ assert detected_files.equals(darshan_file_obj.posix_time_stragglers_df), f"{detected_files} != {darshan_file_obj.posix_time_stragglers_df}"
+ assert file_map == darshan_file_obj.file_map, f"{file_map} != {darshan_file_obj.file_map}"
+
+ # module.check_shared_time_imbalance(stragglers_count, detected_files, file_map)
+ module.check_shared_time_imbalance(
+ stragglers_count=darshan_file_obj.posix_time_stragglers_count,
+ detected_files=darshan_file_obj.posix_time_stragglers_df,
+ file_map=darshan_file_obj.file_map,
+ )
aggregated = df['counters'].loc[(df['counters']['rank'] != -1)][
['rank', 'id', 'POSIX_BYTES_WRITTEN', 'POSIX_BYTES_READ']
@@ -539,7 +687,22 @@ def handler():
column_names = ['id', 'write_imbalance']
detected_files = pd.DataFrame(detected_files, columns=column_names)
- check_individual_write_imbalance(imbalance_count, detected_files, file_map, dxt_posix, dxt_posix_write_data)
+
+ assert imbalance_count == darshan_file_obj.posix_write_imbalance_count, f"{imbalance_count} != {darshan_file_obj.posix_write_imbalance_count}"
+ assert detected_files.equals(darshan_file_obj.posix_write_imbalance_df), f"{detected_files} != {darshan_file_obj.posix_write_imbalance_df}"
+ assert file_map == darshan_file_obj.file_map, f"{file_map} != {darshan_file_obj.file_map}"
+    assert (dxt_posix is None and darshan_file_obj.dxt_posix_df is None) or dxt_posix.equals(darshan_file_obj.dxt_posix_df), f"{dxt_posix} != {darshan_file_obj.dxt_posix_df}"
+    assert (dxt_posix_read_data is None and darshan_file_obj.dxt_posix_read_df is None) or dxt_posix_read_data.equals(darshan_file_obj.dxt_posix_read_df), f"{dxt_posix_read_data} != {darshan_file_obj.dxt_posix_read_df}"
+    assert (dxt_posix_write_data is None and darshan_file_obj.dxt_posix_write_df is None) or dxt_posix_write_data.equals(darshan_file_obj.dxt_posix_write_df), f"{dxt_posix_write_data} != {darshan_file_obj.dxt_posix_write_df}"
+
+ # module.check_individual_write_imbalance(imbalance_count, detected_files, file_map, dxt_posix, dxt_posix_write_data)
+ module.check_individual_write_imbalance(
+ imbalance_count=darshan_file_obj.posix_write_imbalance_count,
+ detected_files=darshan_file_obj.posix_write_imbalance_df,
+ file_map=darshan_file_obj.file_map,
+ dxt_posix=darshan_file_obj.dxt_posix_df,
+ dxt_posix_write_data=darshan_file_obj.dxt_posix_write_df
+ )
imbalance_count = 0
@@ -555,7 +718,21 @@ def handler():
column_names = ['id', 'read_imbalance']
detected_files = pd.DataFrame(detected_files, columns=column_names)
- check_individual_read_imbalance(imbalance_count, detected_files, file_map, dxt_posix, dxt_posix_read_data)
+
+ assert imbalance_count == darshan_file_obj.posix_read_imbalance_count, f"{imbalance_count} != {darshan_file_obj.posix_read_imbalance_count}"
+ assert detected_files.equals(darshan_file_obj.posix_read_imbalance_df), f"{detected_files} != {darshan_file_obj.posix_read_imbalance_df}"
+ assert file_map == darshan_file_obj.file_map, f"{file_map} != {darshan_file_obj.file_map}"
+    assert (dxt_posix is None and darshan_file_obj.dxt_posix_df is None) or dxt_posix.equals(darshan_file_obj.dxt_posix_df), f"{dxt_posix} != {darshan_file_obj.dxt_posix_df}"
+    assert (dxt_posix_read_data is None and darshan_file_obj.dxt_posix_read_df is None) or dxt_posix_read_data.equals(darshan_file_obj.dxt_posix_read_df), f"{dxt_posix_read_data} != {darshan_file_obj.dxt_posix_read_df}"
+
+ # module.check_individual_read_imbalance(imbalance_count, detected_files, file_map, dxt_posix, dxt_posix_read_data)
+ module.check_individual_read_imbalance(
+ imbalance_count=darshan_file_obj.posix_read_imbalance_count,
+ detected_files=darshan_file_obj.posix_read_imbalance_df,
+ file_map=darshan_file_obj.file_map,
+ dxt_posix=darshan_file_obj.dxt_posix_df,
+ dxt_posix_read_data=darshan_file_obj.dxt_posix_read_df
+ )
#########################################################################################################################################################################
@@ -590,7 +767,30 @@ def handler():
column_names = ['id', 'absolute_indep_reads', 'percent_indep_reads']
detected_files = pd.DataFrame(detected_files, columns=column_names)
- check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, total_mpiio_read_operations, detected_files, file_map, dxt_mpiio)
+ assert mpiio_coll_reads == darshan_file_obj.mpi_coll_ops.read, f"{mpiio_coll_reads} != {darshan_file_obj.mpi_coll_ops.read}"
+ assert mpiio_indep_reads == darshan_file_obj.mpi_indep_ops.read, f"{mpiio_indep_reads} != {darshan_file_obj.mpi_indep_ops.read}"
+ assert total_mpiio_read_operations == darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "read"), f"{total_mpiio_read_operations} != {darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, 'read')}"
+ if detected_files.empty:
+ assert detected_files.empty, f"{detected_files} != {darshan_file_obj.mpi_read_df}"
+ assert darshan_file_obj.mpi_read_df.empty, f"{darshan_file_obj.mpi_read_df} != {detected_files}"
+ else:
+ assert detected_files.equals(darshan_file_obj.mpi_read_df), f"{detected_files} != {darshan_file_obj.mpi_read_df}"
+ assert file_map == darshan_file_obj.file_map, f"{file_map} != {darshan_file_obj.file_map}"
+ if dxt_mpiio is None:
+ assert dxt_mpiio is None, f"{dxt_mpiio} != {darshan_file_obj.dxt_mpi_df}"
+ assert darshan_file_obj.dxt_mpi_df is None, f"{darshan_file_obj.dxt_mpi_df} != {dxt_mpiio}"
+ else:
+ assert dxt_mpiio.equals(darshan_file_obj.dxt_mpi_df), f"{dxt_mpiio} != {darshan_file_obj.dxt_mpi_df}"
+
+ # module.check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, total_mpiio_read_operations, detected_files, file_map, dxt_mpiio)
+ module.check_mpi_collective_read_operation(
+ mpiio_coll_reads=darshan_file_obj.mpi_coll_ops.read,
+ mpiio_indep_reads=darshan_file_obj.mpi_indep_ops.read,
+ total_mpiio_read_operations=darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "read"),
+ detected_files=darshan_file_obj.mpi_read_df,
+ file_map=darshan_file_obj.file_map,
+ dxt_mpiio=darshan_file_obj.dxt_mpi_df
+ )
df_mpiio_collective_writes = df_mpiio['counters'] #.loc[(df_mpiio['counters']['MPIIO_COLL_WRITES'] > 0)]
@@ -615,7 +815,30 @@ def handler():
column_names = ['id', 'absolute_indep_writes', 'percent_indep_writes']
detected_files = pd.DataFrame(detected_files, columns=column_names)
- check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, total_mpiio_write_operations, detected_files, file_map, dxt_mpiio)
+ assert mpiio_indep_writes == darshan_file_obj.mpi_indep_ops.write, f"{mpiio_indep_writes} != {darshan_file_obj.mpi_indep_ops.write}"
+ assert mpiio_coll_writes == darshan_file_obj.mpi_coll_ops.write, f"{mpiio_coll_writes} != {darshan_file_obj.mpi_coll_ops.write}"
+ assert total_mpiio_write_operations == darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "write"), f"{total_mpiio_write_operations} != {darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, 'write')}"
+ if detected_files.empty:
+ assert detected_files.empty, f"{detected_files} != {darshan_file_obj.mpi_write_df}"
+ assert darshan_file_obj.mpi_write_df.empty, f"{darshan_file_obj.mpi_write_df} != {detected_files}"
+ else:
+ assert detected_files.equals(darshan_file_obj.mpi_write_df), f"{detected_files} != {darshan_file_obj.mpi_write_df}"
+ assert file_map == darshan_file_obj.file_map, f"{file_map} != {darshan_file_obj.file_map}"
+ if dxt_mpiio is None:
+ assert dxt_mpiio is None, f"{dxt_mpiio} != {darshan_file_obj.dxt_mpi_df}"
+ assert darshan_file_obj.dxt_mpi_df is None, f"{darshan_file_obj.dxt_mpi_df} != {dxt_mpiio}"
+ else:
+ assert dxt_mpiio.equals(darshan_file_obj.dxt_mpi_df), f"{dxt_mpiio} != {darshan_file_obj.dxt_mpi_df}"
+
+ # module.check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, total_mpiio_write_operations, detected_files, file_map, dxt_mpiio)
+ module.check_mpi_collective_write_operation(
+ mpiio_coll_writes=darshan_file_obj.mpi_coll_ops.write,
+ mpiio_indep_writes=darshan_file_obj.mpi_indep_ops.write,
+ total_mpiio_write_operations=darshan_file_obj.io_stats.get_module_ops(ModuleType.MPIIO, "write"),
+ detected_files=darshan_file_obj.mpi_write_df,
+ file_map=darshan_file_obj.file_map,
+ dxt_mpiio=darshan_file_obj.dxt_mpi_df,
+ )
#########################################################################################################################################################################
@@ -632,7 +855,18 @@ def handler():
mpiio_nb_reads = df_mpiio['counters']['MPIIO_NB_READS'].sum()
mpiio_nb_writes = df_mpiio['counters']['MPIIO_NB_WRITES'].sum()
- check_mpi_none_block_operation(mpiio_nb_reads, mpiio_nb_writes, has_hdf5_extension, modules)
+ assert mpiio_nb_reads == darshan_file_obj.mpiio_nb_ops.read
+ assert mpiio_nb_writes == darshan_file_obj.mpiio_nb_ops.write
+ assert modules.keys() == darshan_file_obj.modules, f"{modules.keys()} != {darshan_file_obj.modules}"
+ assert has_hdf5_extension == darshan_file_obj.has_hdf5_extension, f"{has_hdf5_extension} != {darshan_file_obj.has_hdf5_extension}"
+
+ # module.check_mpi_none_block_operation(mpiio_nb_reads, mpiio_nb_writes, has_hdf5_extension, modules)
+ module.check_mpi_none_block_operation(
+ mpiio_nb_reads=darshan_file_obj.mpiio_nb_ops.read,
+ mpiio_nb_writes=darshan_file_obj.mpiio_nb_ops.write,
+ has_hdf5_extension=darshan_file_obj.has_hdf5_extension,
+ modules=darshan_file_obj.modules,
+ )
#########################################################################################################################################################################
@@ -680,8 +914,14 @@ def handler():
if 'NNodes' in first:
NUMBER_OF_COMPUTE_NODES = first['NNodes']
+ assert cb_nodes == darshan_file_obj.cb_nodes, f"{cb_nodes} != {darshan_file_obj.cb_nodes}"
+ assert NUMBER_OF_COMPUTE_NODES == darshan_file_obj.number_of_compute_nodes, f"{NUMBER_OF_COMPUTE_NODES} != {darshan_file_obj.number_of_compute_nodes}"
# Do we have one MPI-IO aggregator per node?
- check_mpi_aggregator(cb_nodes, NUMBER_OF_COMPUTE_NODES)
+ # module.check_mpi_aggregator(cb_nodes, NUMBER_OF_COMPUTE_NODES)
+ module.check_mpi_aggregator(
+ cb_nodes=darshan_file_obj.cb_nodes,
+ NUMBER_OF_COMPUTE_NODES=darshan_file_obj.number_of_compute_nodes
+ )
except StopIteration:
pass
except FileNotFoundError:
@@ -711,7 +951,7 @@ def handler():
job['exe'].split()[0]
),
' [b]DARSHAN[/b]: [white]{}[/white]'.format(
- os.path.basename(args.log_path)
+ os.path.basename(darshan_log_path)
),
' [b]EXECUTION TIME[/b]: [white]{} to {} ({:.2f} hours)[/white]'.format(
job_start,
@@ -748,14 +988,14 @@ def handler():
console.print()
- display_content(console)
- display_thresholds(console)
- display_footer(console, insights_start_time, insights_end_time)
+ module.display_content(console)
+ module.display_thresholds(console)
+ module.display_footer(console, insights_start_time, insights_end_time)
# Export to HTML, SVG, and CSV
- trace_name = os.path.basename(args.log_path).replace('.darshan', '')
- out_dir = args.export_dir if args.export_dir != "" else os.getcwd()
+ trace_name = os.path.basename(darshan_log_path).replace('.darshan', '')
+ out_dir = parser.args.export_dir if parser.args.export_dir != "" else os.getcwd()
- export_html(console, out_dir, trace_name)
- export_svg(console, out_dir, trace_name)
- export_csv(out_dir, trace_name, job['job']['jobid'])
+ module.export_html(console, out_dir, trace_name)
+ module.export_svg(console, out_dir, trace_name)
+ module.export_csv(out_dir, trace_name, job['job']['jobid'])
diff --git a/drishti/includes/module.py b/drishti/includes/module.py
index 9c2df16..52fac10 100644
--- a/drishti/includes/module.py
+++ b/drishti/includes/module.py
@@ -1,137 +1,211 @@
#!/usr/bin/env python3
-import datetime
import csv
+import datetime
+import os
import time
+import typing
+
import pandas as pd
from rich import box
+from rich.console import Group
+from rich.padding import Padding
+from rich.panel import Panel
from rich.syntax import Syntax
-from drishti.includes.config import *
-'''
+from drishti.includes.config import *
+from drishti.includes.config import (
+ HIGH,
+ INFO,
+ OK,
+ ROOT,
+ TARGET_DEVELOPER,
+ TARGET_USER,
+ WARN,
+ codes,
+ convert_bytes,
+ csv_report,
+ insights_dxt,
+ insights_metadata,
+ insights_operation,
+ message,
+ set_export_theme,
+ thresholds,
+)
+from drishti.includes.parser import args
+
+"""
Before calling the functions below
Make sure the variables passed are in the given structure:
file_map: a dict of (id, path) pair
modules: a set or a dict should be ok
detected_files: A pandas dataframe
-'''
+"""
# Basic usage check
+
def check_stdio(total_size, total_size_stdio):
- '''
+ """
Check whether the application has excessively utilized standard input/output operations
Parameters:
total_size: total I/O size
total_size_stdio: total STDIO size
-
- '''
-
- if total_size and total_size_stdio / total_size > thresholds['interface_stdio'][0]:
- thresholds['interface_stdio'][1] = True
- issue = 'Application is using STDIO, a low-performance interface, for {:.2f}% of its data transfers ({})'.format(
- total_size_stdio / total_size * 100.0,
- convert_bytes(total_size_stdio)
+
+ """
+
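+ # Illustrative trigger (assumed threshold value, not necessarily the shipped default):
+ # with thresholds["interface_stdio"][0] == 0.1, an application that moved 1000 MB in
+ # total and pushed 200 MB of it (20%) through STDIO would raise this insight.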
+ if total_size and total_size_stdio / total_size > thresholds["interface_stdio"][0]:
+ thresholds["interface_stdio"][1] = True
+ issue = "Application is using STDIO, a low-performance interface, for {:.2f}% of its data transfers ({})".format(
+ total_size_stdio / total_size * 100.0, convert_bytes(total_size_stdio)
)
recommendation = [
{
- 'message': 'Consider switching to a high-performance I/O interface such as MPI-IO'
+ "message": "Consider switching to a high-performance I/O interface such as MPI-IO"
}
]
insights_operation.append(
- message(INSIGHTS_STDIO_HIGH_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation)
+ message(
+ INSIGHTS_STDIO_HIGH_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation
+ )
)
-def check_mpiio(modules):
- '''
+def check_mpiio(modules: typing.Iterable[str]):
+ """
Check whether the application has used MPI-IO or not
Parameter:
modules: all different mudules been used in the application
- '''
+ """
- if 'MPI-IO' not in modules:
- issue = 'Application is using low-performance interface'
+ if "MPI-IO" not in modules:
+ issue = "Application is using low-performance interface"
recommendation = [
{
- 'message': 'Consider switching to a high-performance I/O interface such as MPI-IO'
+ "message": "Consider switching to a high-performance I/O interface such as MPI-IO"
}
]
insights_operation.append(
- message(INSIGHTS_MPI_IO_NO_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation)
+ message(
+ INSIGHTS_MPI_IO_NO_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation
+ )
)
-
# POSIX level check
def check_operation_intensive(total_operations, total_reads, total_writes):
- '''
+ """
Check whether the application is read or write intensive
Parameters:
total_operations: number of I/O operations been executed by the application
total_reads: number of read operations been executed by the application
total_writes: number of write operations been executed by the application
- '''
-
- if total_writes > total_reads and total_operations and abs(total_writes - total_reads) / total_operations > thresholds['imbalance_operations'][0]:
- issue = 'Application is write operation intensive ({:.2f}% writes vs. {:.2f}% reads)'.format(
- total_writes / total_operations * 100.0, total_reads / total_operations * 100.0
+ """
+
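+ # Illustrative trigger (assumed threshold value): with
+ # thresholds["imbalance_operations"][0] == 0.1, an application issuing 800 writes and
+ # 200 reads out of 1000 operations gives |800 - 200| / 1000 = 0.6 > 0.1 and is flagged
+ # as write operation intensive; the symmetric branch below covers the read-heavy case.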
+ if (
+ total_writes > total_reads
+ and total_operations
+ and abs(total_writes - total_reads) / total_operations
+ > thresholds["imbalance_operations"][0]
+ ):
+ issue = "Application is write operation intensive ({:.2f}% writes vs. {:.2f}% reads)".format(
+ total_writes / total_operations * 100.0,
+ total_reads / total_operations * 100.0,
)
insights_metadata.append(
- message(INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None)
+ message(
+ INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE,
+ TARGET_DEVELOPER,
+ INFO,
+ issue,
+ None,
+ )
)
- if total_reads > total_writes and total_operations and abs(total_writes - total_reads) / total_operations > thresholds['imbalance_operations'][0]:
- issue = 'Application is read operation intensive ({:.2f}% writes vs. {:.2f}% reads)'.format(
- total_writes / total_operations * 100.0, total_reads / total_operations * 100.0
+ if (
+ total_reads > total_writes
+ and total_operations
+ and abs(total_writes - total_reads) / total_operations
+ > thresholds["imbalance_operations"][0]
+ ):
+ issue = "Application is read operation intensive ({:.2f}% writes vs. {:.2f}% reads)".format(
+ total_writes / total_operations * 100.0,
+ total_reads / total_operations * 100.0,
)
insights_metadata.append(
- message(INSIGHTS_POSIX_READ_COUNT_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None)
+ message(
+ INSIGHTS_POSIX_READ_COUNT_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None
+ )
)
def check_size_intensive(total_size, total_read_size, total_written_size):
- '''
+ """
Check whether the application is read size intensive or written size intensive
Parameters:
total_size: Total I/O size measured in byte
total_read_size: Input I/O size measured in byte
total_written_size: Output I/O size measured in byte
- '''
-
- if total_written_size > total_read_size and abs(total_written_size - total_read_size) / total_size > thresholds['imbalance_operations'][0]:
- issue = 'Application is write size intensive ({:.2f}% write vs. {:.2f}% read)'.format(
- total_written_size / total_size * 100.0, total_read_size / total_size * 100.0
+ """
+
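+ # Same idea as check_operation_intensive but weighted by bytes (illustrative, assumed
+ # threshold of 0.1): 900 MB written vs. 100 MB read out of 1000 MB total gives
+ # |900 - 100| / 1000 = 0.8 > 0.1, so the run is flagged as write size intensive.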
+ if (
+ total_written_size > total_read_size
+ and abs(total_written_size - total_read_size) / total_size
+ > thresholds["imbalance_operations"][0]
+ ):
+ issue = "Application is write size intensive ({:.2f}% write vs. {:.2f}% read)".format(
+ total_written_size / total_size * 100.0,
+ total_read_size / total_size * 100.0,
)
insights_metadata.append(
- message(INSIGHTS_POSIX_WRITE_SIZE_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None)
+ message(
+ INSIGHTS_POSIX_WRITE_SIZE_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None
+ )
)
- if total_read_size > total_written_size and abs(total_written_size - total_read_size) / total_size > thresholds['imbalance_operations'][0]:
- issue = 'Application is read size intensive ({:.2f}% write vs. {:.2f}% read)'.format(
- total_written_size / total_size * 100.0, total_read_size / total_size * 100.0
+ if (
+ total_read_size > total_written_size
+ and abs(total_written_size - total_read_size) / total_size
+ > thresholds["imbalance_operations"][0]
+ ):
+ issue = "Application is read size intensive ({:.2f}% write vs. {:.2f}% read)".format(
+ total_written_size / total_size * 100.0,
+ total_read_size / total_size * 100.0,
)
insights_metadata.append(
- message(INSIGHTS_POSIX_READ_SIZE_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None)
+ message(
+ INSIGHTS_POSIX_READ_SIZE_INTENSIVE, TARGET_DEVELOPER, INFO, issue, None
+ )
)
-def check_small_operation(total_reads, total_reads_small, total_writes, total_writes_small, detected_files, modules, file_map, dxt_posix=None, dxt_posix_read_data=None, dxt_posix_write_data=None):
- '''
+def check_small_operation(
+ total_reads,
+ total_reads_small,
+ total_writes,
+ total_writes_small,
+ detected_files,
+ modules,
+ file_map,
+ dxt_posix=None,
+ dxt_posix_read_data=None,
+ dxt_posix_write_data=None,
+):
+ """
Check whether application has performed an excessive number of small operations
Parameters:
@@ -139,17 +213,21 @@ def check_small_operation(total_reads, total_reads_small, total_writes, total_wr
total_reads_small: number of read operations that has small size
total_writes: number of write operations been executed by the application
total_writes_small: number of write operations that has small size
- detected_files:
+ detected_files:
total_reads and total_writes in each file
required columns: ['id', 'total_reads', 'total_writes']
modules: all different mudules been used in the application
file_map: file id and file name pairing
df_posix: all POSIX records
- '''
-
- if total_reads_small and total_reads_small / total_reads > thresholds['small_requests'][0] and total_reads_small > thresholds['small_requests_absolute'][0]:
- thresholds['small_requests_absolute'][1] = True
- issue = 'Application issues a high number ({}) of small read requests (i.e., < 1MB) which represents {:.2f}% of all read requests'.format(
+ """
+
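+ # Illustrative trigger (assumed threshold values): with thresholds["small_requests"][0]
+ # == 0.1 and thresholds["small_requests_absolute"][0] == 1000, issuing 5,000 small
+ # (< 1 MB) reads out of 20,000 total reads (25%) satisfies both conditions; the detail
+ # then lists files whose count exceeds total_reads * small_requests / 2, and when
+ # args.backtrace is set the DXT stack addresses are resolved into function/line pairs.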
+ if (
+ total_reads_small
+ and total_reads_small / total_reads > thresholds["small_requests"][0]
+ and total_reads_small > thresholds["small_requests_absolute"][0]
+ ):
+ thresholds["small_requests_absolute"][1] = True
+ issue = "Application issues a high number ({}) of small read requests (i.e., < 1MB) which represents {:.2f}% of all read requests".format(
total_reads_small, total_reads_small / total_reads * 100.0
)
@@ -159,63 +237,93 @@ def check_small_operation(total_reads, total_reads_small, total_writes, total_wr
dxt_trigger_time = 0
for index, row in detected_files.iterrows():
- if row['total_reads'] > (total_reads * thresholds['small_requests'][0] / 2):
+ if row["total_reads"] > (total_reads * thresholds["small_requests"][0] / 2):
detail.append(
{
- 'message': '{} ({:.2f}%) small read requests are to "{}"'.format(
- row['total_reads'],
- row['total_reads'] / total_reads * 100.0,
- file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])])
- )
+ "message": '{} ({:.2f}%) small read requests are to "{}"'.format(
+ row["total_reads"],
+ row["total_reads"] / total_reads * 100.0,
+ file_map[int(row["id"])]
+ if args.full_path
+ else os.path.basename(file_map[int(row["id"])]),
+ )
}
)
# DXT Analysis
if args.backtrace:
start = time.time()
- if file_count < thresholds['backtrace'][0]:
- temp = dxt_posix.loc[dxt_posix['id'] == int(row['id'])]
- temp_df = dxt_posix_read_data.loc[dxt_posix_read_data['id'] == int(row['id'])]
-
- if not temp_df.empty:
- temp_df = temp_df.loc[temp_df['length'] < thresholds['small_requests'][0]]
- small_read_requests_ranks = temp_df['rank'].unique()
- if len(small_read_requests_ranks) > 0:
- if len(small_read_requests_ranks) > 1 and int(small_read_requests_ranks[0]) == 0:
- rank_df = temp.loc[(temp['rank'] == int(small_read_requests_ranks[1]))]
+ if file_count < thresholds["backtrace"][0]:
+ temp = dxt_posix.loc[dxt_posix["id"] == int(row["id"])]
+ temp_df = dxt_posix_read_data.loc[
+ dxt_posix_read_data["id"] == int(row["id"])
+ ]
+
+ if not temp_df.empty:
+ temp_df = temp_df.loc[
+ temp_df["length"] < thresholds["small_requests"][0]
+ ]
+ small_read_requests_ranks = temp_df["rank"].unique()
+ if len(small_read_requests_ranks) > 0:
+ if (
+ len(small_read_requests_ranks) > 1
+ and int(small_read_requests_ranks[0]) == 0
+ ):
+ rank_df = temp.loc[
+ (
+ temp["rank"]
+ == int(small_read_requests_ranks[1])
+ )
+ ]
else:
- rank_df = temp.loc[(temp['rank'] == int(small_read_requests_ranks[0]))]
-
- rank_df = rank_df['read_segments'].iloc[0]
- rank_addresses = rank_df['stack_memory_addresses'].iloc[0]
- address = dxt_posix.iloc[0]['address_line_mapping']['address']
+ rank_df = temp.loc[
+ (
+ temp["rank"]
+ == int(small_read_requests_ranks[0])
+ )
+ ]
+
+ rank_df = rank_df["read_segments"].iloc[0]
+ rank_addresses = rank_df["stack_memory_addresses"].iloc[
+ 0
+ ]
+ address = dxt_posix.iloc[0]["address_line_mapping"][
+ "address"
+ ]
res = set(list(address)) & set(rank_addresses)
- backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)]
-
+ backtrace = dxt_posix.iloc[0][
+ "address_line_mapping"
+ ].loc[
+ dxt_posix.iloc[0]["address_line_mapping"][
+ "address"
+ ].isin(res)
+ ]
+
if len(small_read_requests_ranks) > 0:
detail.append(
{
- 'message': '{} rank(s) made small read requests in "{}". Below is the backtrace information:'.format(
+ "message": '{} rank(s) made small read requests in "{}". Below is the backtrace information:'.format(
len(small_read_requests_ranks),
- file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])])
- )
+ file_map[int(row["id"])]
+ if args.full_path
+ else os.path.basename(file_map[int(row["id"])]),
+ )
}
)
-
+
for index, row in backtrace.iterrows():
detail.append(
{
- 'message': '{}: {}'.format(
- row['function_name'],
- row['line_number']
- )
+ "message": "{}: {}".format(
+ row["function_name"], row["line_number"]
+ )
}
)
file_count += 1
else:
detail.append(
{
- 'message': 'The backtrace information for this file is similar to the previous files'
+ "message": "The backtrace information for this file is similar to the previous files"
}
)
@@ -223,40 +331,57 @@ def check_small_operation(total_reads, total_reads_small, total_writes, total_wr
time_taken = end - start
dxt_trigger_time += time_taken
- if dxt_trigger_time > 0:
+ if dxt_trigger_time > 0:
detail.append(
{
- 'message': 'Time taken to process this trigger: {}s'.format(round(dxt_trigger_time, 5))
+ "message": "Time taken to process this trigger: {}s".format(
+ round(dxt_trigger_time, 5)
+ )
}
)
recommendation.append(
{
- 'message': 'Consider buffering read operations into larger more contiguous ones'
+ "message": "Consider buffering read operations into larger more contiguous ones"
}
)
- if 'MPI-IO' in modules:
+ if "MPI-IO" in modules:
recommendation.append(
{
- 'message': 'Since the appplication already uses MPI-IO, consider using collective I/O calls (e.g. MPI_File_read_all() or MPI_File_read_at_all()) to aggregate requests into larger ones',
- 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-read.c'), line_numbers=True, background_color='default')
+ "message": "Since the appplication already uses MPI-IO, consider using collective I/O calls (e.g. MPI_File_read_all() or MPI_File_read_at_all()) to aggregate requests into larger ones",
+ "sample": Syntax.from_path(
+ os.path.join(ROOT, "snippets/mpi-io-collective-read.c"),
+ line_numbers=True,
+ background_color="default",
+ ),
}
)
else:
recommendation.append(
{
- 'message': 'Application does not use MPI-IO for operations, consider use this interface instead to harness collective operations'
+ "message": "Application does not use MPI-IO for operations, consider use this interface instead to harness collective operations"
}
)
insights_operation.append(
- message(INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail)
+ message(
+ INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_USAGE,
+ TARGET_DEVELOPER,
+ HIGH,
+ issue,
+ recommendation,
+ detail,
+ )
)
- if total_writes_small and total_writes_small / total_writes > thresholds['small_requests'][0] and total_writes_small > thresholds['small_requests_absolute'][0]:
- thresholds['small_requests_absolute'][1] = True
- issue = 'Application issues a high number ({}) of small write requests (i.e., < 1MB) which represents {:.2f}% of all write requests'.format(
+ if (
+ total_writes_small
+ and total_writes_small / total_writes > thresholds["small_requests"][0]
+ and total_writes_small > thresholds["small_requests_absolute"][0]
+ ):
+ thresholds["small_requests_absolute"][1] = True
+ issue = "Application issues a high number ({}) of small write requests (i.e., < 1MB) which represents {:.2f}% of all write requests".format(
total_writes_small, total_writes_small / total_writes * 100.0
)
@@ -266,106 +391,162 @@ def check_small_operation(total_reads, total_reads_small, total_writes, total_wr
recommendation = []
file_count = 0
for index, row in detected_files.iterrows():
- if row['total_writes'] > (total_writes * thresholds['small_requests'][0] / 2):
+ if row["total_writes"] > (
+ total_writes * thresholds["small_requests"][0] / 2
+ ):
detail.append(
{
- 'message': '{} ({:.2f}%) small write requests are to "{}"'.format(
- row['total_writes'],
- row['total_writes'] / total_writes * 100.0,
- file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])])
- )
+ "message": '{} ({:.2f}%) small write requests are to "{}"'.format(
+ row["total_writes"],
+ row["total_writes"] / total_writes * 100.0,
+ file_map[int(row["id"])]
+ if args.full_path
+ else os.path.basename(file_map[int(row["id"])]),
+ )
}
)
# DXT Analysis
if args.backtrace:
start = time.time()
- if file_count < thresholds['backtrace'][0]:
- temp = dxt_posix.loc[dxt_posix['id'] == int(row['id'])]
- temp_df = dxt_posix_write_data.loc[dxt_posix_write_data['id'] == int(row['id'])]
-
- if not temp_df.empty:
- temp_df = temp_df.loc[temp_df['length'] < thresholds['small_requests'][0]]
- small_write_requests_ranks = temp_df['rank'].unique()
+ if file_count < thresholds["backtrace"][0]:
+ temp = dxt_posix.loc[dxt_posix["id"] == int(row["id"])]
+ temp_df = dxt_posix_write_data.loc[
+ dxt_posix_write_data["id"] == int(row["id"])
+ ]
+
+ if not temp_df.empty:
+ temp_df = temp_df.loc[
+ temp_df["length"] < thresholds["small_requests"][0]
+ ]
+ small_write_requests_ranks = temp_df["rank"].unique()
if len(small_write_requests_ranks) > 0:
- if int(small_write_requests_ranks[0]) == 0 and len(small_write_requests_ranks) > 1:
- rank_df = temp.loc[(temp['rank'] == int(small_write_requests_ranks[1]))]
+ if (
+ int(small_write_requests_ranks[0]) == 0
+ and len(small_write_requests_ranks) > 1
+ ):
+ rank_df = temp.loc[
+ (
+ temp["rank"]
+ == int(small_write_requests_ranks[1])
+ )
+ ]
else:
- rank_df = temp.loc[(temp['rank'] == int(small_write_requests_ranks[0]))]
-
- rank_df = temp.loc[(temp['rank'] == int(small_write_requests_ranks[0]))]
- rank_df = rank_df['write_segments'].iloc[0]
- rank_addresses = rank_df['stack_memory_addresses'].iloc[0]
- address = dxt_posix.iloc[0]['address_line_mapping']['address']
+ rank_df = temp.loc[
+ (
+ temp["rank"]
+ == int(small_write_requests_ranks[0])
+ )
+ ]
+
+ rank_df = rank_df["write_segments"].iloc[0]
+ rank_addresses = rank_df["stack_memory_addresses"].iloc[
+ 0
+ ]
+ address = dxt_posix.iloc[0]["address_line_mapping"][
+ "address"
+ ]
res = set(list(address)) & set(rank_addresses)
- backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)]
-
+ backtrace = dxt_posix.iloc[0][
+ "address_line_mapping"
+ ].loc[
+ dxt_posix.iloc[0]["address_line_mapping"][
+ "address"
+ ].isin(res)
+ ]
+
if len(small_write_requests_ranks) > 0:
detail.append(
{
- 'message': '{} rank(s) made small write requests in "{}". Below is the backtrace information:'.format(
+ "message": '{} rank(s) made small write requests in "{}". Below is the backtrace information:'.format(
len(small_write_requests_ranks),
- file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])])
- )
+ file_map[int(row["id"])]
+ if args.full_path
+ else os.path.basename(file_map[int(row["id"])]),
+ )
}
)
-
+
for index, row in backtrace.iterrows():
detail.append(
{
- 'message': '{}: {}'.format(
- row['function_name'],
- row['line_number']
- )
+ "message": "{}: {}".format(
+ row["function_name"], row["line_number"]
+ )
}
)
-
+
file_count += 1
else:
detail.append(
{
- 'message': 'The backtrace information for this file is similar to previous files'
+ "message": "The backtrace information for this file is similar to previous files"
}
)
end = time.time()
time_taken = end - start
dxt_trigger_time += time_taken
-
+
if dxt_trigger_time > 0:
detail.append(
{
- 'message': 'Time taken to process this trigger: {}s'.format(round(dxt_trigger_time, 5))
+ "message": "Time taken to process this trigger: {}s".format(
+ round(dxt_trigger_time, 5)
+ )
}
)
recommendation.append(
{
- 'message': 'Consider buffering write operations into larger more contiguous ones'
+ "message": "Consider buffering write operations into larger more contiguous ones"
}
)
- if 'MPI-IO' in modules:
+ if "MPI-IO" in modules:
recommendation.append(
{
- 'message': 'Since the application already uses MPI-IO, consider using collective I/O calls (e.g. MPI_File_write_all() or MPI_File_write_at_all()) to aggregate requests into larger ones',
- 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-write.c'), line_numbers=True, background_color='default')
+ "message": "Since the application already uses MPI-IO, consider using collective I/O calls (e.g. MPI_File_write_all() or MPI_File_write_at_all()) to aggregate requests into larger ones",
+ "sample": Syntax.from_path(
+ os.path.join(ROOT, "snippets/mpi-io-collective-write.c"),
+ line_numbers=True,
+ background_color="default",
+ ),
}
)
else:
recommendation.append(
{
- 'message': 'Application does not use MPI-IO for operations, consider use this interface instead to harness collective operations'
+ "message": "Application does not use MPI-IO for operations, consider use this interface instead to harness collective operations"
}
)
insights_operation.append(
- message(INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail)
+ message(
+ INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_USAGE,
+ TARGET_DEVELOPER,
+ HIGH,
+ issue,
+ recommendation,
+ detail,
+ )
)
-def check_misaligned(total_operations, total_mem_not_aligned, total_file_not_aligned, modules, file_map=None, df_lustre=None, dxt_posix=None, dxt_posix_read_data=None):
- '''
+def check_misaligned(
+ total_operations,
+ total_mem_not_aligned,
+ total_file_not_aligned,
+ modules,
+ file_map=None,
+ df_lustre=None,
+ dxt_posix=None,
+ dxt_posix_read_data=None,
+):
+ """
Check whether application has excessive misaligned operations
Parameters:
@@ -373,62 +554,80 @@ def check_misaligned(total_operations, total_mem_not_aligned, total_file_not_ali
total_mem_not_aligned: number of memory requests not aligned
total_file_not_aligned: number of file requests not aligned
modules: all different mudules been used in the application
- '''
-
- if total_operations and total_mem_not_aligned / total_operations > thresholds['misaligned_requests'][0]:
- thresholds['misaligned_requests'][1] = True
- issue = 'Application has a high number ({:.2f}%) of misaligned memory requests'.format(
+ """
+
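+ # Illustrative trigger (assumed threshold value): with
+ # thresholds["misaligned_requests"][0] == 0.1, 3,000 memory-misaligned requests out of
+ # 10,000 operations (30%) raises the memory insight; the same ratio of file-misaligned
+ # requests raises the file insight, with HDF5 alignment and Lustre striping suggestions
+ # appended when those modules appear in `modules`.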
+ if (
+ total_operations
+ and total_mem_not_aligned / total_operations
+ > thresholds["misaligned_requests"][0]
+ ):
+ thresholds["misaligned_requests"][1] = True
+ issue = "Application has a high number ({:.2f}%) of misaligned memory requests".format(
total_mem_not_aligned / total_operations * 100.0
)
insights_metadata.append(
- message(INSIGHTS_POSIX_HIGH_MISALIGNED_MEMORY_USAGE, TARGET_DEVELOPER, HIGH, issue, None)
+ message(
+ INSIGHTS_POSIX_HIGH_MISALIGNED_MEMORY_USAGE,
+ TARGET_DEVELOPER,
+ HIGH,
+ issue,
+ None,
+ )
)
- if total_operations and total_file_not_aligned / total_operations > thresholds['misaligned_requests'][0]:
- thresholds['misaligned_requests'][1] = True
- issue = 'Application issues a high number ({:.2f}%) of misaligned file requests'.format(
+ if (
+ total_operations
+ and total_file_not_aligned / total_operations
+ > thresholds["misaligned_requests"][0]
+ ):
+ thresholds["misaligned_requests"][1] = True
+ issue = "Application issues a high number ({:.2f}%) of misaligned file requests".format(
total_file_not_aligned / total_operations * 100.0
)
recommendation = [
{
- 'message': 'Consider aligning the requests to the file system block boundaries'
+ "message": "Consider aligning the requests to the file system block boundaries"
}
]
- if 'HF5' in modules:
+ if "HF5" in modules:
recommendation.append(
{
- 'message': 'Since the appplication uses HDF5, consider using H5Pset_alignment() in a file access property list',
- 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-alignment.c'), line_numbers=True, background_color='default')
+ "message": "Since the appplication uses HDF5, consider using H5Pset_alignment() in a file access property list",
+ "sample": Syntax.from_path(
+ os.path.join(ROOT, "snippets/hdf5-alignment.c"),
+ line_numbers=True,
+ background_color="default",
+ ),
},
{
- 'message': 'Any file object greater than or equal in size to threshold bytes will be aligned on an address which is a multiple of alignment'
- }
+ "message": "Any file object greater than or equal in size to threshold bytes will be aligned on an address which is a multiple of alignment"
+ },
)
detail = []
- if 'LUSTRE' in modules:
+ if "LUSTRE" in modules:
# DXT Analysis
if args.backtrace:
start = time.time()
-
- if not df_lustre['counters']['LUSTRE_STRIPE_SIZE'].empty:
- stripe_size = df_lustre['counters']['LUSTRE_STRIPE_SIZE'].iloc[0]
+
+ if not df_lustre["counters"]["LUSTRE_STRIPE_SIZE"].empty:
+ stripe_size = df_lustre["counters"]["LUSTRE_STRIPE_SIZE"].iloc[0]
else:
- stripe_size = df_lustre['counters']['POSIX_FILE_ALIGNMENT'].iloc[0]
+ stripe_size = df_lustre["counters"]["POSIX_FILE_ALIGNMENT"].iloc[0]
file_count = 0
ids = dxt_posix.id.unique().tolist()
for id in ids:
- temp = dxt_posix.loc[dxt_posix['id'] == id]
- temp_df = dxt_posix_read_data.loc[dxt_posix_read_data['id'] == id]
+ temp = dxt_posix.loc[dxt_posix["id"] == id]
+ temp_df = dxt_posix_read_data.loc[dxt_posix_read_data["id"] == id]
misaligned_ranks = []
misaligned_ranks_opr = []
-
+
offsets = temp_df["offsets"].to_numpy().tolist()
rank = temp_df["rank"].to_numpy().tolist()
operation = temp_df["operation"].to_numpy().tolist()
@@ -441,33 +640,46 @@ def check_misaligned(total_operations, total_mem_not_aligned, total_file_not_ali
if misaligned_ranks:
misaligned_rank_ind = misaligned_ranks[0]
misaligned_rank_opr = misaligned_ranks_opr[0]
- misaligned_rank_df = temp.loc[(temp['rank'] == int(misaligned_rank_ind))]
- if misaligned_rank_opr == 'read':
- misaligned_rank_df = misaligned_rank_df['read_segments'].iloc[0]
+ misaligned_rank_df = temp.loc[
+ (temp["rank"] == int(misaligned_rank_ind))
+ ]
+ if misaligned_rank_opr == "read":
+ misaligned_rank_df = misaligned_rank_df[
+ "read_segments"
+ ].iloc[0]
else:
- misaligned_rank_df = misaligned_rank_df['write_segments'].iloc[0]
- misaligned_rank_stack_addresses = misaligned_rank_df['stack_memory_addresses'].iloc[0]
-
- address = dxt_posix.iloc[0]['address_line_mapping']['address']
+ misaligned_rank_df = misaligned_rank_df[
+ "write_segments"
+ ].iloc[0]
+ misaligned_rank_stack_addresses = misaligned_rank_df[
+ "stack_memory_addresses"
+ ].iloc[0]
+
+ address = dxt_posix.iloc[0]["address_line_mapping"]["address"]
res = set(list(address)) & set(misaligned_rank_stack_addresses)
- backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)]
+ backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[
+ dxt_posix.iloc[0]["address_line_mapping"]["address"].isin(
+ res
+ )
+ ]
detail.append(
{
- 'message': '{} rank(s) made misaligned requests in "{}". Below is the backtrace information:'.format(
+ "message": '{} rank(s) made misaligned requests in "{}". Below is the backtrace information:'.format(
len(misaligned_ranks),
- file_map[id] if args.full_path else os.path.basename(file_map[id])
- )
+ file_map[id]
+ if args.full_path
+ else os.path.basename(file_map[id]),
+ )
}
)
for index, row3 in backtrace.iterrows():
detail.append(
{
- 'message': '{}: {}'.format(
- row3['function_name'],
- row3['line_number']
- )
+ "message": "{}: {}".format(
+ row3["function_name"], row3["line_number"]
+ )
}
)
file_count += 1
@@ -476,23 +688,43 @@ def check_misaligned(total_operations, total_mem_not_aligned, total_file_not_ali
time_taken = end - start
detail.append(
{
- 'message': 'Time taken to process this trigger: {}s'.format(round(time_taken, 5))
+ "message": "Time taken to process this trigger: {}s".format(
+ round(time_taken, 5)
+ )
}
)
recommendation.append(
{
- 'message': 'Consider using a Lustre alignment that matches the file system stripe configuration',
- 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default')
+ "message": "Consider using a Lustre alignment that matches the file system stripe configuration",
+ "sample": Syntax.from_path(
+ os.path.join(ROOT, "snippets/lustre-striping.bash"),
+ line_numbers=True,
+ background_color="default",
+ ),
}
)
insights_metadata.append(
- message(INSIGHTS_POSIX_HIGH_MISALIGNED_FILE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation)
+ message(
+ INSIGHTS_POSIX_HIGH_MISALIGNED_FILE_USAGE,
+ TARGET_DEVELOPER,
+ HIGH,
+ issue,
+ recommendation,
+ )
)
-def check_traffic(max_read_offset, total_read_size, max_write_offset, total_written_size, dxt_posix=None, dxt_posix_read_data=None, dxt_posix_write_data=None):
- '''
+def check_traffic(
+ max_read_offset,
+ total_read_size,
+ max_write_offset,
+ total_written_size,
+ dxt_posix=None,
+ dxt_posix_read_data=None,
+ dxt_posix_write_data=None,
+):
+ """
Check whether application has redundant read or write traffic
Parameters:
@@ -500,10 +732,10 @@ def check_traffic(max_read_offset, total_read_size, max_write_offset, total_writ
total_read_size: total size application has been read
max_write_offset: max offset application is writing to
total_written_size: total size application has been written
- '''
+ """
if max_read_offset > total_read_size:
- issue = 'Application might have redundant read traffic (more data read than the highest offset)'
+ issue = "Application might have redundant read traffic (more data read than the highest offset)"
detail = []
file_count = 0
@@ -513,67 +745,79 @@ def check_traffic(max_read_offset, total_read_size, max_write_offset, total_writ
start = time.time()
ids = dxt_posix.id.unique().tolist()
for id in ids:
- if file_count < thresholds['backtrace'][0]:
- temp = dxt_posix.loc[dxt_posix['id'] == id]
+ if file_count < thresholds["backtrace"][0]:
+ temp = dxt_posix.loc[dxt_posix["id"] == id]
random_ranks_ind = -1
- temp_df = dxt_posix_read_data.loc[dxt_posix_read_data['id'] == id]
+ temp_df = dxt_posix_read_data.loc[dxt_posix_read_data["id"] == id]
updated_offsets = (temp_df["offsets"].to_numpy()).tolist()
for i in range(len(updated_offsets)):
- if updated_offsets.count(updated_offsets[i]) > 1:
+ if updated_offsets.count(updated_offsets[i]) > 1:
redundant_ranks_ind = i
break
if random_ranks_ind != -1:
- random_rank = temp_df.iloc[redundant_ranks_ind]['rank']
- random_offsets = temp_df.iloc[redundant_ranks_ind]['offsets']
- random_start_time = temp_df.iloc[random_ranks_ind]['start_time']
-
- temp_random_rank = temp.loc[(temp['rank'] == int(random_rank))]
- temp_random_rank = temp_random_rank['read_segments'].iloc[0]
- random_stack_addresses = temp_random_rank.loc[(temp_random_rank['offset'] == random_offsets) & (temp_random_rank['start_time'] == random_start_time)]
- random_stack_addresses = random_stack_addresses['stack_memory_addresses'].iloc[0]
-
- address = dxt_posix.iloc[0]['address_line_mapping']['address']
+ random_rank = temp_df.iloc[redundant_ranks_ind]["rank"]
+ random_offsets = temp_df.iloc[redundant_ranks_ind]["offsets"]
+ random_start_time = temp_df.iloc[random_ranks_ind]["start_time"]
+
+ temp_random_rank = temp.loc[(temp["rank"] == int(random_rank))]
+ temp_random_rank = temp_random_rank["read_segments"].iloc[0]
+ random_stack_addresses = temp_random_rank.loc[
+ (temp_random_rank["offset"] == random_offsets)
+ & (temp_random_rank["start_time"] == random_start_time)
+ ]
+ random_stack_addresses = random_stack_addresses[
+ "stack_memory_addresses"
+ ].iloc[0]
+
+ address = dxt_posix.iloc[0]["address_line_mapping"]["address"]
res = set(list(address)) & set(random_stack_addresses)
- backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)]
-
+ backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[
+ dxt_posix.iloc[0]["address_line_mapping"]["address"].isin(
+ res
+ )
+ ]
+
detail.append(
{
- 'message': 'The backtrace information for these redundant read call(s) is given below:'
+ "message": "The backtrace information for these redundant read call(s) is given below:"
}
)
for index, row3 in backtrace.iterrows():
detail.append(
{
- 'message': '{}: {}'.format(
- row3['function_name'],
- row3['line_number']
- )
+ "message": "{}: {}".format(
+ row3["function_name"], row3["line_number"]
+ )
}
)
file_count += 1
else:
detail.append(
{
- 'message': 'The backtrace information for this file is similar to the previous files'
+ "message": "The backtrace information for this file is similar to the previous files"
}
)
end = time.time()
time_taken = end - start
detail.append(
{
- 'message': 'Time taken to process this trigger: {}s'.format(round(time_taken, 5))
+ "message": "Time taken to process this trigger: {}s".format(
+ round(time_taken, 5)
+ )
}
)
insights_metadata.append(
- message(INSIGHTS_POSIX_REDUNDANT_READ_USAGE, TARGET_DEVELOPER, WARN, issue, None)
+ message(
+ INSIGHTS_POSIX_REDUNDANT_READ_USAGE, TARGET_DEVELOPER, WARN, issue, None
+ )
)
if max_write_offset > total_written_size:
- issue = 'Application might have redundant write traffic (more data written than the highest offset)'
+ issue = "Application might have redundant write traffic (more data written than the highest offset)"
detail = []
file_count = 0
@@ -583,70 +827,105 @@ def check_traffic(max_read_offset, total_read_size, max_write_offset, total_writ
start = time.time()
ids = dxt_posix.id.unique().tolist()
for id in ids:
- if file_count < thresholds['backtrace'][0]:
- temp = dxt_posix.loc[dxt_posix['id'] == id]
+ if file_count < thresholds["backtrace"][0]:
+ temp = dxt_posix.loc[dxt_posix["id"] == id]
random_ranks_ind = -1
- temp_df = dxt_posix_write_data.loc[dxt_posix_write_data['id'] == id]
+ temp_df = dxt_posix_write_data.loc[dxt_posix_write_data["id"] == id]
updated_offsets = (temp_df["offsets"].to_numpy()).tolist()
for i in range(len(updated_offsets)):
- if updated_offsets.count(updated_offsets[i]) > 1:
+ if updated_offsets.count(updated_offsets[i]) > 1:
redundant_ranks_ind = i
break
if random_ranks_ind != -1:
- random_rank = temp_df.iloc[redundant_ranks_ind]['rank']
- random_offsets = temp_df.iloc[redundant_ranks_ind]['offsets']
- random_start_time = temp_df.iloc[random_ranks_ind]['start_time']
-
- temp_random_rank = temp.loc[(temp['rank'] == int(random_rank))]
- temp_random_rank = temp_random_rank['write_segments'].iloc[0]
- random_stack_addresses = temp_random_rank.loc[(temp_random_rank['offset'] == random_offsets) & (temp_random_rank['start_time'] == random_start_time)]
- random_stack_addresses = random_stack_addresses['stack_memory_addresses'].iloc[0]
-
- address = dxt_posix.iloc[0]['address_line_mapping']['address']
+ random_rank = temp_df.iloc[redundant_ranks_ind]["rank"]
+ random_offsets = temp_df.iloc[redundant_ranks_ind]["offsets"]
+ random_start_time = temp_df.iloc[random_ranks_ind]["start_time"]
+
+ temp_random_rank = temp.loc[(temp["rank"] == int(random_rank))]
+ temp_random_rank = temp_random_rank["write_segments"].iloc[0]
+ random_stack_addresses = temp_random_rank.loc[
+ (temp_random_rank["offset"] == random_offsets)
+ & (temp_random_rank["start_time"] == random_start_time)
+ ]
+ random_stack_addresses = random_stack_addresses[
+ "stack_memory_addresses"
+ ].iloc[0]
+
+ address = dxt_posix.iloc[0]["address_line_mapping"]["address"]
res = set(list(address)) & set(random_stack_addresses)
- backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)]
-
+ backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[
+ dxt_posix.iloc[0]["address_line_mapping"]["address"].isin(
+ res
+ )
+ ]
+
detail.append(
{
- 'message': 'The backtrace information for these redundant write call(s) is given below:'
+ "message": "The backtrace information for these redundant write call(s) is given below:"
}
)
for index, row3 in backtrace.iterrows():
detail.append(
{
- 'message': '{}: {}'.format(
- row3['function_name'],
- row3['line_number']
- )
+ "message": "{}: {}".format(
+ row3["function_name"], row3["line_number"]
+ )
}
)
file_count += 1
else:
detail.append(
{
- 'message': 'The backtrace information for this file is similar to the previous files'
+ "message": "The backtrace information for this file is similar to the previous files"
}
)
end = time.time()
time_taken = end - start
detail.append(
{
- 'message': 'Time taken to process this trigger: {}s'.format(round(time_taken, 5))
+ "message": "Time taken to process this trigger: {}s".format(
+ round(time_taken, 5)
+ )
}
)
insights_metadata.append(
- message(INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE, TARGET_DEVELOPER, WARN, issue, None, detail)
+ message(
+ INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE,
+ TARGET_DEVELOPER,
+ WARN,
+ issue,
+ None,
+ detail,
+ )
)
insights_metadata.append(
- message(INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE, TARGET_DEVELOPER, WARN, issue, None)
+ message(
+ INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE,
+ TARGET_DEVELOPER,
+ WARN,
+ issue,
+ None,
+ )
)
-def check_random_operation(read_consecutive, read_sequential, read_random, total_reads, write_consecutive, write_sequential, write_random, total_writes, dxt_posix=None, dxt_posix_read_data=None, dxt_posix_write_data=None):
- '''
+def check_random_operation(
+ read_consecutive,
+ read_sequential,
+ read_random,
+ total_reads,
+ write_consecutive,
+ write_sequential,
+ write_random,
+ total_writes,
+ dxt_posix=None,
+ dxt_posix_read_data=None,
+ dxt_posix_write_data=None,
+):
+ """
Check whether application has performed excessive random operations
Parameters:
@@ -658,19 +937,23 @@ def check_random_operation(read_consecutive, read_sequential, read_random, total
write_sequential: number of sequential write operations
write_random: number of random write operations
total_write: number of write operations been executed by the application
- '''
+ """
if total_reads:
- if read_random and read_random / total_reads > thresholds['random_operations'][0] and read_random > thresholds['random_operations_absolute'][0]:
- thresholds['random_operations'][1] = True
- thresholds['random_operations_absolute'][1] = True
- issue = 'Application is issuing a high number ({}) of random read operations ({:.2f}%)'.format(
+ if (
+ read_random
+ and read_random / total_reads > thresholds["random_operations"][0]
+ and read_random > thresholds["random_operations_absolute"][0]
+ ):
+ thresholds["random_operations"][1] = True
+ thresholds["random_operations_absolute"][1] = True
+ issue = "Application is issuing a high number ({}) of random read operations ({:.2f}%)".format(
read_random, read_random / total_reads * 100.0
)
recommendation = [
{
- 'message': 'Consider changing your data model to have consecutive or sequential reads'
+ "message": "Consider changing your data model to have consecutive or sequential reads"
}
]
@@ -679,11 +962,11 @@ def check_random_operation(read_consecutive, read_sequential, read_random, total
start = time.time()
ids = dxt_posix.id.unique().tolist()
for id in ids:
- temp = dxt_posix.loc[dxt_posix['id'] == id]
- temp_df = dxt_posix_read_data.loc[dxt_posix_read_data['id'] == id]
- temp_df = temp_df.sort_values('start_time', ascending=True)
+ temp = dxt_posix.loc[dxt_posix["id"] == id]
+ temp_df = dxt_posix_read_data.loc[dxt_posix_read_data["id"] == id]
+ temp_df = temp_df.sort_values("start_time", ascending=True)
random_ranks_ind = -1
-
+
if not temp_df["offsets"].is_monotonic_increasing:
updated_offsets = (temp_df["offsets"].to_numpy()).tolist()
cur = 0
@@ -694,64 +977,90 @@ def check_random_operation(read_consecutive, read_sequential, read_random, total
cur = updated_offsets[i]
if random_ranks_ind != -1:
- random_rank = temp_df.iloc[random_ranks_ind]['rank']
- random_offsets = temp_df.iloc[random_ranks_ind]['offsets']
- random_start_time = temp_df.iloc[random_ranks_ind]['start_time']
- temp_random_rank = temp.loc[(temp['rank'] == int(random_rank))]
- temp_random_rank = temp_random_rank['read_segments'].iloc[0]
- random_stack_addresses = temp_random_rank.loc[(temp_random_rank['offset'] == random_offsets) & (temp_random_rank['start_time'] == random_start_time)]
- random_stack_addresses = random_stack_addresses['stack_memory_addresses'].iloc[0]
-
- address = dxt_posix.iloc[0]['address_line_mapping']['address']
+ random_rank = temp_df.iloc[random_ranks_ind]["rank"]
+ random_offsets = temp_df.iloc[random_ranks_ind]["offsets"]
+ random_start_time = temp_df.iloc[random_ranks_ind]["start_time"]
+ temp_random_rank = temp.loc[(temp["rank"] == int(random_rank))]
+ temp_random_rank = temp_random_rank["read_segments"].iloc[0]
+ random_stack_addresses = temp_random_rank.loc[
+ (temp_random_rank["offset"] == random_offsets)
+ & (temp_random_rank["start_time"] == random_start_time)
+ ]
+ random_stack_addresses = random_stack_addresses[
+ "stack_memory_addresses"
+ ].iloc[0]
+
+ address = dxt_posix.iloc[0]["address_line_mapping"]["address"]
res = set(list(address)) & set(random_stack_addresses)
- backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)]
+ backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[
+ dxt_posix.iloc[0]["address_line_mapping"]["address"].isin(
+ res
+ )
+ ]
detail = []
detail.append(
{
- 'message': 'The backtrace information for these random read call(s) is given below:'
+ "message": "The backtrace information for these random read call(s) is given below:"
}
)
for index, row3 in backtrace.iterrows():
detail.append(
{
- 'message': '{}: {}'.format(
- row3['function_name'],
- row3['line_number']
- )
+ "message": "{}: {}".format(
+ row3["function_name"], row3["line_number"]
+ )
}
)
end = time.time()
time_taken = end - start
detail.append(
{
- 'message': 'Time taken to process this trigger: {}s'.format(round(time_taken, 5))
+ "message": "Time taken to process this trigger: {}s".format(
+ round(time_taken, 5)
+ )
}
)
insights_operation.append(
- message(INSIGHTS_POSIX_HIGH_RANDOM_READ_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation)
+ message(
+ INSIGHTS_POSIX_HIGH_RANDOM_READ_USAGE,
+ TARGET_DEVELOPER,
+ HIGH,
+ issue,
+ recommendation,
+ )
)
else:
- issue = 'Application mostly uses consecutive ({:.2f}%) and sequential ({:.2f}%) read requests'.format(
+ issue = "Application mostly uses consecutive ({:.2f}%) and sequential ({:.2f}%) read requests".format(
read_consecutive / total_reads * 100.0,
- read_sequential / total_reads * 100.0
+ read_sequential / total_reads * 100.0,
)
insights_operation.append(
- message(INSIGHTS_POSIX_HIGH_SEQUENTIAL_READ_USAGE, TARGET_DEVELOPER, OK, issue, None)
+ message(
+ INSIGHTS_POSIX_HIGH_SEQUENTIAL_READ_USAGE,
+ TARGET_DEVELOPER,
+ OK,
+ issue,
+ None,
+ )
)
if total_writes:
- if write_random and write_random / total_writes > thresholds['random_operations'][0] and write_random > thresholds['random_operations_absolute'][0]:
- thresholds['random_operations'][1] = True
- thresholds['random_operations_absolute'][1] = True
- issue = 'Application is issuing a high number ({}) of random write operations ({:.2f}%)'.format(
+ if (
+ write_random
+ and write_random / total_writes > thresholds["random_operations"][0]
+ and write_random > thresholds["random_operations_absolute"][0]
+ ):
+ thresholds["random_operations"][1] = True
+ thresholds["random_operations_absolute"][1] = True
+ issue = "Application is issuing a high number ({}) of random write operations ({:.2f}%)".format(
write_random, write_random / total_writes * 100.0
)
recommendation = [
{
- 'message': 'Consider changing your data model to have consecutive or sequential writes'
+ "message": "Consider changing your data model to have consecutive or sequential writes"
}
]
@@ -760,10 +1069,10 @@ def check_random_operation(read_consecutive, read_sequential, read_random, total
start = time.time()
ids = dxt_posix.id.unique().tolist()
for id in ids:
- temp = dxt_posix.loc[dxt_posix['id'] == id]
+ temp = dxt_posix.loc[dxt_posix["id"] == id]
- temp_df = dxt_posix_write_data.loc[dxt_posix_write_data['id'] == id]
- temp_df.sort_values('start_time', ascending=True, inplace=True)
+ temp_df = dxt_posix_write_data.loc[dxt_posix_write_data["id"] == id]
+ temp_df.sort_values("start_time", ascending=True, inplace=True)
random_ranks_ind = -1
if not temp_df["offsets"].is_monotonic_increasing:
updated_offsets = (temp_df["offsets"].to_numpy()).tolist()
@@ -775,58 +1084,87 @@ def check_random_operation(read_consecutive, read_sequential, read_random, total
cur = updated_offsets[i]
if random_ranks_ind != -1:
- random_rank = temp_df.iloc[random_ranks_ind]['rank']
- random_offsets = temp_df.iloc[random_ranks_ind]['offsets']
- random_start_time = temp_df.iloc[random_ranks_ind]['start_time']
-
- temp_random_rank = temp.loc[(temp['rank'] == int(random_rank))]
- temp_random_rank = temp_random_rank['write_segments'].iloc[0]
- random_stack_addresses = temp_random_rank.loc[(temp_random_rank['offset'] == random_offsets) & (temp_random_rank['start_time'] == random_start_time)]
- random_stack_addresses = random_stack_addresses['stack_memory_addresses'].iloc[0]
-
- address = dxt_posix.iloc[0]['address_line_mapping']['address']
+ random_rank = temp_df.iloc[random_ranks_ind]["rank"]
+ random_offsets = temp_df.iloc[random_ranks_ind]["offsets"]
+ random_start_time = temp_df.iloc[random_ranks_ind]["start_time"]
+
+ temp_random_rank = temp.loc[(temp["rank"] == int(random_rank))]
+ temp_random_rank = temp_random_rank["write_segments"].iloc[0]
+ random_stack_addresses = temp_random_rank.loc[
+ (temp_random_rank["offset"] == random_offsets)
+ & (temp_random_rank["start_time"] == random_start_time)
+ ]
+ random_stack_addresses = random_stack_addresses[
+ "stack_memory_addresses"
+ ].iloc[0]
+
+ address = dxt_posix.iloc[0]["address_line_mapping"]["address"]
res = set(list(address)) & set(random_stack_addresses)
- backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)]
+ backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[
+ dxt_posix.iloc[0]["address_line_mapping"]["address"].isin(
+ res
+ )
+ ]
detail = []
detail.append(
{
- 'message': 'The backtrace information for these random write call(s) is given below:'
+ "message": "The backtrace information for these random write call(s) is given below:"
}
)
for index, row3 in backtrace.iterrows():
detail.append(
{
- 'message': '{}: {}'.format(
- row3['function_name'],
- row3['line_number']
- )
+ "message": "{}: {}".format(
+ row3["function_name"], row3["line_number"]
+ )
}
)
-
+
end = time.time()
time_taken = end - start
detail.append(
{
- 'message': 'Time taken to process this trigger: {}s'.format(round(time_taken, 5))
+ "message": "Time taken to process this trigger: {}s".format(
+ round(time_taken, 5)
+ )
}
)
insights_operation.append(
- message(INSIGHTS_POSIX_HIGH_RANDOM_WRITE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation)
+ message(
+ INSIGHTS_POSIX_HIGH_RANDOM_WRITE_USAGE,
+ TARGET_DEVELOPER,
+ HIGH,
+ issue,
+ recommendation,
+ )
)
else:
- issue = 'Application mostly uses consecutive ({:.2f}%) and sequential ({:.2f}%) write requests'.format(
+ issue = "Application mostly uses consecutive ({:.2f}%) and sequential ({:.2f}%) write requests".format(
write_consecutive / total_writes * 100.0,
- write_sequential / total_writes * 100.0
+ write_sequential / total_writes * 100.0,
)
insights_operation.append(
- message(INSIGHTS_POSIX_HIGH_SEQUENTIAL_WRITE_USAGE, TARGET_DEVELOPER, OK, issue, None)
+ message(
+ INSIGHTS_POSIX_HIGH_SEQUENTIAL_WRITE_USAGE,
+ TARGET_DEVELOPER,
+ OK,
+ issue,
+ None,
+ )
)
-def check_shared_small_operation(total_shared_reads, total_shared_reads_small, total_shared_writes, total_shared_writes_small, shared_files, file_map):
- '''
+def check_shared_small_operation(
+ total_shared_reads,
+ total_shared_reads_small,
+ total_shared_writes,
+ total_shared_writes_small,
+ shared_files,
+ file_map,
+):
+ """
Check whether there are excessive small requests in shared files
Parameters:
@@ -838,113 +1176,182 @@ def check_shared_small_operation(total_shared_reads, total_shared_reads_small, t
small reads an small writes in each shared file
required columns: ['id', 'INSIGHTS_POSIX_SMALL_READS', 'INSIGHTS_POSIX_SMALL_WRITES']
file_map: file id and file name pairing
- '''
-
- if total_shared_reads and total_shared_reads_small / total_shared_reads > thresholds['small_requests'][0] and total_shared_reads_small > thresholds['small_requests_absolute'][0]:
- thresholds['small_requests'][1] = True
- thresholds['small_requests_absolute'][1] = True
- issue = 'Application issues a high number ({}) of small read requests to a shared file (i.e., < 1MB) which represents {:.2f}% of all shared file read requests'.format(
- total_shared_reads_small, total_shared_reads_small / total_shared_reads * 100.0
+ """
+
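+ # Illustrative trigger (assumed threshold values): with thresholds["small_requests"][0]
+ # == 0.1 and thresholds["small_requests_absolute"][0] == 1000, issuing 2,000 small reads
+ # out of 10,000 reads to shared files (20%) raises the insight; shared_files rows whose
+ # INSIGHTS_POSIX_SMALL_READS count exceeds total_shared_reads * small_requests / 2 are
+ # listed in the detail.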
+ if (
+ total_shared_reads
+ and total_shared_reads_small / total_shared_reads
+ > thresholds["small_requests"][0]
+ and total_shared_reads_small > thresholds["small_requests_absolute"][0]
+ ):
+ thresholds["small_requests"][1] = True
+ thresholds["small_requests_absolute"][1] = True
+ issue = "Application issues a high number ({}) of small read requests to a shared file (i.e., < 1MB) which represents {:.2f}% of all shared file read requests".format(
+ total_shared_reads_small,
+ total_shared_reads_small / total_shared_reads * 100.0,
)
detail = []
for index, row in shared_files.iterrows():
- if row['INSIGHTS_POSIX_SMALL_READS'] > (total_shared_reads * thresholds['small_requests'][0] / 2):
+ if row["INSIGHTS_POSIX_SMALL_READS"] > (
+ total_shared_reads * thresholds["small_requests"][0] / 2
+ ):
detail.append(
{
- 'message': '{} ({:.2f}%) small read requests are to "{}"'.format(
- row['INSIGHTS_POSIX_SMALL_READS'],
- row['INSIGHTS_POSIX_SMALL_READS'] / total_shared_reads * 100.0,
- file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])])
- )
+ "message": '{} ({:.2f}%) small read requests are to "{}"'.format(
+ row["INSIGHTS_POSIX_SMALL_READS"],
+ row["INSIGHTS_POSIX_SMALL_READS"]
+ / total_shared_reads
+ * 100.0,
+ file_map[int(row["id"])]
+ if args.full_path
+ else os.path.basename(file_map[int(row["id"])]),
+ )
}
)
recommendation = [
{
- 'message': 'Consider coalesceing read requests into larger more contiguous ones using MPI-IO collective operations',
- 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-read.c'), line_numbers=True, background_color='default')
+ "message": "Consider coalesceing read requests into larger more contiguous ones using MPI-IO collective operations",
+ "sample": Syntax.from_path(
+ os.path.join(ROOT, "snippets/mpi-io-collective-read.c"),
+ line_numbers=True,
+ background_color="default",
+ ),
}
]
insights_operation.append(
- message(INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_SHARED_FILE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail)
+ message(
+ INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_SHARED_FILE_USAGE,
+ TARGET_DEVELOPER,
+ HIGH,
+ issue,
+ recommendation,
+ detail,
+ )
)
- if total_shared_writes and total_shared_writes_small / total_shared_writes > thresholds['small_requests'][0] and total_shared_writes_small > thresholds['small_requests_absolute'][0]:
- thresholds['small_requests'][1] = True
- thresholds['small_requests_absolute'][1] = True
- issue = 'Application issues a high number ({}) of small write requests to a shared file (i.e., < 1MB) which represents {:.2f}% of all shared file write requests'.format(
- total_shared_writes_small, total_shared_writes_small / total_shared_writes * 100.0
+ if (
+ total_shared_writes
+ and total_shared_writes_small / total_shared_writes
+ > thresholds["small_requests"][0]
+ and total_shared_writes_small > thresholds["small_requests_absolute"][0]
+ ):
+ thresholds["small_requests"][1] = True
+ thresholds["small_requests_absolute"][1] = True
+ issue = "Application issues a high number ({}) of small write requests to a shared file (i.e., < 1MB) which represents {:.2f}% of all shared file write requests".format(
+ total_shared_writes_small,
+ total_shared_writes_small / total_shared_writes * 100.0,
)
detail = []
for index, row in shared_files.iterrows():
- if row['INSIGHTS_POSIX_SMALL_WRITES'] > (total_shared_writes * thresholds['small_requests'][0] / 2):
+ if row["INSIGHTS_POSIX_SMALL_WRITES"] > (
+ total_shared_writes * thresholds["small_requests"][0] / 2
+ ):
detail.append(
{
- 'message': '{} ({:.2f}%) small writes requests are to "{}"'.format(
- row['INSIGHTS_POSIX_SMALL_WRITES'],
- row['INSIGHTS_POSIX_SMALL_WRITES'] / total_shared_writes * 100.0,
- file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])])
- )
+ "message": '{} ({:.2f}%) small writes requests are to "{}"'.format(
+ row["INSIGHTS_POSIX_SMALL_WRITES"],
+ row["INSIGHTS_POSIX_SMALL_WRITES"]
+ / total_shared_writes
+ * 100.0,
+ file_map[int(row["id"])]
+ if args.full_path
+ else os.path.basename(file_map[int(row["id"])]),
+ )
}
)
recommendation = [
{
- 'message': 'Consider coalescing write requests into larger more contiguous ones using MPI-IO collective operations',
- 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-write.c'), line_numbers=True, background_color='default')
+ "message": "Consider coalescing write requests into larger more contiguous ones using MPI-IO collective operations",
+ "sample": Syntax.from_path(
+ os.path.join(ROOT, "snippets/mpi-io-collective-write.c"),
+ line_numbers=True,
+ background_color="default",
+ ),
}
]
insights_operation.append(
- message(INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_SHARED_FILE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail)
+ message(
+ INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_SHARED_FILE_USAGE,
+ TARGET_DEVELOPER,
+ HIGH,
+ issue,
+ recommendation,
+ detail,
+ )
)
def check_long_metadata(count_long_metadata, modules):
- '''
+ """
Check how many ranks have metadata operations taking too long
Parameters:
count_long_metadata: number of ranks that have metadata operations taking too long
modules: all different mudules been used in the application
- '''
+ """
if count_long_metadata > 0:
- thresholds['metadata_time_rank'][1] = True
- issue = 'There are {} ranks where metadata operations take over {} seconds'.format(
- count_long_metadata, thresholds['metadata_time_rank'][0]
+ thresholds["metadata_time_rank"][1] = True
+ issue = (
+ "There are {} ranks where metadata operations take over {} seconds".format(
+ count_long_metadata, thresholds["metadata_time_rank"][0]
+ )
)
recommendation = [
{
- 'message': 'Attempt to combine files, reduce, or cache metadata operations'
+ "message": "Attempt to combine files, reduce, or cache metadata operations"
}
]
- if 'HF5' in modules:
+ if "HF5" in modules:
recommendation.append(
{
- 'message': 'Since your appplication uses HDF5, try enabling collective metadata calls with H5Pset_coll_metadata_write() and H5Pset_all_coll_metadata_ops()',
- 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-collective-metadata.c'), line_numbers=True, background_color='default')
+ "message": "Since your appplication uses HDF5, try enabling collective metadata calls with H5Pset_coll_metadata_write() and H5Pset_all_coll_metadata_ops()",
+ "sample": Syntax.from_path(
+ os.path.join(ROOT, "snippets/hdf5-collective-metadata.c"),
+ line_numbers=True,
+ background_color="default",
+ ),
},
{
- 'message': 'Since your appplication uses HDF5, try using metadata cache to defer metadata operations',
- 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-cache.c'), line_numbers=True, background_color='default')
- }
+ "message": "Since your appplication uses HDF5, try using metadata cache to defer metadata operations",
+ "sample": Syntax.from_path(
+ os.path.join(ROOT, "snippets/hdf5-cache.c"),
+ line_numbers=True,
+ background_color="default",
+ ),
+ },
)
insights_metadata.append(
- message(INSIGHTS_POSIX_HIGH_METADATA_TIME, TARGET_DEVELOPER, HIGH, issue, recommendation)
+ message(
+ INSIGHTS_POSIX_HIGH_METADATA_TIME,
+ TARGET_DEVELOPER,
+ HIGH,
+ issue,
+ recommendation,
+ )
)
-def check_shared_data_imblance(stragglers_count, detected_files, file_map, dxt_posix=None, dxt_posix_read_data=None, dxt_posix_write_data=None):
- '''
+def check_shared_data_imblance(
+ stragglers_count,
+ detected_files,
+ file_map,
+ dxt_posix=None,
+ dxt_posix_read_data=None,
+ dxt_posix_write_data=None,
+):
+ """
Check how many shared files containing data transfer imbalance
Parameters:
@@ -953,11 +1360,11 @@ def check_shared_data_imblance(stragglers_count, detected_files, file_map, dxt_p
data imbalance per file
required columns: ['id', 'data_imbalance']
file_map: file id and file name pairing
- '''
+ """
if stragglers_count:
- thresholds['imbalance_stragglers'][1] = True
- issue = 'Detected data transfer imbalance caused by stragglers when accessing {} shared file.'.format(
+ thresholds["imbalance_stragglers"][1] = True
+ issue = "Detected data transfer imbalance caused by stragglers when accessing {} shared file.".format(
stragglers_count
)
@@ -968,52 +1375,73 @@ def check_shared_data_imblance(stragglers_count, detected_files, file_map, dxt_p
for index, row in detected_files.iterrows():
detail.append(
{
- 'message': 'Load imbalance of {:.2f}% detected while accessing "{}"'.format(
- row['data_imbalance'],
- file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])])
- )
+ "message": 'Load imbalance of {:.2f}% detected while accessing "{}"'.format(
+ row["data_imbalance"],
+ file_map[int(row["id"])]
+ if args.full_path
+ else os.path.basename(file_map[int(row["id"])]),
+ )
}
)
# DXT Analysis
if args.backtrace:
start = time.time()
- if file_count < thresholds['backtrace'][0]:
- temp = dxt_posix.loc[dxt_posix['id'] == int(row['id'])]
- temp_df_1 = dxt_posix_write_data.loc[dxt_posix_write_data['id'] == int(row['id'])]
- temp_df_2 = dxt_posix_read_data.loc[dxt_posix_read_data['id'] == int(row['id'])]
-
- df_merged = pd.concat([temp_df_1, temp_df_2], ignore_index=True, sort=False)
- df_merged['duration'] = df_merged['end_time'] - df_merged['start_time']
- df_merged.sort_values('duration', ascending=True, inplace=True)
+ if file_count < thresholds["backtrace"][0]:
+ temp = dxt_posix.loc[dxt_posix["id"] == int(row["id"])]
+ temp_df_1 = dxt_posix_write_data.loc[
+ dxt_posix_write_data["id"] == int(row["id"])
+ ]
+ temp_df_2 = dxt_posix_read_data.loc[
+ dxt_posix_read_data["id"] == int(row["id"])
+ ]
+
+ df_merged = pd.concat(
+ [temp_df_1, temp_df_2], ignore_index=True, sort=False
+ )
+ df_merged["duration"] = (
+ df_merged["end_time"] - df_merged["start_time"]
+ )
+ df_merged.sort_values("duration", ascending=True, inplace=True)
df_merged = df_merged.iloc[0]
- rank_df = temp.loc[(temp['rank'] == int(df_merged['rank']))]
-
- if df_merged['operation'] == 'write':
- rank_df = rank_df['write_segments'].iloc[0]
- stack_memory_addresses = rank_df['stack_memory_addresses'].iloc[0]
- address = dxt_posix.iloc[0]['address_line_mapping']['address']
+ rank_df = temp.loc[(temp["rank"] == int(df_merged["rank"]))]
+
+ if df_merged["operation"] == "write":
+ rank_df = rank_df["write_segments"].iloc[0]
+ stack_memory_addresses = rank_df["stack_memory_addresses"].iloc[
+ 0
+ ]
+ address = dxt_posix.iloc[0]["address_line_mapping"]["address"]
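+ # Intersect the sampled stack addresses with Darshan's address-to-line
+ # mapping so they can be resolved into source locations.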
res = set(list(address)) & set(stack_memory_addresses)
- backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)]
+ backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[
+ dxt_posix.iloc[0]["address_line_mapping"]["address"].isin(
+ res
+ )
+ ]
else:
- rank_df = rank_df['read_segments'].iloc[0]
- stack_memory_addresses = rank_df['stack_memory_addresses'].iloc[0]
- address = dxt_posix.iloc[0]['address_line_mapping']['address']
+ rank_df = rank_df["read_segments"].iloc[0]
+ stack_memory_addresses = rank_df["stack_memory_addresses"].iloc[
+ 0
+ ]
+ address = dxt_posix.iloc[0]["address_line_mapping"]["address"]
res = set(list(address)) & set(stack_memory_addresses)
- backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)]
+ backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[
+ dxt_posix.iloc[0]["address_line_mapping"]["address"].isin(
+ res
+ )
+ ]
detail.append(
{
- 'message': 'The backtrace information for these imbalanced call(s) is given below:'
+ "message": "The backtrace information for these imbalanced call(s) is given below:"
}
)
for index, row3 in backtrace.iterrows():
detail.append(
{
- 'message': '{}: {}'.format(
- row3['function_name'],
- row3['line_number']
- )
+ "message": "{}: {}".format(
+ row3["function_name"], row3["line_number"]
+ )
}
)
@@ -1021,69 +1449,94 @@ def check_shared_data_imblance(stragglers_count, detected_files, file_map, dxt_p
else:
detail.append(
{
- 'message': 'The backtrace information for this file is similar to the previous files'
+ "message": "The backtrace information for this file is similar to the previous files"
}
)
-
+
end = time.time()
time_taken = end - start
dxt_trigger_time += time_taken
-
- if dxt_trigger_time > 0:
+
+ if dxt_trigger_time > 0:
detail.append(
{
- 'message': 'Time taken to process this trigger: {}s'.format(round(dxt_trigger_time, 5))
+ "message": "Time taken to process this trigger: {}s".format(
+ round(dxt_trigger_time, 5)
+ )
}
)
recommendation = [
{
- 'message': 'Consider better balancing the data transfer between the application ranks'
+ "message": "Consider better balancing the data transfer between the application ranks"
},
{
- 'message': 'Consider tuning how your data is distributed in the file system by changing the stripe size and count',
- 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default')
- }
+ "message": "Consider tuning how your data is distributed in the file system by changing the stripe size and count",
+ "sample": Syntax.from_path(
+ os.path.join(ROOT, "snippets/lustre-striping.bash"),
+ line_numbers=True,
+ background_color="default",
+ ),
+ },
]
insights_operation.append(
- message(INSIGHTS_POSIX_SIZE_IMBALANCE, TARGET_USER, HIGH, issue, recommendation, detail)
+ message(
+ INSIGHTS_POSIX_SIZE_IMBALANCE,
+ TARGET_USER,
+ HIGH,
+ issue,
+ recommendation,
+ detail,
+ )
)
-def check_shared_data_imblance_split(slowest_rank_bytes, fastest_rank_bytes, total_transfer_size):
- '''
+def check_shared_data_imblance_split(
+ slowest_rank_bytes, fastest_rank_bytes, total_transfer_size
+):
+ """
Check whether the specific shared file contains data imbalance
Parameters:
slowest_rank_bytes: the total request size of the rank that takes the longest data operation time
fastest_rank_bytes: the total request size of the rank that takes the shortest data operation time
total_transfer_size: total request size of that specific shared file
- '''
-
- if total_transfer_size and abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size > thresholds['imbalance_stragglers'][0]:
- thresholds['imbalance_stragglers'][1] = True
- issue = 'Load imbalance of {:.2f}% detected'.format(
+ """
+
+ if (
+ total_transfer_size
+ and abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size
+ > thresholds["imbalance_stragglers"][0]
+ ):
+ thresholds["imbalance_stragglers"][1] = True
+ issue = "Load imbalance of {:.2f}% detected".format(
abs(slowest_rank_bytes - fastest_rank_bytes) / total_transfer_size * 100
)
recommendation = [
{
- 'message': 'Consider better balancing the data transfer between the application ranks'
+ "message": "Consider better balancing the data transfer between the application ranks"
},
{
- 'message': 'Consider tuning how your data is distributed in the file system by changing the stripe size and count',
- 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default')
- }
+ "message": "Consider tuning how your data is distributed in the file system by changing the stripe size and count",
+ "sample": Syntax.from_path(
+ os.path.join(ROOT, "snippets/lustre-striping.bash"),
+ line_numbers=True,
+ background_color="default",
+ ),
+ },
]
insights_operation.append(
- message(INSIGHTS_POSIX_SIZE_IMBALANCE, TARGET_USER, HIGH, issue, recommendation)
+ message(
+ INSIGHTS_POSIX_SIZE_IMBALANCE, TARGET_USER, HIGH, issue, recommendation
+ )
)
def check_shared_time_imbalance(stragglers_count, detected_files, file_map):
- '''
+ """
Check how many shared files containing time transfer imbalance
Parameters:
@@ -1092,74 +1545,101 @@ def check_shared_time_imbalance(stragglers_count, detected_files, file_map):
data imbalance per file
required columns: ['id', 'time_imbalance']
file_map: file id and file name pairing
- '''
+ """
if stragglers_count:
- thresholds['imbalance_stragglers'][1] = True
- issue = 'Detected time imbalance caused by stragglers when accessing {} shared file.'.format(
+ thresholds["imbalance_stragglers"][1] = True
+ issue = "Detected time imbalance caused by stragglers when accessing {} shared file.".format(
stragglers_count
)
detail = []
-
+
for index, row in detected_files.iterrows():
detail.append(
{
- 'message': 'Load imbalance of {:.2f}% detected while accessing "{}"'.format(
- row['time_imbalance'],
- file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])])
- )
+ "message": 'Load imbalance of {:.2f}% detected while accessing "{}"'.format(
+ row["time_imbalance"],
+ file_map[int(row["id"])]
+ if args.full_path
+ else os.path.basename(file_map[int(row["id"])]),
+ )
}
)
recommendation = [
{
- 'message': 'Consider better distributing the data in the parallel file system' # needs to review what suggestion to give
+ "message": "Consider better distributing the data in the parallel file system" # needs to review what suggestion to give
},
{
- 'message': 'Consider tuning how your data is distributed in the file system by changing the stripe size and count',
- 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default')
- }
+ "message": "Consider tuning how your data is distributed in the file system by changing the stripe size and count",
+ "sample": Syntax.from_path(
+ os.path.join(ROOT, "snippets/lustre-striping.bash"),
+ line_numbers=True,
+ background_color="default",
+ ),
+ },
]
insights_operation.append(
- message(INSIGHTS_POSIX_TIME_IMBALANCE, TARGET_USER, HIGH, issue, recommendation, detail)
+ message(
+ INSIGHTS_POSIX_TIME_IMBALANCE,
+ TARGET_USER,
+ HIGH,
+ issue,
+ recommendation,
+ detail,
+ )
)
-def check_shared_time_imbalance_split(slowest_rank_time, fastest_rank_time, total_transfer_time):
- '''
+def check_shared_time_imbalance_split(
+ slowest_rank_time, fastest_rank_time, total_transfer_time
+):
+ """
Check whether the specific shared file contains time imbalance
Parameters:
slowest_rank_bytes: the total request time of the rank that takes the longest data operation time
fastest_rank_bytes: the total request time of the rank that takes the shortest data operation time
total_transfer_size: total request time of that specific shared file
- '''
-
- if total_transfer_time and abs(slowest_rank_time - fastest_rank_time) / total_transfer_time > thresholds['imbalance_stragglers'][0]:
- thresholds['imbalance_stragglers'][1] = True
- issue = 'Load imbalance of {:.2f}% detected'.format(
+ """
+
+ if (
+ total_transfer_time
+ and abs(slowest_rank_time - fastest_rank_time) / total_transfer_time
+ > thresholds["imbalance_stragglers"][0]
+ ):
+ thresholds["imbalance_stragglers"][1] = True
+ issue = "Load imbalance of {:.2f}% detected".format(
abs(slowest_rank_time - fastest_rank_time) / total_transfer_time * 100
)
recommendation = [
{
- 'message': 'Consider better distributing the data in the parallel file system' # needs to review what suggestion to give
+ "message": "Consider better distributing the data in the parallel file system" # needs to review what suggestion to give
},
{
- 'message': 'Consider tuning how your data is distributed in the file system by changing the stripe size and count',
- 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default')
- }
+ "message": "Consider tuning how your data is distributed in the file system by changing the stripe size and count",
+ "sample": Syntax.from_path(
+ os.path.join(ROOT, "snippets/lustre-striping.bash"),
+ line_numbers=True,
+ background_color="default",
+ ),
+ },
]
insights_operation.append(
- message(INSIGHTS_POSIX_TIME_IMBALANCE, TARGET_USER, HIGH, issue, recommendation)
+ message(
+ INSIGHTS_POSIX_TIME_IMBALANCE, TARGET_USER, HIGH, issue, recommendation
+ )
)
-def check_individual_write_imbalance(imbalance_count, detected_files, file_map, dxt_posix=None, dxt_posix_write_data=None):
- '''
+def check_individual_write_imbalance(
+ imbalance_count, detected_files, file_map, dxt_posix=None, dxt_posix_write_data=None
+):
+ """
Check how many write imbalance when accessing individual files
Parameters:
@@ -1167,57 +1647,62 @@ def check_individual_write_imbalance(imbalance_count, detected_files, file_map,
detected_files:
write imbalance per file
required columns: ['id', 'write_imbalance']
- '''
+ """
if imbalance_count:
- thresholds['imbalance_size'][1] = True
- issue = 'Detected write imbalance when accessing {} individual files'.format(
+ thresholds["imbalance_size"][1] = True
+ issue = "Detected write imbalance when accessing {} individual files".format(
imbalance_count
)
detail = []
file_count = 0
dxt_trigger_time = 0
-
+
for index, row in detected_files.iterrows():
detail.append(
{
- 'message': 'Load imbalance of {:.2f}% detected while accessing "{}"'.format(
- row['write_imbalance'],
- file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])])
- )
+ "message": 'Load imbalance of {:.2f}% detected while accessing "{}"'.format(
+ row["write_imbalance"],
+ file_map[int(row["id"])]
+ if args.full_path
+ else os.path.basename(file_map[int(row["id"])]),
+ )
}
)
# DXT Analysis
if args.backtrace:
start = time.time()
- if file_count < thresholds['backtrace'][0]:
- temp = dxt_posix.loc[dxt_posix['id'] == int(row['id'])]
- temp_df = dxt_posix_write_data.loc[dxt_posix_write_data['id'] == int(row['id'])]
-
- maxClm = temp_df['length'].max()
- temp_df = temp_df.loc[(temp_df['length'] == maxClm)]
- rank_df = temp.loc[(temp['rank'] == int(temp_df['rank'].iloc[0]))]
-
- rank_df = rank_df['write_segments'].iloc[0]
- stack_memory_addresses = rank_df['stack_memory_addresses'].iloc[0]
- address = dxt_posix.iloc[0]['address_line_mapping']['address']
+ if file_count < thresholds["backtrace"][0]:
+ temp = dxt_posix.loc[dxt_posix["id"] == int(row["id"])]
+ temp_df = dxt_posix_write_data.loc[
+ dxt_posix_write_data["id"] == int(row["id"])
+ ]
+
+ maxClm = temp_df["length"].max()
+ temp_df = temp_df.loc[(temp_df["length"] == maxClm)]
+ rank_df = temp.loc[(temp["rank"] == int(temp_df["rank"].iloc[0]))]
+
+ rank_df = rank_df["write_segments"].iloc[0]
+ stack_memory_addresses = rank_df["stack_memory_addresses"].iloc[0]
+ address = dxt_posix.iloc[0]["address_line_mapping"]["address"]
res = set(list(address)) & set(stack_memory_addresses)
- backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)]
+ backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[
+ dxt_posix.iloc[0]["address_line_mapping"]["address"].isin(res)
+ ]
detail.append(
{
- 'message': 'The backtrace information for these imbalanced write call(s) is given below:'
+ "message": "The backtrace information for these imbalanced write call(s) is given below:"
}
)
for index, row3 in backtrace.iterrows():
detail.append(
{
- 'message': '{}: {}'.format(
- row3['function_name'],
- row3['line_number']
- )
+ "message": "{}: {}".format(
+ row3["function_name"], row3["line_number"]
+ )
}
)
@@ -1225,82 +1710,119 @@ def check_individual_write_imbalance(imbalance_count, detected_files, file_map,
else:
detail.append(
{
- 'message': 'The backtrace information for this file is similar to the previous files'
+ "message": "The backtrace information for this file is similar to the previous files"
}
- )
-
+ )
+
end = time.time()
time_taken = end - start
- dxt_trigger_time += time_taken
-
- if dxt_trigger_time > 0:
+ dxt_trigger_time += time_taken
+
+ if dxt_trigger_time > 0:
detail.append(
{
- 'message': 'Time taken to process this trigger: {}s'.format(round(dxt_trigger_time, 5))
+ "message": "Time taken to process this trigger: {}s".format(
+ round(dxt_trigger_time, 5)
+ )
}
)
recommendation = [
{
- 'message': 'Consider better balancing the data transfer between the application ranks'
+ "message": "Consider better balancing the data transfer between the application ranks"
},
{
- 'message': 'Consider tuning the stripe size and count to better distribute the data',
- 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default')
+ "message": "Consider tuning the stripe size and count to better distribute the data",
+ "sample": Syntax.from_path(
+ os.path.join(ROOT, "snippets/lustre-striping.bash"),
+ line_numbers=True,
+ background_color="default",
+ ),
},
{
- 'message': 'If the application uses netCDF and HDF5 double-check the need to set NO_FILL values',
- 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/pnetcdf-hdf5-no-fill.c'), line_numbers=True, background_color='default')
+ "message": "If the application uses netCDF and HDF5 double-check the need to set NO_FILL values",
+ "sample": Syntax.from_path(
+ os.path.join(ROOT, "snippets/pnetcdf-hdf5-no-fill.c"),
+ line_numbers=True,
+ background_color="default",
+ ),
},
{
- 'message': 'If rank 0 is the only one opening the file, consider using MPI-IO collectives'
- }
+ "message": "If rank 0 is the only one opening the file, consider using MPI-IO collectives"
+ },
]
insights_operation.append(
- message(INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail)
+ message(
+ INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE,
+ TARGET_DEVELOPER,
+ HIGH,
+ issue,
+ recommendation,
+ detail,
+ )
)
def check_individual_write_imbalance_split(max_bytes_written, min_bytes_written):
- '''
+ """
Check whether there is write imbalance in the specific individual file
Parameters:
max_bytes_written: max byte written in the file
min_bytes_written: minimum byte written in the file
- '''
-
- if max_bytes_written and abs(max_bytes_written - min_bytes_written) / max_bytes_written > thresholds['imbalance_size'][0]:
- thresholds['imbalance_size'][1] = True
- issue = 'Load imbalance of {:.2f}% detected'.format(
- abs(max_bytes_written - min_bytes_written) / max_bytes_written * 100
+ """
+
+ if (
+ max_bytes_written
+ and abs(max_bytes_written - min_bytes_written) / max_bytes_written
+ > thresholds["imbalance_size"][0]
+ ):
+ thresholds["imbalance_size"][1] = True
+ issue = "Load imbalance of {:.2f}% detected".format(
+ abs(max_bytes_written - min_bytes_written) / max_bytes_written * 100
)
recommendation = [
{
- 'message': 'Consider better balancing the data transfer between the application ranks'
+ "message": "Consider better balancing the data transfer between the application ranks"
},
{
- 'message': 'Consider tuning the stripe size and count to better distribute the data',
- 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default')
+ "message": "Consider tuning the stripe size and count to better distribute the data",
+ "sample": Syntax.from_path(
+ os.path.join(ROOT, "snippets/lustre-striping.bash"),
+ line_numbers=True,
+ background_color="default",
+ ),
},
{
- 'message': 'If the application uses netCDF and HDF5 double-check the need to set NO_FILL values',
- 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/pnetcdf-hdf5-no-fill.c'), line_numbers=True, background_color='default')
+ "message": "If the application uses netCDF and HDF5 double-check the need to set NO_FILL values",
+ "sample": Syntax.from_path(
+ os.path.join(ROOT, "snippets/pnetcdf-hdf5-no-fill.c"),
+ line_numbers=True,
+ background_color="default",
+ ),
},
{
- 'message': 'If rank 0 is the only one opening the file, consider using MPI-IO collectives'
- }
+ "message": "If rank 0 is the only one opening the file, consider using MPI-IO collectives"
+ },
]
insights_operation.append(
- message(INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation)
+ message(
+ INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE,
+ TARGET_DEVELOPER,
+ HIGH,
+ issue,
+ recommendation,
+ )
)
-def check_individual_read_imbalance(imbalance_count, detected_files, file_map, dxt_posix=None, dxt_posix_read_data=None):
- '''
+def check_individual_read_imbalance(
+ imbalance_count, detected_files, file_map, dxt_posix=None, dxt_posix_read_data=None
+):
+ """
Check how many read imbalance when accessing individual files
Parameters:
@@ -1308,57 +1830,62 @@ def check_individual_read_imbalance(imbalance_count, detected_files, file_map, d
detected_files:
read imbalance per file
required columns: ['id', 'read_imbalance']
- '''
+ """
if imbalance_count:
- thresholds['imbalance_size'][1] = True
- issue = 'Detected read imbalance when accessing {} individual files.'.format(
+ thresholds["imbalance_size"][1] = True
+ issue = "Detected read imbalance when accessing {} individual files.".format(
imbalance_count
)
detail = []
file_count = 0
dxt_trigger_time = 0
-
+
for index, row in detected_files.iterrows():
detail.append(
{
- 'message': 'Load imbalance of {:.2f}% detected while accessing "{}"'.format(
- row['read_imbalance'],
- file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])])
- )
+ "message": 'Load imbalance of {:.2f}% detected while accessing "{}"'.format(
+ row["read_imbalance"],
+ file_map[int(row["id"])]
+ if args.full_path
+ else os.path.basename(file_map[int(row["id"])]),
+ )
}
)
# DXT Analysis
if args.backtrace:
start = time.time()
- if file_count < thresholds['backtrace'][0]:
- temp = dxt_posix.loc[dxt_posix['id'] == int(row['id'])]
- temp_df = dxt_posix_read_data.loc[dxt_posix_read_data['id'] == int(row['id'])]
-
- maxClm = temp_df['length'].max()
- temp_df = temp_df.loc[(temp_df['length'] == maxClm)]
- rank_df = temp.loc[(temp['rank'] == int(temp_df['rank'].iloc[0]))]
-
- rank_df = rank_df['read_segments'].iloc[0]
- stack_memory_addresses = rank_df['stack_memory_addresses'].iloc[0]
- address = dxt_posix.iloc[0]['address_line_mapping']['address']
+ if file_count < thresholds["backtrace"][0]:
+ temp = dxt_posix.loc[dxt_posix["id"] == int(row["id"])]
+ temp_df = dxt_posix_read_data.loc[
+ dxt_posix_read_data["id"] == int(row["id"])
+ ]
+
+ maxClm = temp_df["length"].max()
+ temp_df = temp_df.loc[(temp_df["length"] == maxClm)]
+ rank_df = temp.loc[(temp["rank"] == int(temp_df["rank"].iloc[0]))]
+
+ rank_df = rank_df["read_segments"].iloc[0]
+ stack_memory_addresses = rank_df["stack_memory_addresses"].iloc[0]
+ address = dxt_posix.iloc[0]["address_line_mapping"]["address"]
res = set(list(address)) & set(stack_memory_addresses)
- backtrace = dxt_posix.iloc[0]['address_line_mapping'].loc[dxt_posix.iloc[0]['address_line_mapping']['address'].isin(res)]
+ backtrace = dxt_posix.iloc[0]["address_line_mapping"].loc[
+ dxt_posix.iloc[0]["address_line_mapping"]["address"].isin(res)
+ ]
detail.append(
{
- 'message': 'The backtrace information for these imbalanced read call(s) is given below:'
+ "message": "The backtrace information for these imbalanced read call(s) is given below:"
}
)
for index, row3 in backtrace.iterrows():
detail.append(
{
- 'message': '{}: {}'.format(
- row3['function_name'],
- row3['line_number']
- )
+ "message": "{}: {}".format(
+ row3["function_name"], row3["line_number"]
+ )
}
)
@@ -1366,84 +1893,126 @@ def check_individual_read_imbalance(imbalance_count, detected_files, file_map, d
else:
detail.append(
{
- 'message': 'The backtrace information for this file is similar to the previous files'
+ "message": "The backtrace information for this file is similar to the previous files"
}
)
end = time.time()
time_taken = end - start
dxt_trigger_time += time_taken
- if dxt_trigger_time > 0:
+ if dxt_trigger_time > 0:
detail.append(
{
- 'message': 'Time taken to process this trigger: {}s'.format(round(dxt_trigger_time, 5))
+ "message": "Time taken to process this trigger: {}s".format(
+ round(dxt_trigger_time, 5)
+ )
}
)
recommendation = [
{
- 'message': 'Consider better balancing the data transfer between the application ranks'
+ "message": "Consider better balancing the data transfer between the application ranks"
},
{
- 'message': 'Consider tuning the stripe size and count to better distribute the data',
- 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default')
+ "message": "Consider tuning the stripe size and count to better distribute the data",
+ "sample": Syntax.from_path(
+ os.path.join(ROOT, "snippets/lustre-striping.bash"),
+ line_numbers=True,
+ background_color="default",
+ ),
},
{
- 'message': 'If the application uses netCDF and HDF5 double-check the need to set NO_FILL values',
- 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/pnetcdf-hdf5-no-fill.c'), line_numbers=True, background_color='default')
+ "message": "If the application uses netCDF and HDF5 double-check the need to set NO_FILL values",
+ "sample": Syntax.from_path(
+ os.path.join(ROOT, "snippets/pnetcdf-hdf5-no-fill.c"),
+ line_numbers=True,
+ background_color="default",
+ ),
},
{
- 'message': 'If rank 0 is the only one opening the file, consider using MPI-IO collectives'
- }
+ "message": "If rank 0 is the only one opening the file, consider using MPI-IO collectives"
+ },
]
insights_operation.append(
- message(INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail)
+ message(
+ INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE,
+ TARGET_DEVELOPER,
+ HIGH,
+ issue,
+ recommendation,
+ detail,
+ )
)
def check_individual_read_imbalance_split(max_bytes_read, min_bytes_read):
- '''
+ """
Check whether there is read imbalance in the specific individual file
Parameters:
max_bytes_written: max byte read in the file
min_bytes_written: minimum byte read in the file
- '''
-
- if max_bytes_read and abs(max_bytes_read - min_bytes_read) / max_bytes_read > thresholds['imbalance_size'][0]:
- thresholds['imbalance_size'][1] = True
- issue = 'Load imbalance of {:.2f}% detected'.format(
- abs(max_bytes_read - min_bytes_read) / max_bytes_read * 100
+ """
+
+ if (
+ max_bytes_read
+ and abs(max_bytes_read - min_bytes_read) / max_bytes_read
+ > thresholds["imbalance_size"][0]
+ ):
+ thresholds["imbalance_size"][1] = True
+ issue = "Load imbalance of {:.2f}% detected".format(
+ abs(max_bytes_read - min_bytes_read) / max_bytes_read * 100
)
recommendation = [
{
- 'message': 'Consider better balancing the data transfer between the application ranks'
+ "message": "Consider better balancing the data transfer between the application ranks"
},
{
- 'message': 'Consider tuning the stripe size and count to better distribute the data',
- 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/lustre-striping.bash'), line_numbers=True, background_color='default')
+ "message": "Consider tuning the stripe size and count to better distribute the data",
+ "sample": Syntax.from_path(
+ os.path.join(ROOT, "snippets/lustre-striping.bash"),
+ line_numbers=True,
+ background_color="default",
+ ),
},
{
- 'message': 'If the application uses netCDF and HDF5 double-check the need to set NO_FILL values',
- 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/pnetcdf-hdf5-no-fill.c'), line_numbers=True, background_color='default')
+ "message": "If the application uses netCDF and HDF5 double-check the need to set NO_FILL values",
+ "sample": Syntax.from_path(
+ os.path.join(ROOT, "snippets/pnetcdf-hdf5-no-fill.c"),
+ line_numbers=True,
+ background_color="default",
+ ),
},
{
- 'message': 'If rank 0 is the only one opening the file, consider using MPI-IO collectives'
- }
+ "message": "If rank 0 is the only one opening the file, consider using MPI-IO collectives"
+ },
]
insights_operation.append(
- message(INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation)
+ message(
+ INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE,
+ TARGET_DEVELOPER,
+ HIGH,
+ issue,
+ recommendation,
+ )
)
# MPIIO level check
-def check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, total_mpiio_read_operations, detected_files, file_map, dxt_mpiio=None):
- '''
+def check_mpi_collective_read_operation(
+ mpiio_coll_reads,
+ mpiio_indep_reads,
+ total_mpiio_read_operations,
+ detected_files,
+ file_map,
+ dxt_mpiio=None,
+):
+ """
Check whether application uses collective mpi read calls
Parameters:
@@ -1454,14 +2023,17 @@ def check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, tot
independent read operations and percentage per file
required columns: ['id', 'absolute_indep_reads', 'percent_indep_reads']
file_map: file id and file name pairing
- '''
+ """
if mpiio_coll_reads == 0:
- if total_mpiio_read_operations and total_mpiio_read_operations > thresholds['collective_operations_absolute'][0]:
- thresholds['collective_operations_absolute'][1] = True
- issue = 'Application uses MPI-IO but it does not use collective read operations, instead it issues {} ({:.2f}%) independent read calls'.format(
- mpiio_indep_reads,
- mpiio_indep_reads / total_mpiio_read_operations * 100
+ if (
+ total_mpiio_read_operations
+ and total_mpiio_read_operations
+ > thresholds["collective_operations_absolute"][0]
+ ):
+ thresholds["collective_operations_absolute"][1] = True
+ issue = "Application uses MPI-IO but it does not use collective read operations, instead it issues {} ({:.2f}%) independent read calls".format(
+ mpiio_indep_reads, mpiio_indep_reads / total_mpiio_read_operations * 100
)
detail = []
@@ -1471,63 +2043,80 @@ def check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, tot
for index, row in detected_files.iterrows():
detail.append(
{
- 'message': '{} ({}%) of independent reads to "{}"'.format(
- row['absolute_indep_reads'],
- row['percent_indep_reads'],
- file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])])
- )
+ "message": '{} ({}%) of independent reads to "{}"'.format(
+ row["absolute_indep_reads"],
+ row["percent_indep_reads"],
+ file_map[int(row["id"])]
+ if args.full_path
+ else os.path.basename(file_map[int(row["id"])]),
+ )
}
)
# DXT Analysis
if args.backtrace:
start = time.time()
- temp = dxt_mpiio.loc[(dxt_mpiio['id'] == int(row['id'])) & (dxt_mpiio['rank'] == 1)]
- temp = temp['read_segments'].iloc[0]
- stack_memory_addresses = temp['stack_memory_addresses'].iloc[0]
- address = dxt_mpiio.iloc[0]['address_line_mapping']['address']
+ temp = dxt_mpiio.loc[
+ (dxt_mpiio["id"] == int(row["id"])) & (dxt_mpiio["rank"] == 1)
+ ]
+ temp = temp["read_segments"].iloc[0]
+ stack_memory_addresses = temp["stack_memory_addresses"].iloc[0]
+ address = dxt_mpiio.iloc[0]["address_line_mapping"]["address"]
res = set(list(address)) & set(stack_memory_addresses)
- backtrace = dxt_mpiio.iloc[0]['address_line_mapping'].loc[dxt_mpiio.iloc[0]['address_line_mapping']['address'].isin(res)]
+ backtrace = dxt_mpiio.iloc[0]["address_line_mapping"].loc[
+ dxt_mpiio.iloc[0]["address_line_mapping"]["address"].isin(res)
+ ]
detail.append(
{
- 'message': 'The backtrace information for these read call(s) is given below:'
+ "message": "The backtrace information for these read call(s) is given below:"
}
)
for index, row3 in backtrace.iterrows():
detail.append(
{
- 'message': '{}: {}'.format(
- row3['function_name'],
- row3['line_number']
- )
+ "message": "{}: {}".format(
+ row3["function_name"], row3["line_number"]
+ )
}
)
-
+
end = time.time()
time_taken = end - start
dxt_trigger_time += time_taken
-
- if dxt_trigger_time > 0:
+
+ if dxt_trigger_time > 0:
detail.append(
{
- 'message': 'Time taken to process this trigger: {}s'.format(round(dxt_trigger_time, 5))
+ "message": "Time taken to process this trigger: {}s".format(
+ round(dxt_trigger_time, 5)
+ )
}
)
recommendation = [
{
- 'message': 'Use collective read operations (e.g. MPI_File_read_all() or MPI_File_read_at_all()) and set one aggregator per compute node',
- 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-read.c'), line_numbers=True, background_color='default')
+ "message": "Use collective read operations (e.g. MPI_File_read_all() or MPI_File_read_at_all()) and set one aggregator per compute node",
+ "sample": Syntax.from_path(
+ os.path.join(ROOT, "snippets/mpi-io-collective-read.c"),
+ line_numbers=True,
+ background_color="default",
+ ),
}
]
insights_operation.append(
- message(INSIGHTS_MPI_IO_NO_COLLECTIVE_READ_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail)
+ message(
+ INSIGHTS_MPI_IO_NO_COLLECTIVE_READ_USAGE,
+ TARGET_DEVELOPER,
+ HIGH,
+ issue,
+ recommendation,
+ detail,
+ )
)
else:
- issue = 'Application uses MPI-IO and read data using {} ({:.2f}%) collective operations'.format(
- mpiio_coll_reads,
- mpiio_coll_reads / total_mpiio_read_operations * 100
+ issue = "Application uses MPI-IO and read data using {} ({:.2f}%) collective operations".format(
+ mpiio_coll_reads, mpiio_coll_reads / total_mpiio_read_operations * 100
)
insights_operation.append(
@@ -1535,8 +2124,15 @@ def check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, tot
)
-def check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, total_mpiio_write_operations, detected_files, file_map, dxt_mpiio=None):
- '''
+def check_mpi_collective_write_operation(
+ mpiio_coll_writes,
+ mpiio_indep_writes,
+ total_mpiio_write_operations,
+ detected_files,
+ file_map,
+ dxt_mpiio=None,
+):
+ """
Check whether application uses collective mpi write calls
Parameters:
@@ -1547,14 +2143,18 @@ def check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes,
independent write operations and percentage per file
required columns: ['id', 'absolute_indep_writes', 'percent_indep_writes']
file_map: file id and file name pairing
- '''
+ """
if mpiio_coll_writes == 0:
- if total_mpiio_write_operations and total_mpiio_write_operations > thresholds['collective_operations_absolute'][0]:
- thresholds['collective_operations_absolute'][1] = True
- issue = 'Application uses MPI-IO but it does not use collective write operations, instead it issues {} ({:.2f}%) independent write calls'.format(
+ if (
+ total_mpiio_write_operations
+ and total_mpiio_write_operations
+ > thresholds["collective_operations_absolute"][0]
+ ):
+ thresholds["collective_operations_absolute"][1] = True
+ issue = "Application uses MPI-IO but it does not use collective write operations, instead it issues {} ({:.2f}%) independent write calls".format(
mpiio_indep_writes,
- mpiio_indep_writes / total_mpiio_write_operations * 100
+ mpiio_indep_writes / total_mpiio_write_operations * 100,
)
detail = []
@@ -1564,62 +2164,79 @@ def check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes,
for index, row in detected_files.iterrows():
detail.append(
{
- 'message': '{} ({}%) independent writes to "{}"'.format(
- row['absolute_indep_writes'],
- row['percent_indep_writes'],
- file_map[int(row['id'])] if args.full_path else os.path.basename(file_map[int(row['id'])])
- )
+ "message": '{} ({}%) independent writes to "{}"'.format(
+ row["absolute_indep_writes"],
+ row["percent_indep_writes"],
+ file_map[int(row["id"])]
+ if args.full_path
+ else os.path.basename(file_map[int(row["id"])]),
+ )
}
)
# DXT Analysis
if args.backtrace:
start = time.time()
- temp = dxt_mpiio.loc[(dxt_mpiio['id'] == int(row['id'])) & (dxt_mpiio['rank'] == 1)]
- temp = temp['write_segments'].iloc[0]
- stack_memory_addresses = temp['stack_memory_addresses'].iloc[0]
- address = dxt_mpiio.iloc[0]['address_line_mapping']['address']
+ temp = dxt_mpiio.loc[
+ (dxt_mpiio["id"] == int(row["id"])) & (dxt_mpiio["rank"] == 1)
+ ]
+ temp = temp["write_segments"].iloc[0]
+ stack_memory_addresses = temp["stack_memory_addresses"].iloc[0]
+ address = dxt_mpiio.iloc[0]["address_line_mapping"]["address"]
res = set(list(address)) & set(stack_memory_addresses)
- backtrace = dxt_mpiio.iloc[0]['address_line_mapping'].loc[dxt_mpiio.iloc[0]['address_line_mapping']['address'].isin(res)]
+ backtrace = dxt_mpiio.iloc[0]["address_line_mapping"].loc[
+ dxt_mpiio.iloc[0]["address_line_mapping"]["address"].isin(res)
+ ]
detail.append(
{
- 'message': 'The backtrace information for these write call(s) is given below:'
+ "message": "The backtrace information for these write call(s) is given below:"
}
)
for index, row3 in backtrace.iterrows():
detail.append(
{
- 'message': '{}: {}'.format(
- row3['function_name'],
- row3['line_number']
- )
+ "message": "{}: {}".format(
+ row3["function_name"], row3["line_number"]
+ )
}
)
end = time.time()
time_taken = end - start
dxt_trigger_time += time_taken
-
+
if dxt_trigger_time > 0:
detail.append(
{
- 'message': 'Time taken to process this trigger: {}s'.format(round(dxt_trigger_time, 5))
+ "message": "Time taken to process this trigger: {}s".format(
+ round(dxt_trigger_time, 5)
+ )
}
)
recommendation = [
{
- 'message': 'Use collective write operations (e.g. MPI_File_write_all() or MPI_File_write_at_all()) and set one aggregator per compute node',
- 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-collective-write.c'), line_numbers=True, background_color='default')
+ "message": "Use collective write operations (e.g. MPI_File_write_all() or MPI_File_write_at_all()) and set one aggregator per compute node",
+ "sample": Syntax.from_path(
+ os.path.join(ROOT, "snippets/mpi-io-collective-write.c"),
+ line_numbers=True,
+ background_color="default",
+ ),
}
]
insights_operation.append(
- message(INSIGHTS_MPI_IO_NO_COLLECTIVE_WRITE_USAGE, TARGET_DEVELOPER, HIGH, issue, recommendation, detail)
+ message(
+ INSIGHTS_MPI_IO_NO_COLLECTIVE_WRITE_USAGE,
+ TARGET_DEVELOPER,
+ HIGH,
+ issue,
+ recommendation,
+ detail,
+ )
)
else:
- issue = 'Application uses MPI-IO and write data using {} ({:.2f}%) collective operations'.format(
- mpiio_coll_writes,
- mpiio_coll_writes / total_mpiio_write_operations * 100
+ issue = "Application uses MPI-IO and write data using {} ({:.2f}%) collective operations".format(
+ mpiio_coll_writes, mpiio_coll_writes / total_mpiio_write_operations * 100
)
insights_operation.append(
@@ -1627,8 +2244,10 @@ def check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes,
)
-def check_mpi_none_block_operation(mpiio_nb_reads, mpiio_nb_writes, has_hdf5_extension, modules):
- '''
+def check_mpi_none_block_operation(
+ mpiio_nb_reads, mpiio_nb_writes, has_hdf5_extension, modules
+):
+ """
Check whether application can benefit from non-blocking requests
Parameters:
@@ -1636,93 +2255,131 @@ def check_mpi_none_block_operation(mpiio_nb_reads, mpiio_nb_writes, has_hdf5_ext
mpiio_nb_writes: number of non-blocking mpi write operations
has_hdf5_extension: boolean value of whether the file in in hdf5 extension
modules: all different mudules been used in the application
- '''
+ """
if mpiio_nb_reads == 0:
- issue = 'Application could benefit from non-blocking (asynchronous) reads'
+ issue = "Application could benefit from non-blocking (asynchronous) reads"
recommendation = []
- if 'H5F' in modules or has_hdf5_extension:
+ if "H5F" in modules or has_hdf5_extension:
recommendation.append(
{
- 'message': 'Since you use HDF5, consider using the ASYNC I/O VOL connector (https://github.com/hpc-io/vol-async)',
- 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-vol-async-read.c'), line_numbers=True, background_color='default')
+ "message": "Since you use HDF5, consider using the ASYNC I/O VOL connector (https://github.com/hpc-io/vol-async)",
+ "sample": Syntax.from_path(
+ os.path.join(ROOT, "snippets/hdf5-vol-async-read.c"),
+ line_numbers=True,
+ background_color="default",
+ ),
}
)
- if 'MPI-IO' in modules:
+ if "MPI-IO" in modules:
recommendation.append(
{
- 'message': 'Since you use MPI-IO, consider non-blocking/asynchronous I/O operations', # (e.g., MPI_File_iread(), MPI_File_read_all_begin/end(), or MPI_File_read_at_all_begin/end())',
- 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-iread.c'), line_numbers=True, background_color='default')
+ "message": "Since you use MPI-IO, consider non-blocking/asynchronous I/O operations", # (e.g., MPI_File_iread(), MPI_File_read_all_begin/end(), or MPI_File_read_at_all_begin/end())',
+ "sample": Syntax.from_path(
+ os.path.join(ROOT, "snippets/mpi-io-iread.c"),
+ line_numbers=True,
+ background_color="default",
+ ),
}
)
insights_operation.append(
- message(INSIGHTS_MPI_IO_BLOCKING_READ_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation)
+ message(
+ INSIGHTS_MPI_IO_BLOCKING_READ_USAGE,
+ TARGET_DEVELOPER,
+ WARN,
+ issue,
+ recommendation,
+ )
)
if mpiio_nb_writes == 0:
- issue = 'Application could benefit from non-blocking (asynchronous) writes'
+ issue = "Application could benefit from non-blocking (asynchronous) writes"
recommendation = []
- if 'H5F' in modules or has_hdf5_extension:
+ if "H5F" in modules or has_hdf5_extension:
recommendation.append(
{
- 'message': 'Since you use HDF5, consider using the ASYNC I/O VOL connector (https://github.com/hpc-io/vol-async)',
- 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/hdf5-vol-async-write.c'), line_numbers=True, background_color='default')
+ "message": "Since you use HDF5, consider using the ASYNC I/O VOL connector (https://github.com/hpc-io/vol-async)",
+ "sample": Syntax.from_path(
+ os.path.join(ROOT, "snippets/hdf5-vol-async-write.c"),
+ line_numbers=True,
+ background_color="default",
+ ),
}
)
- if 'MPI-IO' in modules:
+ if "MPI-IO" in modules:
recommendation.append(
{
- 'message': 'Since you use MPI-IO, consider non-blocking/asynchronous I/O operations', # (e.g., MPI_File_iwrite(), MPI_File_write_all_begin/end(), or MPI_File_write_at_all_begin/end())',
- 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-iwrite.c'), line_numbers=True, background_color='default')
+ "message": "Since you use MPI-IO, consider non-blocking/asynchronous I/O operations", # (e.g., MPI_File_iwrite(), MPI_File_write_all_begin/end(), or MPI_File_write_at_all_begin/end())',
+ "sample": Syntax.from_path(
+ os.path.join(ROOT, "snippets/mpi-io-iwrite.c"),
+ line_numbers=True,
+ background_color="default",
+ ),
}
)
insights_operation.append(
- message(INSIGHTS_MPI_IO_BLOCKING_WRITE_USAGE, TARGET_DEVELOPER, WARN, issue, recommendation)
+ message(
+ INSIGHTS_MPI_IO_BLOCKING_WRITE_USAGE,
+ TARGET_DEVELOPER,
+ WARN,
+ issue,
+ recommendation,
+ )
)
def check_mpi_aggregator(cb_nodes, NUMBER_OF_COMPUTE_NODES):
- '''
+ """
Check whether application has used inter-node aggregators
Parameters:
- cb_nodes:
+ cb_nodes:
NUMBER_OF_COMPUTE_NODES:
- '''
+ """
if cb_nodes > NUMBER_OF_COMPUTE_NODES:
- issue = 'Application is using inter-node aggregators (which require network communication)'
+ issue = "Application is using inter-node aggregators (which require network communication)"
recommendation = [
{
- 'message': 'Set the MPI hints for the number of aggregators as one per compute node (e.g., cb_nodes={})'.format(
+ "message": "Set the MPI hints for the number of aggregators as one per compute node (e.g., cb_nodes={})".format(
NUMBER_OF_COMPUTE_NODES
),
- 'sample': Syntax.from_path(os.path.join(ROOT, 'snippets/mpi-io-hints.bash'), line_numbers=True, background_color='default')
+ "sample": Syntax.from_path(
+ os.path.join(ROOT, "snippets/mpi-io-hints.bash"),
+ line_numbers=True,
+ background_color="default",
+ ),
}
]
insights_operation.append(
- message(INSIGHTS_MPI_IO_AGGREGATORS_INTER, TARGET_USER, HIGH, issue, recommendation)
+ message(
+ INSIGHTS_MPI_IO_AGGREGATORS_INTER,
+ TARGET_USER,
+ HIGH,
+ issue,
+ recommendation,
+ )
)
if cb_nodes < NUMBER_OF_COMPUTE_NODES:
- issue = 'Application is using intra-node aggregators'
+ issue = "Application is using intra-node aggregators"
insights_operation.append(
message(INSIGHTS_MPI_IO_AGGREGATORS_INTRA, TARGET_USER, OK, issue)
)
if cb_nodes == NUMBER_OF_COMPUTE_NODES:
- issue = 'Application is using one aggregator per compute node'
+ issue = "Application is using one aggregator per compute node"
insights_operation.append(
message(INSIGHTS_MPI_IO_AGGREGATORS_OK, TARGET_USER, OK, issue)
@@ -1731,65 +2388,75 @@ def check_mpi_aggregator(cb_nodes, NUMBER_OF_COMPUTE_NODES):
# Layout and export
+
def display_content(console):
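+ # Render each non-empty insight bucket (metadata, operations, DXT) in its
+ # own Rich panel.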
if insights_metadata:
console.print(
Panel(
- Padding(
- Group(
- *insights_metadata
- ),
- (1, 1)
- ),
- title='METADATA',
- title_align='left'
+ Padding(Group(*insights_metadata), (1, 1)),
+ title="METADATA",
+ title_align="left",
)
)
if insights_operation:
console.print(
Panel(
- Padding(
- Group(
- *insights_operation
- ),
- (1, 1)
- ),
- title='OPERATIONS',
- title_align='left'
+ Padding(Group(*insights_operation), (1, 1)),
+ title="OPERATIONS",
+ title_align="left",
)
)
if insights_dxt:
console.print(
Panel(
- Padding(
- Group(
- *insights_dxt
- ),
- (1, 1)
- ),
- title='DXT',
- title_align='left'
+ Padding(Group(*insights_dxt), (1, 1)), title="DXT", title_align="left"
)
)
def display_thresholds(console):
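+ # Human-readable description for every threshold; these lines feed the
+ # THRESHOLDS panel below.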
tholdMessage = {
- 'imbalance_operations': 'Minimum imbalance requests ratio: [white]{}%[/white]'.format(thresholds['imbalance_operations'][0] * 100),
- 'small_bytes': 'Minimum size of a small request: [white]{} bytes[/white]'.format(thresholds['small_bytes'][0]),
- 'small_requests': 'Maximum small requests ratio: [white]{}%[/white]'.format(thresholds['small_requests'][0] * 100),
- 'small_requests_absolute': 'Maximum small requests: [white]{}[/white]'.format(thresholds['small_requests_absolute'][0]),
- 'misaligned_requests': 'Maximum misaligned requests ratio: [white]{}%[/white]'.format(thresholds['misaligned_requests'][0] * 100),
- 'random_operations': 'Maximum random request ratio: [white]{}%[/white]'.format(thresholds['random_operations'][0] * 100),
- 'random_operations_absolute': 'Maximum random requests: [white]{}[/white]'.format(thresholds['random_operations_absolute'][0]),
- 'metadata_time_rank': 'Maximum metadata process time per rank: [white]{} seconds[/white]'.format(thresholds['metadata_time_rank'][0]),
- 'imbalance_size': 'Maximum read/write size difference ratio: [white]{}%[/white]'.format(thresholds['imbalance_size'][0] * 100),
- 'imbalance_stragglers': 'Maximum ratio difference among ranks: [white]{}%[/white]'.format(thresholds['imbalance_stragglers'][0] * 100),
- 'interface_stdio': 'Maximum STDIO usage ratio: [white]{}%[/white]'.format(thresholds['interface_stdio'][0] * 100),
- 'collective_operations': 'Minimum MPI collective operation usage ratio: [white]{}%[/white]'.format(thresholds['collective_operations'][0] * 100),
- 'collective_operations_absolute': 'Minimum MPI collective operations: [white]{}[/white]'.format(thresholds['collective_operations_absolute'][0]),
+ "imbalance_operations": "Minimum imbalance requests ratio: [white]{}%[/white]".format(
+ thresholds["imbalance_operations"][0] * 100
+ ),
+ "small_bytes": "Minimum size of a small request: [white]{} bytes[/white]".format(
+ thresholds["small_bytes"][0]
+ ),
+ "small_requests": "Maximum small requests ratio: [white]{}%[/white]".format(
+ thresholds["small_requests"][0] * 100
+ ),
+ "small_requests_absolute": "Maximum small requests: [white]{}[/white]".format(
+ thresholds["small_requests_absolute"][0]
+ ),
+ "misaligned_requests": "Maximum misaligned requests ratio: [white]{}%[/white]".format(
+ thresholds["misaligned_requests"][0] * 100
+ ),
+ "random_operations": "Maximum random request ratio: [white]{}%[/white]".format(
+ thresholds["random_operations"][0] * 100
+ ),
+ "random_operations_absolute": "Maximum random requests: [white]{}[/white]".format(
+ thresholds["random_operations_absolute"][0]
+ ),
+ "metadata_time_rank": "Maximum metadata process time per rank: [white]{} seconds[/white]".format(
+ thresholds["metadata_time_rank"][0]
+ ),
+ "imbalance_size": "Maximum read/write size difference ratio: [white]{}%[/white]".format(
+ thresholds["imbalance_size"][0] * 100
+ ),
+ "imbalance_stragglers": "Maximum ratio difference among ranks: [white]{}%[/white]".format(
+ thresholds["imbalance_stragglers"][0] * 100
+ ),
+ "interface_stdio": "Maximum STDIO usage ratio: [white]{}%[/white]".format(
+ thresholds["interface_stdio"][0] * 100
+ ),
+ "collective_operations": "Minimum MPI collective operation usage ratio: [white]{}%[/white]".format(
+ thresholds["collective_operations"][0] * 100
+ ),
+ "collective_operations_absolute": "Minimum MPI collective operations: [white]{}[/white]".format(
+ thresholds["collective_operations_absolute"][0]
+ ),
}
toBeAppend = []
@@ -1802,24 +2469,19 @@ def display_thresholds(console):
toBeAppend.append(message)
console.print(
- Panel(
- '\n'.join(toBeAppend),
- title='THRESHOLDS',
- title_align='left',
- padding=1
- )
+ Panel("\n".join(toBeAppend), title="THRESHOLDS", title_align="left", padding=1)
)
def display_footer(console, insights_start_time, insights_end_time):
console.print(
Panel(
- ' {} | [white]LBNL[/white] | [white]Drishti report generated at {} in[/white] {:.3f} seconds'.format(
+ " {} | [white]LBNL[/white] | [white]Drishti report generated at {} in[/white] {:.3f} seconds".format(
datetime.datetime.now().year,
datetime.datetime.now(),
- insights_end_time - insights_start_time
+ insights_end_time - insights_start_time,
),
- box=box.SIMPLE
+ box=box.SIMPLE,
)
)
@@ -1828,37 +2490,28 @@ def export_html(console, export_dir, trace_name):
if not args.export_html:
return
- os.makedirs(export_dir, exist_ok=True) # Ensure export directory exists
+ os.makedirs(export_dir, exist_ok=True) # Ensure export directory exists
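+ # clear=False preserves the console buffer, presumably so that later
+ # exports (e.g. SVG) still see the rendered report.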
filepath = os.path.join(export_dir, f"{trace_name}.html")
- console.save_html(
- filepath,
- theme=set_export_theme(),
- clear=False
- )
+ console.save_html(filepath, theme=set_export_theme(), clear=False)
def export_svg(console, export_dir, trace_name):
if not args.export_svg:
return
-
- os.makedirs(export_dir, exist_ok=True) # Ensure export directory exists
+
+ os.makedirs(export_dir, exist_ok=True) # Ensure export directory exists
filepath = os.path.join(export_dir, f"{trace_name}.svg")
- console.save_svg(
- filepath,
- title='Drishti',
- theme=set_export_theme(),
- clear=False
- )
+ console.save_svg(filepath, title="Drishti", theme=set_export_theme(), clear=False)
def export_csv(export_dir, trace_name, jobid=None):
if not args.export_csv:
return
-
+
issues = [
- 'JOB',
+ "JOB",
INSIGHTS_STDIO_HIGH_USAGE,
INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE,
INSIGHTS_POSIX_READ_COUNT_INTENSIVE,
@@ -1890,23 +2543,21 @@ def export_csv(export_dir, trace_name, jobid=None):
INSIGHTS_MPI_IO_BLOCKING_WRITE_USAGE,
INSIGHTS_MPI_IO_AGGREGATORS_INTRA,
INSIGHTS_MPI_IO_AGGREGATORS_INTER,
- INSIGHTS_MPI_IO_AGGREGATORS_OK
+ INSIGHTS_MPI_IO_AGGREGATORS_OK,
]
if codes:
issues.extend(codes)
detected_issues = dict.fromkeys(issues, False)
- detected_issues['JOB'] = jobid
+ detected_issues["JOB"] = jobid
for report in csv_report:
detected_issues[report] = True
-
- os.makedirs(export_dir, exist_ok=True) # Ensure export directory exists
+ os.makedirs(export_dir, exist_ok=True) # Ensure export directory exists
filepath = os.path.join(export_dir, f"{trace_name}.csv")
- with open(filepath, 'w') as f:
+ with open(filepath, "w") as f:
w = csv.writer(f)
w.writerow(detected_issues.keys())
w.writerow(detected_issues.values())
-
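For context beyond the patch itself: the reworked export_csv still emits a two-row CSV, a header of issue names followed by a row of values, with the job id in the JOB column and True/False flags for every insight code. Below is a minimal, self-contained sketch of that layout; the insight-code names and job id are made up for illustration, the real constants live in drishti.includes.

import csv
import io

# Illustrative stand-ins for the INSIGHTS_* constants; these names are hypothetical.
issues = ["JOB", "I01-STDIO-HIGH-USAGE", "I02-POSIX-SMALL-READS"]
csv_report = ["I02-POSIX-SMALL-READS"]  # insight codes triggered for this trace

detected_issues = dict.fromkeys(issues, False)
detected_issues["JOB"] = "123456"  # hypothetical job id
for code in csv_report:
    detected_issues[code] = True

buf = io.StringIO()
w = csv.writer(buf)
w.writerow(detected_issues.keys())    # header row: JOB plus one column per insight
w.writerow(detected_issues.values())  # value row: job id, then True/False flags
print(buf.getvalue())
# JOB,I01-STDIO-HIGH-USAGE,I02-POSIX-SMALL-READS
# 123456,False,True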
diff --git a/drishti/includes/parser.py b/drishti/includes/parser.py
index 28dcd63..ed58b1d 100644
--- a/drishti/includes/parser.py
+++ b/drishti/includes/parser.py
@@ -1,128 +1,120 @@
import argparse
-parser = argparse.ArgumentParser(
- description='Drishti: '
-)
+parser = argparse.ArgumentParser(description="Drishti: ")
parser.add_argument(
- 'log_path',
- help='Input .darshan file or recorder folder'
+ "log_paths", nargs="+", help="Input .darshan file or recorder folder"
)
parser.add_argument(
- '--issues',
+ "--issues",
default=False,
- action='store_true',
- dest='only_issues',
- help='Only displays the detected issues and hides the recommendations'
+ action="store_true",
+ dest="only_issues",
+ help="Only displays the detected issues and hides the recommendations",
)
parser.add_argument(
- '--html',
+ "--html",
default=False,
- action='store_true',
- dest='export_html',
- help='Export the report as an HTML page'
+ action="store_true",
+ dest="export_html",
+ help="Export the report as an HTML page",
)
parser.add_argument(
- '--svg',
+ "--svg",
default=False,
- action='store_true',
- dest='export_svg',
- help='Export the report as an SVG image'
+ action="store_true",
+ dest="export_svg",
+ help="Export the report as an SVG image",
)
parser.add_argument(
- '--light',
+ "--light",
default=False,
- action='store_true',
- dest='export_theme_light',
- help='Use a light theme for the report when generating files'
+ action="store_true",
+ dest="export_theme_light",
+ help="Use a light theme for the report when generating files",
)
parser.add_argument(
- '--size',
+ "--size",
default=False,
- dest='export_size',
- help='Console width used for the report and generated files'
+ dest="export_size",
+ help="Console width used for the report and generated files",
)
parser.add_argument(
- '--verbose',
+ "--verbose",
default=False,
- action='store_true',
- dest='verbose',
- help='Display extended details for the recommendations'
+ action="store_true",
+ dest="verbose",
+ help="Display extended details for the recommendations",
)
parser.add_argument(
- '--threshold',
+ "--threshold",
default=False,
- action='store_true',
- dest='thold',
- help='Display all thresholds used for the report'
+ action="store_true",
+ dest="thold",
+ help="Display all thresholds used for the report",
)
parser.add_argument(
- '--code',
+ "--code",
default=False,
- action='store_true',
- dest='code',
- help='Display insights identification code'
+ action="store_true",
+ dest="code",
+ help="Display insights identification code",
)
parser.add_argument(
- '--backtrace',
+ "--backtrace",
default=False,
- action='store_true',
- dest='backtrace',
- help='Enable DXT insights and backtrace'
+ action="store_true",
+ dest="backtrace",
+ help="Enable DXT insights and backtrace",
)
parser.add_argument(
- '--path',
+ "--path",
default=False,
- action='store_true',
- dest='full_path',
- help='Display the full file path for the files that triggered the issue'
+ action="store_true",
+ dest="full_path",
+ help="Display the full file path for the files that triggered the issue",
)
parser.add_argument(
- '--csv',
+ "--csv",
default=False,
- action='store_true',
- dest='export_csv',
- help='Export a CSV with the code of all issues that were triggered'
+ action="store_true",
+ dest="export_csv",
+ help="Export a CSV with the code of all issues that were triggered",
)
parser.add_argument(
- '--export_dir',
+ "--export_dir",
default="",
- dest='export_dir',
- help='Specify the directory prefix for the output files (if any)'
+ dest="export_dir",
+ help="Specify the directory prefix for the output files (if any)",
)
-parser.add_argument(
- '--json',
- default=False,
- dest='json',
- help=argparse.SUPPRESS
-)
+parser.add_argument("--json", default=False, dest="json", help=argparse.SUPPRESS)
parser.add_argument(
- '--split',
+ "--split",
default=False,
- action='store_true',
- dest='split_files',
- help='Split the files and generate report for each file'
+ action="store_true",
+ dest="split_files",
+ help="Split the files and generate report for each file",
)
parser.add_argument(
- '--config',
+ "--config",
default=False,
- dest='config',
- help='Enable thresholds read from json file'
+ dest="config",
+ help="Enable thresholds read from json file",
)
args = parser.parse_args()
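The net effect of the parser changes is that the positional argument now accepts one or more paths. The following sketch shows how the new log_paths positional behaves; only the two arguments relevant to the change are reproduced, and the argument values are illustrative only.

import argparse

# Mirrors the log_paths positional and one flag from parser.py; other flags omitted.
p = argparse.ArgumentParser(description="Drishti: ")
p.add_argument("log_paths", nargs="+", help="Input .darshan file or recorder folder")
p.add_argument("--split", default=False, action="store_true", dest="split_files")

args = p.parse_args(["run1.darshan", "run2.darshan", "--split"])
assert args.log_paths == ["run1.darshan", "run2.darshan"]
assert args.split_files is True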
diff --git a/drishti/reporter.py b/drishti/reporter.py
index 8455040..a6a8401 100644
--- a/drishti/reporter.py
+++ b/drishti/reporter.py
@@ -3,10 +3,12 @@
import os
import sys
from subprocess import call
-from drishti.includes.parser import *
+from typing import List, Optional
+# from includes.parser import * # imports {'parser', 'args', 'argparse'}
+from drishti.includes.parser import args
-'''
+"""
|- handler_darshan -|
| |
reporter -> /handlers -> |- handler_recorder -| -|
@@ -15,8 +17,7 @@
________________________________________________|
|
|-----> /includes -> module -> config -> parser
-'''
-
+"""
LOG_TYPE_DARSHAN = 0
LOG_TYPE_RECORDER = 1
@@ -26,30 +27,57 @@ def clear():
"""
Clear the screen with the comment call based on the operating system.
"""
- _ = call('clear' if os.name == 'posix' else 'cls')
+ _ = call("clear" if os.name == "posix" else "cls")
+
+
+def check_log_type(paths: List[str]) -> Optional[int]:
+ is_darshan = True
+ is_recorder = True
+ multiple_logs = len(paths) > 1
+ for path in paths:
+ if path.endswith(".darshan"):
+ if not os.path.isfile(path):
+ print("Unable to open .darshan file.")
+ sys.exit(os.EX_NOINPUT)
+ else:
+ # a valid .darshan file: is_darshan stays as-is
+ is_recorder = False
+ else:  # check whether it is a valid recorder folder
+ if not os.path.isdir(path):
+ print("Unable to open recorder folder.")
+ sys.exit(os.EX_NOINPUT)
+ else:
+ # a directory: is_recorder stays as-is
+ is_darshan = False
-def check_log_type(path):
- if path.endswith('.darshan'):
- if not os.path.isfile(path):
- print('Unable to open .darshan file.')
+ if multiple_logs:
+ if is_darshan:
+ return LOG_TYPE_DARSHAN
+ else:
+ print("Only .darshan files are supported for multiple logs.")
sys.exit(os.EX_NOINPUT)
- else: return LOG_TYPE_DARSHAN
- else: # check whether is a valid recorder log
- if not os.path.isdir(path):
- print('Unable to open recorder folder.')
+ else:
+ if is_darshan and not is_recorder:
+ return LOG_TYPE_DARSHAN
+ elif is_recorder and not is_darshan:
+ return LOG_TYPE_RECORDER
+ else:
+ print("Unable to reliably determine the log type.")
sys.exit(os.EX_NOINPUT)
- else: return LOG_TYPE_RECORDER
def main():
- log_type = check_log_type(args.log_path)
-
+ log_type = check_log_type(args.log_paths)
+
if log_type == LOG_TYPE_DARSHAN:
from drishti.handlers.handle_darshan import handler
elif log_type == LOG_TYPE_RECORDER:
from drishti.handlers.handle_recorder import handler
-
+
handler()
+
+if __name__ == "__main__":
+ main()
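Because drishti.includes.parser calls parse_args() at import time, exercising check_log_type outside the CLI is awkward. The sketch below mirrors its decision rules purely for illustration, leaving out the sys.exit error paths and the file-existence checks; the paths used are hypothetical.

from typing import List, Optional

LOG_TYPE_DARSHAN = 0
LOG_TYPE_RECORDER = 1

def classify(paths: List[str]) -> Optional[int]:
    # Mirror of check_log_type: a batch must be all .darshan files;
    # a single non-.darshan path is treated as a recorder folder.
    if all(p.endswith(".darshan") for p in paths):
        return LOG_TYPE_DARSHAN
    if len(paths) == 1:
        return LOG_TYPE_RECORDER
    return None  # mixed or multiple non-.darshan inputs are rejected

assert classify(["a.darshan", "b.darshan"]) == LOG_TYPE_DARSHAN
assert classify(["recorder_logs/"]) == LOG_TYPE_RECORDER
assert classify(["a.darshan", "recorder_logs/"]) is None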
diff --git a/setup.py b/setup.py
index a93a8ce..c3b9d6c 100644
--- a/setup.py
+++ b/setup.py
@@ -23,6 +23,13 @@
'rich==12.5.1',
'recorder-utils',
],
+ extras_require={
+ 'dev': [
+ 'ruff',
+ 'isort',
+ 'mypy'
+ ],
+ },
packages=find_packages(),
package_data={
'drishti.includes': [