[Open-Source Internship] Publish a native MindSpore-based paper on arXiv (4) #203

Open · wants to merge 1 commit into base: master

21 changes: 21 additions & 0 deletions research/arxiv_papers/MindFlow/README.md
@@ -0,0 +1,21 @@

## Installation

1. **Clone the repository**
   Download the project files or clone this repository.

2. **Install dependencies**
   Use `pip` to install the packages listed in `requirements.txt`.

```bash
pip install -r requirements.txt
```

## Usage

To run the project, use the following command:

```bash
python main.py
```
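
Note: `main.py` currently loads the NF-BoT-IoT parquet from a hardcoded local path. Before running, edit the call to `load_nf_bot_iot_data` in `main.py` to point at your copy of the dataset; a minimal sketch (the path below is illustrative, not part of the repo):

```python
# In main.py -- replace the placeholder path with your local dataset location:
features, labels, label_encoder = load_nf_bot_iot_data("data/NF-BoT-IoT.parquet")
```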

## Thanks to the MindSpore community for their support
49 changes: 49 additions & 0 deletions research/arxiv_papers/MindFlow/model/data_loader.py
@@ -0,0 +1,49 @@
import os
import platform
import mindspore.dataset as ds


class SafeTrafficDataset:
def __init__(self, data, labels, seq_length=10):
self.data = data
self.labels = labels
self.seq_length = seq_length

if len(data) != len(labels):
raise ValueError("数据与标签长度不一致")
if seq_length < 1:
raise ValueError("序列长度必须大于0")

def __getitem__(self, index):
if index + self.seq_length > len(self.data):
            raise IndexError("Index exceeds the data range")
return (self.data[index:index + self.seq_length],
self.labels[index + self.seq_length - 1])

def __len__(self):
return len(self.data) - self.seq_length + 1


def create_dataloader(data, labels, seq_length=10, batch_size=256, shuffle=True):
"""创建数据加载器

Args:
data (np.ndarray): 特征数据
labels (np.ndarray): 标签数据
seq_length (int): 序列长度
batch_size (int): 批次大小
shuffle (bool): 是否打乱数据

Returns:
ds.GeneratorDataset: 数据加载器
"""
cpu_count = os.cpu_count() or 1
    safe_num_workers = max(1, min(cpu_count - 1, 4))  # cap at 4 workers

dataset = ds.GeneratorDataset(
source=SafeTrafficDataset(data, labels, seq_length),
column_names=["data", "label"],
shuffle=shuffle,
num_parallel_workers=1 if platform.system() == 'Windows' else safe_num_workers
)
return dataset.batch(batch_size, drop_remainder=True)
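
A minimal usage sketch (not part of this PR) that exercises `create_dataloader` on synthetic arrays to confirm the emitted batch shapes; the sample and feature counts are illustrative:

```python
import numpy as np
from data_loader import create_dataloader

# Synthetic data: 1000 samples with 8 features each (illustrative sizes).
features = np.random.rand(1000, 8).astype(np.float32)
labels = np.random.randint(0, 2, size=1000).astype(np.float32)

loader = create_dataloader(features, labels, seq_length=10, batch_size=32)
for batch_data, batch_labels in loader:
    print(batch_data.shape, batch_labels.shape)  # expect (32, 10, 8) and (32,)
    break
```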
37 changes: 37 additions & 0 deletions research/arxiv_papers/MindFlow/model/data_preprocessing.py
@@ -0,0 +1,37 @@
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder


def load_nf_bot_iot_data(file_path):
"""加载并预处理NF-BoT-IoT数据集

Args:
file_path (str): 数据文件路径

Returns:
tuple: (features, labels, label_encoder)
"""
df = pd.read_parquet(file_path)

if 'Label' not in df.columns:
raise ValueError("数据集中必须包含'Label'列")

labels = df['Label'].copy()
features = df.drop(columns=['Label'])

    # Encode labels and cast to float32 for BCELoss
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels).astype(np.float32)

    # Coerce feature columns to numeric; label-encode any non-numeric column
for col in features.columns:
try:
features[col] = pd.to_numeric(features[col], errors='raise')
except (ValueError, TypeError):
le = LabelEncoder()
features[col] = le.fit_transform(features[col])

    # Standardize features and cast to float32
features = StandardScaler().fit_transform(features).astype(np.float32)
return features, labels, label_encoder
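
A hedged sketch (not part of this PR) of the expected call pattern; the parquet filename is a placeholder:

```python
from data_preprocessing import load_nf_bot_iot_data

# The filename is a placeholder; supply your local copy of the dataset.
features, labels, label_encoder = load_nf_bot_iot_data("NF-BoT-IoT.parquet")
print(features.shape, features.dtype)  # (n_samples, n_features), float32
print(label_encoder.classes_)          # the original label values
```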
86 changes: 86 additions & 0 deletions research/arxiv_papers/MindFlow/model/main.py
@@ -0,0 +1,86 @@
import gc
import sys
import psutil
import numpy as np
import mindspore.nn as nn
import mindspore as ms
from sklearn.model_selection import train_test_split
from data_preprocessing import load_nf_bot_iot_data
from data_loader import create_dataloader
from model import CyberDefenseModel
from train_eval import AdaptiveTrainer, evaluate, generate_predictions_csv

def main():
    # Memory snapshot at startup
    print(f"Initial memory usage: {psutil.virtual_memory().used // 1024 // 1024} MB")

    # Load the data
try:
features, labels, label_encoder = load_nf_bot_iot_data(r"D:\pythonpro\MindSpore\data\archive\NF-BoT-IoT.parquet")
    except Exception as e:
        print(f"Failed to load data: {e}")
        sys.exit(1)

    # Split into train/validation/test (60/20/20)
try:
X_temp, X_test, y_temp, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)
    except ValueError as e:
        print(f"Dataset split error: {e}")
        sys.exit(1)

    # Build the data loaders
seq_length = 10
batch_size = 256
try:
train_loader = create_dataloader(X_train, y_train, seq_length=seq_length, batch_size=batch_size)
val_loader = create_dataloader(X_val, y_val, seq_length=seq_length, batch_size=batch_size, shuffle=False)
test_loader = create_dataloader(X_test, y_test, seq_length=seq_length, batch_size=batch_size, shuffle=False)
    except Exception as e:
        print(f"Failed to create data loaders: {e}")
        sys.exit(1)

    # Initialize model, loss, optimizer, and trainer
model = CyberDefenseModel(input_dim=X_train.shape[1])
loss_fn = nn.BCELoss()
optimizer = nn.Adam(model.trainable_params(), learning_rate=0.0001)
trainer = AdaptiveTrainer(model, loss_fn, optimizer)

    # Training loop
best_f1 = 0
for epoch in range(20):
model.set_train()
epoch_loss = 0
for batch_data, batch_labels in train_loader:
loss = trainer.train_step(batch_data, batch_labels)
epoch_loss += loss.asnumpy()

if (epoch + 1) % 5 == 0:
trainer.adjust_threshold(val_loader)
val_metrics = evaluate(model, val_loader)

if val_metrics['F1'] > best_f1:
best_f1 = val_metrics['F1']
ms.save_checkpoint(model, "best_model.ckpt")

print(f"[Epoch {epoch + 1}] 阈值: {model.threshold.asnumpy()[0]:.4f} | "
f"验证集 F1: {val_metrics['F1']:.4f}")

print(f"Epoch {epoch + 1}: 训练损失={epoch_loss / len(train_loader):.4f}")

    # Final evaluation on the test set
ms.load_param_into_net(model, ms.load_checkpoint("best_model.ckpt"))
test_metrics = evaluate(model, test_loader)
print("\n=== 测试结果 ===")
print(f"准确率: {test_metrics['Accuracy']:.4f}")
print(f"精确率: {test_metrics['Precision']:.4f}")
print(f"召回率: {test_metrics['Recall']:.4f}")
print(f"F1分数: {test_metrics['F1']:.4f}")

    # Write test predictions to a CSV file
generate_predictions_csv(model, test_loader, label_encoder, output_path="test_predictions.csv")

    # Free memory after training
gc.collect()

if __name__ == "__main__":
main()
33 changes: 33 additions & 0 deletions research/arxiv_papers/MindFlow/model/model.py
@@ -0,0 +1,33 @@
import mindspore as ms
import mindspore.nn as nn
import mindspore.ops as ops
from mindspore import Tensor

class CyberDefenseModel(nn.Cell):
def __init__(self, input_dim):
super().__init__()
self.cnn = nn.SequentialCell(
nn.Conv1d(input_dim, 16, kernel_size=3, padding=1, pad_mode='pad'),
nn.ReLU(),
nn.MaxPool1d(2, stride=2),
nn.Conv1d(16, 32, kernel_size=3, padding=1, pad_mode='pad'),
nn.ReLU(),
nn.MaxPool1d(5, stride=5)
)

self.lstm = nn.LSTM(32, 64, bidirectional=True, batch_first=True)
self.threshold = ms.Parameter(Tensor([0.6], ms.float32), requires_grad=False)
self.fc = nn.SequentialCell(
nn.Dense(128, 32),
nn.ReLU(),
nn.Dense(32, 1)
)

    def construct(self, x):
        # x: (batch, seq_len, features) -> (batch, channels, seq_len) for Conv1d
        x = ops.transpose(x, (0, 2, 1))
        cnn_out = self.cnn(x)
        # Back to (batch, reduced_seq_len, channels) for the LSTM.
        cnn_out = ops.transpose(cnn_out, (0, 2, 1))
        lstm_out, _ = self.lstm(cnn_out)
        last_out = lstm_out[:, -1, :]  # last step of the bidirectional LSTM
        logits = self.fc(last_out).squeeze(-1)  # drop only the trailing axis
        return ops.sigmoid(logits), self.threshold
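
A minimal shape-check sketch (not part of this PR). With `seq_length=10`, the two pooling stages (÷2, then ÷5) collapse the sequence to length 1 before the LSTM, so at this configuration the LSTM sees a single time step; `input_dim=8` below is an illustrative feature count:

```python
import numpy as np
import mindspore as ms
from model import CyberDefenseModel

model = CyberDefenseModel(input_dim=8)
x = ms.Tensor(np.random.rand(4, 10, 8).astype(np.float32))  # (batch, seq, features)
probs, threshold = model(x)
print(probs.shape, threshold.asnumpy())  # expect (4,) and [0.6]
```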
74 changes: 74 additions & 0 deletions research/arxiv_papers/MindFlow/model/train_eval.py
@@ -0,0 +1,74 @@
import numpy as np
import pandas as pd
import mindspore as ms
import mindspore.nn as nn
import mindspore.ops as ops

class AdaptiveTrainer:
def __init__(self, model, loss_fn, optimizer):
self.model = model
self.loss_fn = loss_fn
self.optimizer = optimizer
self.grad_fn = None

def forward_fn(self, data, labels):
preds, _ = self.model(data)
return self.loss_fn(preds, labels)

    def adjust_threshold(self, val_loader):
        # Nudge the threshold up/down in proportion to how far the validation
        # F1 is from 0.9, then clip to [0.4, 0.7]. set_data keeps `threshold`
        # a Parameter instead of overwriting it with a plain Tensor.
        metrics = evaluate(self.model, val_loader, use_threshold=False)
        new_threshold = self.model.threshold * (1 + 0.1 * (metrics['F1'] - 0.9))
        self.model.threshold.set_data(ops.clip_by_value(new_threshold, 0.4, 0.7))

def train_step(self, data, labels):
if self.grad_fn is None:
self.grad_fn = ms.value_and_grad(self.forward_fn, None, self.optimizer.parameters)
loss, grads = self.grad_fn(data, labels)
self.optimizer(grads)
return loss

def evaluate(model, loader, use_threshold=True):
    """Compute Accuracy/Precision/Recall/F1 over a loader, using the model's
    adaptive threshold or a fixed 0.5 cutoff when use_threshold is False."""
model.set_train(False)
y_true, y_pred = [], []
threshold = model.threshold.asnumpy()[0] if use_threshold else 0.5

for batch_data, batch_labels in loader:
preds, _ = model(batch_data)
y_true.extend(batch_labels.asnumpy().tolist())
y_pred.extend((preds.asnumpy() > threshold).astype(int).tolist())

y_true, y_pred = np.array(y_true), np.array(y_pred)
TP = np.sum((y_pred == 1) & (y_true == 1))
FP = np.sum((y_pred == 1) & (y_true == 0))
TN = np.sum((y_pred == 0) & (y_true == 0))
FN = np.sum((y_pred == 0) & (y_true == 1))

precision = TP / (TP + FP) if (TP + FP) > 0 else 0
recall = TP / (TP + FN) if (TP + FN) > 0 else 0
return {
"Accuracy": (TP + TN) / len(y_true),
"Precision": precision,
"Recall": recall,
"F1": 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
}

def generate_predictions_csv(model, loader, label_encoder, output_path="predictions.csv"):
model.set_train(False)
y_true, y_pred = [], []

for batch_data, batch_labels in loader:
preds, _ = model(batch_data)
y_true.extend(batch_labels.asnumpy().tolist())
        y_pred.extend((preds.asnumpy() > 0.5).astype(int).tolist())  # fixed 0.5 cutoff here

y_true = np.array(y_true).astype(int)
y_pred = np.array(y_pred).astype(int)
y_true_labels = label_encoder.inverse_transform(y_true)
y_pred_labels = label_encoder.inverse_transform(y_pred)

df = pd.DataFrame({
'True_Label': y_true_labels,
'Predicted_Label': y_pred_labels
})
df.to_csv(output_path, index=False)
print(f"预测结果已保存到 {output_path}")
5 changes: 5 additions & 0 deletions research/arxiv_papers/MindFlow/requirements.txt
@@ -0,0 +1,5 @@
mindspore
numpy
pandas
scikit-learn
psutil
Binary file modified research/huawei-noah/DRD/preprocessing/svm_rank_classify.exe
Binary file not shown.
Binary file modified research/huawei-noah/DRD/preprocessing/svm_rank_learn.exe
Binary file not shown.