Skip to content

Commit 6f39e5c

Browse files
authored
Merge pull request #81 from R1c3P0T5/jerry/backend/all_on_zip
Jerry/backend/script and table
2 parents 7df5aaa + f31adac commit 6f39e5c

15 files changed

Lines changed: 1175 additions & 242 deletions

backend/README.md

Lines changed: 37 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -22,15 +22,22 @@ uv run alembic upgrade head
2222

2323
## 匯入樣本(`scripts/load_malware_seeds.py`)
2424

25-
1. 將含有惡意樣本的 zip 檔放到 `backend/data/samples/`
26-
- Zip 預設密碼為 `infected`
27-
- 單一 zip 可以包含多個樣本檔案
28-
2. (選用)在 `backend/data/malwareseed/` 內建立與 zip 同名的 JSON(`<zip_name>.json`)。
29-
- 每個條目以 `sha256` 為主鍵,對應 zip 內樣本檔案的雜湊
30-
- 未提供的欄位會自動套用預設值(platform = `Unknown`、family = `Unknown`、source = `ZIP:<zipname>`
31-
25+
1. 將含有惡意樣本與對應 JSON 的 zip 放到 `backend/data/samples/`
26+
- Zip 密碼為 `infected`
27+
2. JSON 條目以 `sha256` 作為主鍵,未填欄位會套用預設值
28+
3. JSON 檔名統一為 `seed.json`
3229

33-
範例(`malwareseed/batch.json`
30+
範例 zip 內容:
31+
32+
```
33+
.zip
34+
├── seed.json
35+
└── payload/
36+
├── sample-1.bin
37+
└── sample-2.bin
38+
```
39+
40+
範例 JSON:
3441

3542
```json
3643
[
@@ -44,27 +51,32 @@ uv run alembic upgrade head
4451
]
4552
```
4653

47-
48-
49-
5054
## 匯入特徵並建立關聯(`scripts/load_feature_seeds.py`)
5155

5256
1. 在 `backend/data/seed/` 放置特徵 JSON 檔
53-
2. 若需要將特徵綁定到特定樣本,於條目中新增 `sample_sha256` 欄位
54-
55-
範例:
57+
2. 若需綁定樣本,於條目加入 `sample_sha256` 陣列
5658

5759
```json
58-
{
59-
"feature_name": "",
60-
"feature_type": "",
61-
"yara_rule": "",
62-
"sample_sha256": [
63-
""
64-
]
65-
}
60+
[
61+
{
62+
"feature_name": "",
63+
"feature_type": "",
64+
"yara_rule": null,
65+
"machine_code": null,
66+
"assembly": null,
67+
"pseudo_c": "",
68+
"behavior": "",
69+
"tags": [
70+
""
71+
],
72+
"created_by": "",
73+
"mitre_ids": [
74+
""
75+
],
76+
"sample_sha256": [
77+
""
78+
]
79+
}
80+
]
6681
```
6782

68-
69-
70-
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
"""change family datatype
2+
3+
Revision ID: 32d263976ad4
4+
Revises: af25782c1c4f
5+
Create Date: 2025-11-26 16:22:25.261198
6+
7+
"""
8+
9+
from typing import Sequence, Union
10+
11+
import sqlalchemy as sa
12+
from alembic import op
13+
14+
# revision identifiers, used by Alembic.
15+
revision: str = "32d263976ad4"
16+
down_revision: Union[str, None] = "af25782c1c4f"
17+
branch_labels: Union[str, Sequence[str], None] = None
18+
depends_on: Union[str, Sequence[str], None] = None
19+
20+
21+
def upgrade() -> None:
    """Upgrade schema: turn ``malware.family`` into a non-null JSON column."""
    # Per-row conversion applied while the column type changes:
    #   NULL or blank text      -> empty JSON array
    #   text beginning with "[" -> cast directly to json
    #   anything else           -> wrapped in a single-element JSON array
    # NOTE(review): the doubled percent sign presumably escapes a literal "%"
    # for the DB driver - confirm before changing it.
    family_to_json_sql = """
        CASE
            WHEN family IS NULL OR btrim(family) = '' THEN '[]'::json
            WHEN family LIKE '[%%' THEN family::json
            ELSE json_build_array(family)::json
        END
        """
    # ### commands auto generated by Alembic - please adjust! ###
    op.alter_column(
        "malware",
        "family",
        existing_type=sa.VARCHAR(),
        type_=sa.JSON(),
        nullable=False,
        server_default=sa.text("'[]'::json"),
        postgresql_using=family_to_json_sql,
    )
    # ### end Alembic commands ###
40+
41+
42+
def downgrade() -> None:
    """Downgrade schema: turn ``malware.family`` back into a nullable VARCHAR."""
    # Per-row conversion applied while the column type changes:
    #   NULL or empty array -> NULL
    #   otherwise           -> text of the first array element only
    # Any further elements of the array are not carried over.
    json_to_family_sql = """
        CASE
            WHEN family IS NULL OR json_array_length(family) = 0 THEN NULL
            ELSE family->>0
        END
        """
    # ### commands auto generated by Alembic - please adjust! ###
    op.alter_column(
        "malware",
        "family",
        existing_type=sa.JSON(),
        type_=sa.VARCHAR(),
        nullable=True,
        server_default=None,
        postgresql_using=json_to_family_sql,
    )
    # ### end Alembic commands ###
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
"""add mitre_ids for malware table
2+
3+
Revision ID: af25782c1c4f
4+
Revises: 962432c47268
5+
Create Date: 2025-11-26 14:31:49.003153
6+
7+
"""
8+
9+
from typing import Sequence, Union
10+
11+
import sqlalchemy as sa
12+
from alembic import op
13+
14+
# revision identifiers, used by Alembic.
15+
revision: str = "af25782c1c4f"
16+
down_revision: Union[str, None] = "962432c47268"
17+
branch_labels: Union[str, Sequence[str], None] = None
18+
depends_on: Union[str, Sequence[str], None] = None
19+
20+
21+
def upgrade() -> None:
    """Upgrade schema: add the non-null JSON ``mitre_ids`` column to ``malware``."""
    # The server default gives rows that already exist an empty JSON array,
    # which lets the column be created as NOT NULL.
    mitre_ids_column = sa.Column(
        "mitre_ids",
        sa.JSON(),
        server_default=sa.text("'[]'::json"),
        nullable=False,
    )
    # ### commands auto generated by Alembic - please adjust! ###
    op.add_column("malware", mitre_ids_column)
    # ### end Alembic commands ###
31+
32+
33+
def downgrade() -> None:
    """Downgrade schema: remove the ``mitre_ids`` column from ``malware``."""
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_column(table_name="malware", column_name="mitre_ids")
    # ### end Alembic commands ###

backend/app/malware/models.py

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
11
from datetime import datetime
22
from typing import TYPE_CHECKING
33

4-
from sqlalchemy import Column, Integer, text
4+
from pydantic import field_validator
5+
from sqlalchemy import JSON, Column, Integer, text
56
from sqlalchemy.orm import relationship
67
from sqlmodel import Field, Relationship, SQLModel
78

89
from app.features.models import FeaturePublic
910
from app.links.models import MalwareFeatureLink
11+
from app.malware.utils import normalize_family_values
1012

1113
if TYPE_CHECKING: # pragma: no cover - type checking only
1214
from app.features.models import Feature
@@ -19,9 +21,23 @@ class MalwareBase(SQLModel):
1921
md5: str
2022
size: int
2123
sha256: str
22-
family: str | None = None
24+
family: list[str] = Field(
25+
default_factory=list,
26+
sa_column=Column(JSON, nullable=False, server_default=text("'[]'::json")),
27+
description="惡意家族標籤列表",
28+
)
2329
source: str | None = None
2430
platform: str
31+
mitre_ids: list[str] = Field(
32+
default_factory=list,
33+
sa_column=Column(JSON, nullable=False, server_default=text("'[]'::json")),
34+
description="對應的 MITRE ATT&CK 技術 ID",
35+
)
36+
37+
@field_validator("family", mode="before")
38+
@classmethod
39+
def _ensure_family_list(cls, value):  # type: ignore[override]
    """Pass the raw ``family`` value through ``normalize_family_values``."""
    labels = normalize_family_values(value)
    return labels
2541

2642

2743
class Malware(MalwareBase, table=True):
@@ -57,9 +73,17 @@ class MalwareUpdate(SQLModel, table=False):
5773

5874
filename: str | None = None
5975
source: str | None = None
60-
family: str | None = None
76+
family: list[str] | None = None
6177
platform: str | None = None
6278
last_seen: datetime | None = None
79+
mitre_ids: list[str] | None = None
80+
81+
@field_validator("family", mode="before")
82+
@classmethod
83+
def _ensure_update_family(cls, value):  # type: ignore[override]
    """Normalize ``family`` on update payloads, leaving ``None`` untouched.

    ``None`` is returned unchanged instead of being passed to
    ``normalize_family_values``.
    """
    return None if value is None else normalize_family_values(value)
6387

6488

6589
class MalwareDelete(SQLModel):

backend/app/malware/routes.py

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import json
12
import shutil
23
from pathlib import Path
34
from typing import List
@@ -31,7 +32,7 @@
3132
MalwarePublic,
3233
MalwareUpdate,
3334
)
34-
from .utils import calc_hashes_from_fileobj
35+
from .utils import calc_hashes_from_fileobj, normalize_family_values
3536

3637
router = APIRouter(
3738
prefix="/malware", tags=["Malware"], dependencies=[Depends(get_current_user_dep)]
@@ -40,6 +41,26 @@
4041
SAMPLE_STORAGE_DIR = Path(__file__).resolve().parents[2] / "static" / "samples"
4142

4243

44+
def parse_mitre_ids(raw: str | None) -> list[str]:
45+
if not raw:
46+
return []
47+
try:
48+
parsed = json.loads(raw)
49+
except json.JSONDecodeError:
50+
parsed = None
51+
52+
values: list[str]
53+
if isinstance(parsed, list):
54+
values = [str(item).strip() for item in parsed if str(item).strip()]
55+
else:
56+
values = [segment.strip() for segment in raw.split(",") if segment.strip()]
57+
return values
58+
59+
60+
def parse_family(raw: str | None) -> list[str]:
    """Convert the raw ``family`` form value into a list of family labels.

    Delegates entirely to ``normalize_family_values``.
    """
    labels = normalize_family_values(raw)
    return labels
62+
63+
4364
@router.get(
4465
"/{sample_id}",
4566
summary="根據 sample_id 取得惡意軟體資訊",
@@ -62,6 +83,7 @@ async def upload_malware(
6283
platform: str = Form(...),
6384
family: str | None = Form(None),
6485
source: str | None = Form(None),
86+
mitre_ids: str | None = Form(None),
6587
db: Session = Depends(get_session),
6688
):
6789
# 計算 hash
@@ -73,9 +95,10 @@ async def upload_malware(
7395
md5=md5,
7496
size=size,
7597
sha256=sha256,
76-
family=family,
98+
family=parse_family(family),
7799
source=source,
78100
platform=platform,
101+
mitre_ids=parse_mitre_ids(mitre_ids),
79102
)
80103

81104
if get_by_sha256(db, sha256):

backend/app/malware/utils.py

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,43 @@
11
import hashlib
2-
from typing import BinaryIO, Tuple
2+
from typing import Any, BinaryIO, Tuple
3+
4+
5+
def normalize_family_values(value: Any) -> list[str]:
    """Normalize arbitrary input into a list of family labels.

    Handling by input type:
      * ``None`` -> ``[]``.
      * ``list`` -> every non-``None`` item is converted with ``str``.
      * ``str`` -> when it starts with ``[`` and ends with ``]`` and decodes
        as a JSON list, the decoded items are used; otherwise the text is
        split on commas and newlines.
      * anything else -> ``str(value)`` as a single label.

    Labels are stripped of surrounding whitespace; empty labels and repeats
    are dropped, keeping first-seen order.

    Args:
        value: Raw family value of any type.

    Returns:
        The cleaned, de-duplicated labels.
    """
    # Kept function-local as in the original, but outside the ``try`` so an
    # import problem cannot be confused with a JSON decoding problem.
    import json

    def _clean(items: list[Any]) -> list[str]:
        """Strip, drop empties/None, and de-duplicate in first-seen order."""
        seen: set[str] = set()
        result: list[str] = []
        for item in items:
            if item is None:
                # None / JSON null means "no label", not the text "None".
                continue
            cleaned = str(item).strip()
            if cleaned and cleaned not in seen:
                seen.add(cleaned)
                result.append(cleaned)
        return result

    if value is None:
        return []
    if isinstance(value, list):
        return _clean(value)
    if isinstance(value, str):
        stripped = value.strip()
        if not stripped:
            return []
        if stripped.startswith("[") and stripped.endswith("]"):
            try:
                parsed = json.loads(stripped)
            except json.JSONDecodeError:
                # Looks like an array but is not valid JSON: fall through to
                # delimiter splitting.
                parsed = None
            if isinstance(parsed, list):
                return _clean(parsed)
        fragments = stripped.replace("\n", ",").split(",")
        # When splitting leaves nothing (delimiter-only text), the whole
        # stripped string is kept as one label, as the original code did.
        return _clean(fragments) or _clean([stripped])
    normalized = str(value).strip()
    return [normalized] if normalized else []
40+
341

442
CHUNK = 1048576 # 1MB
543

backend/data/malwareseed/malware_samples.json

Lines changed: 0 additions & 3 deletions
This file was deleted.
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
[
2+
{
3+
"sha256": "c32fb305903a22106c6d3def0ac6c05b4f16cba99e23527b6c61d617ea794b1d",
4+
"filename": "example.exe",
5+
"platform": "windows",
6+
"family": "Unknown",
7+
"source": "MOTIF"
8+
}
9+
]

backend/pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ dependencies = [
1616
"langchain>=1.0.0",
1717
"langchain-google-genai>=3.0.0",
1818
"pytest>=8.4.2",
19+
"py7zr>=0.21.0",
1920
]
2021

2122
[dependency-groups]

0 commit comments

Comments
 (0)