Skip to content

Commit 7f55388

Browse files
authored
Merge pull request #1369 from PyThaiNLP/copilot/bug-fix-pythainlp-path-manipulation
fix: replace os.path.join with safe_path_join to prevent path manipulation (CWE-22)
2 parents a4296cb + 19a09c7 commit 7f55388

File tree

9 files changed

+69
-52
lines changed

9 files changed

+69
-52
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,11 @@ and this project adheres to
3838
- thai2rom_onnx: fix ONNX encoder model and fix inference bugs (#1349)
3939
- wordnet: fix AttributeError (#1354)
4040

41+
### Security
42+
43+
- Replace `os.path.join` with `safe_path_join` throughout the codebase
  to prevent path manipulation vulnerabilities (CWE-22) (#1369)
45+
4146
## [5.3.2] - 2026-03-19
4247

4348
This release focuses on security improvements related to path traversal

pythainlp/corpus/__init__.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,14 +44,13 @@
4444
"make_safe_directory_name",
4545
]
4646

47-
import os
48-
4947
from pythainlp.tools import get_full_data_path, get_pythainlp_path
48+
from pythainlp.tools.path import safe_path_join
5049

5150
# Remote and local corpus databases
5251

5352
_CORPUS_DIRNAME: str = "corpus"
54-
_CORPUS_PATH: str = os.path.join(get_pythainlp_path(), _CORPUS_DIRNAME)
53+
_CORPUS_PATH: str = safe_path_join(get_pythainlp_path(), _CORPUS_DIRNAME)
5554
_CORPUS_DB_URL: str = "https://pythainlp.org/pythainlp-corpus/db.json"
5655

5756
# filename of local corpus catalog

pythainlp/corpus/core.py

Lines changed: 41 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -489,8 +489,9 @@ def _safe_extract_tar(tar: tarfile.TarFile, path: str) -> None:
489489
# Manual validation for older Python versions
490490
for member in tar.getmembers():
491491
# Check the member's target path
492-
member_path = os.path.join(path, member.name)
493-
if not _is_within_directory(path, member_path):
492+
try:
493+
safe_path_join(path, member.name)
494+
except ValueError:
494495
raise ValueError(
495496
f"Attempted path traversal in tar file: {member.name}"
496497
)
@@ -500,21 +501,26 @@ def _safe_extract_tar(tar: tarfile.TarFile, path: str) -> None:
500501
# Get the link target (can be absolute or relative)
501502
link_target = member.linkname
502503

503-
# If it's a relative symlink, resolve it relative to the member's directory
504+
# If it's a relative symlink, resolve it relative to the member's directory.
505+
# Pass the archive-relative dirname and link target as separate parts to
506+
# safe_path_join, which canonicalises and validates containment in one step.
504507
if not os.path.isabs(link_target):
505-
member_dir = os.path.dirname(member_path)
506-
link_target = os.path.join(member_dir, link_target)
508+
try:
509+
safe_path_join(
510+
path, os.path.dirname(member.name), link_target
511+
)
512+
except ValueError:
513+
raise ValueError(
514+
f"Symlink {member.name} points outside extraction directory: {member.linkname}"
515+
)
507516
else:
508517
# Absolute symlinks are dangerous - make them relative to extraction path
509-
link_target = os.path.join(
510-
path, link_target.lstrip(os.sep)
511-
)
512-
513-
# Check if the resolved symlink target is within the directory
514-
if not _is_within_directory(path, link_target):
515-
raise ValueError(
516-
f"Symlink {member.name} points outside extraction directory: {member.linkname}"
517-
)
518+
try:
519+
safe_path_join(path, link_target.lstrip(os.sep))
520+
except ValueError:
521+
raise ValueError(
522+
f"Symlink {member.name} points outside extraction directory: {member.linkname}"
523+
)
518524

519525
tar.extractall(path=path)
520526

@@ -533,8 +539,9 @@ def _safe_extract_zip(zip_file: zipfile.ZipFile, path: str) -> None:
533539
created by Unix-based archiving tools and may not be portable.
534540
"""
535541
for member in zip_file.namelist():
536-
member_path = os.path.join(path, member)
537-
if not _is_within_directory(path, member_path):
542+
try:
543+
safe_path_join(path, member)
544+
except ValueError:
538545
raise ValueError(f"Attempted path traversal in zip file: {member}")
539546

540547
# Check for potential symlinks in ZIP files
@@ -548,21 +555,26 @@ def _safe_extract_zip(zip_file: zipfile.ZipFile, path: str) -> None:
548555
# Read the symlink target from the file content
549556
link_target = zip_file.read(member).decode("utf-8")
550557

551-
# Resolve the link target relative to the member's directory
558+
# Resolve the link target relative to the member's directory.
559+
# Pass the archive-relative dirname and link target as separate parts to
560+
# safe_path_join, which canonicalises and validates containment in one step.
552561
if not os.path.isabs(link_target):
553-
member_dir = os.path.dirname(member_path)
554-
resolved_target = os.path.join(member_dir, link_target)
562+
try:
563+
safe_path_join(
564+
path, os.path.dirname(member), link_target
565+
)
566+
except ValueError:
567+
raise ValueError(
568+
f"Symlink {member} points outside extraction directory: {link_target}"
569+
)
555570
else:
556571
# Absolute symlinks - make them relative to extraction path
557-
resolved_target = os.path.join(
558-
path, link_target.lstrip(os.sep)
559-
)
560-
561-
# Check if the symlink target is within the directory
562-
if not _is_within_directory(path, resolved_target):
563-
raise ValueError(
564-
f"Symlink {member} points outside extraction directory: {link_target}"
565-
)
572+
try:
573+
safe_path_join(path, link_target.lstrip(os.sep))
574+
except ValueError:
575+
raise ValueError(
576+
f"Symlink {member} points outside extraction directory: {link_target}"
577+
)
566578

567579
zip_file.extractall(path=path)
568580

@@ -909,7 +921,7 @@ def get_hf_hub(repo_id: str, filename: str = "") -> str:
909921
raise RuntimeError(f"An unexpected error occurred: {e}") from e
910922
hf_root = get_full_data_path("hf_models")
911923
name_dir = make_safe_directory_name(repo_id)
912-
root_project = os.path.join(hf_root, name_dir)
924+
root_project = safe_path_join(hf_root, name_dir)
913925
if filename:
914926
output_path = hf_hub_download(
915927
repo_id=repo_id, filename=filename, local_dir=root_project

pythainlp/parse/transformers_ud.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
TokenClassificationPipeline,
2222
)
2323

24+
from pythainlp.tools.path import safe_path_join
25+
2426

2527
class Parse:
2628
def __init__(
@@ -44,8 +46,8 @@ def __init__(
4446
x = AutoModelForTokenClassification.from_pretrained
4547
if os.path.isdir(model):
4648
d, t = (
47-
x(os.path.join(model, "deprel")),
48-
x(os.path.join(model, "tagger")),
49+
x(safe_path_join(model, "deprel")),
50+
x(safe_path_join(model, "tagger")),
4951
)
5052
else:
5153
c = AutoConfig.from_pretrained(

pythainlp/spell/words_spelling_correction.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,11 @@
33
# SPDX-License-Identifier: Apache-2.0
44
from __future__ import annotations
55

6-
import os
76
from importlib import import_module
87
from typing import TYPE_CHECKING, Union, cast
98

109
from pythainlp.corpus import get_hf_hub
10+
from pythainlp.tools.path import safe_path_join
1111

1212
if TYPE_CHECKING:
1313
import numpy as np
@@ -88,10 +88,10 @@ def _load_embeddings(self) -> tuple[list[str], NDArray[np.float32]]:
8888
import numpy as np
8989

9090
input_matrix = np.load(
91-
os.path.join(self.model_dir, "embeddings.npy"), allow_pickle=False
91+
safe_path_join(self.model_dir, "embeddings.npy"), allow_pickle=False
9292
)
9393
words = []
94-
vocab_path = os.path.join(self.model_dir, "vocabulary.txt")
94+
vocab_path = safe_path_join(self.model_dir, "vocabulary.txt")
9595
with open(vocab_path, encoding="utf-8") as f:
9696
for line in f.readlines():
9797
words.append(line.rstrip())

pythainlp/tag/perceptron.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,25 +5,25 @@
55

66
from __future__ import annotations
77

8-
import os
98
from typing import Optional
109

1110
from pythainlp.corpus import corpus_path, get_corpus_path
1211
from pythainlp.tag import PerceptronTagger, blackboard, orchid
12+
from pythainlp.tools.path import safe_path_join
1313

1414
_BLACKBOARD_NAME: str = "blackboard_pt_tagger"
1515

1616
_ORCHID_FILENAME: str = "pos_orchid_perceptron.json"
17-
_ORCHID_PATH: str = os.path.join(corpus_path(), _ORCHID_FILENAME)
17+
_ORCHID_PATH: str = safe_path_join(corpus_path(), _ORCHID_FILENAME)
1818

1919
_PUD_FILENAME: str = "pos_ud_perceptron-v0.2.json"
20-
_PUD_PATH: str = os.path.join(corpus_path(), _PUD_FILENAME)
20+
_PUD_PATH: str = safe_path_join(corpus_path(), _PUD_FILENAME)
2121

2222
_TDTB_FILENAME: str = "tdtb-pt_tagger.json"
23-
_TDTB_PATH: str = os.path.join(corpus_path(), _TDTB_FILENAME)
23+
_TDTB_PATH: str = safe_path_join(corpus_path(), _TDTB_FILENAME)
2424

2525
_TUD_FILENAME: str = "pos_tud_perceptron.json"
26-
_TUD_PATH: str = os.path.join(corpus_path(), _TUD_FILENAME)
26+
_TUD_PATH: str = safe_path_join(corpus_path(), _TUD_FILENAME)
2727

2828
_BLACKBOARD_TAGGER: Optional[PerceptronTagger] = None
2929
_ORCHID_TAGGER: Optional[PerceptronTagger] = None

pythainlp/tag/unigram.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,25 +6,25 @@
66
from __future__ import annotations
77

88
import json
9-
import os
109
from typing import Optional
1110

1211
from pythainlp.corpus import corpus_path, get_corpus_path
1312
from pythainlp.tag import blackboard, orchid
13+
from pythainlp.tools.path import safe_path_join
1414

1515
_ORCHID_FILENAME: str = "pos_orchid_unigram.json"
16-
_ORCHID_PATH: str = os.path.join(corpus_path(), _ORCHID_FILENAME)
16+
_ORCHID_PATH: str = safe_path_join(corpus_path(), _ORCHID_FILENAME)
1717

1818
_PUD_FILENAME: str = "pos_ud_unigram-v0.2.json"
19-
_PUD_PATH: str = os.path.join(corpus_path(), _PUD_FILENAME)
19+
_PUD_PATH: str = safe_path_join(corpus_path(), _PUD_FILENAME)
2020

2121
_TDTB_FILENAME: str = "tdtb-unigram_tagger.json"
22-
_TDTB_PATH: str = os.path.join(corpus_path(), _TDTB_FILENAME)
22+
_TDTB_PATH: str = safe_path_join(corpus_path(), _TDTB_FILENAME)
2323

2424
_BLACKBOARD_NAME: str = "blackboard_unigram_tagger"
2525

2626
_TUD_FILENAME: str = "pos_tud_unigram.json"
27-
_TUD_PATH: str = os.path.join(corpus_path(), _TUD_FILENAME)
27+
_TUD_PATH: str = safe_path_join(corpus_path(), _TUD_FILENAME)
2828

2929
_ORCHID_TAGGER: Optional[dict[str, str]] = None
3030
_PUD_TAGGER: Optional[dict[str, str]] = None

pythainlp/tokenize/crfcut.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,11 @@
1717

1818
from __future__ import annotations
1919

20-
import os
21-
2220
import pycrfsuite
2321

2422
from pythainlp.corpus import corpus_path
2523
from pythainlp.tokenize import word_tokenize
24+
from pythainlp.tools.path import safe_path_join
2625

2726
_ENDERS: set[str] = {
2827
# ending honorifics
@@ -176,7 +175,7 @@ def _extract_features(
176175

177176
_CRFCUT_DATA_FILENAME: str = "sentenceseg_crfcut.model"
178177
_tagger: pycrfsuite.Tagger = pycrfsuite.Tagger() # pyright: ignore[reportAttributeAccessIssue] # pyrefly: ignore[missing-attribute]
179-
_tagger.open(os.path.join(corpus_path(), _CRFCUT_DATA_FILENAME))
178+
_tagger.open(safe_path_join(corpus_path(), _CRFCUT_DATA_FILENAME))
180179

181180

182181
def segment(text: str) -> list[str]:

pythainlp/translate/en_th.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010

1111
from __future__ import annotations
1212

13-
import os
1413
import warnings
1514
from typing import Optional
1615

@@ -29,6 +28,7 @@
2928
) from e
3029

3130
from pythainlp.corpus import download, get_corpus_path
31+
from pythainlp.tools.path import safe_path_join
3232

3333
_EN_TH_MODEL_NAME: str = "scb_1m_en-th_moses"
3434
# SCB_1M-MT_OPUS+TBASE_en-th_moses-spm_130000-16000_v1.0.tar.gz
@@ -45,7 +45,7 @@ def _get_translate_path(model: str, *path: str) -> str:
4545
corpus_path = get_corpus_path(model, version="1.0")
4646
if not corpus_path:
4747
return ""
48-
return os.path.join(corpus_path, *path)
48+
return safe_path_join(corpus_path, *path)
4949

5050

5151
def _download_install(name: str) -> None:

0 commit comments

Comments (0)