From 7c3df9c23f25e0a516ac2856962a1d13538ef8e6 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Sun, 18 Aug 2024 19:39:43 +0200 Subject: [PATCH 1/7] feat: report file name of file that chardet fails to read resolves #3519 Tested and it works now, reporting the file name: ``` codespell --write-changes -i3 -C 5 -H -f -e --count -s --builtin clear,rare,names Failed to decode file ./pep_sphinx_extensions/tests/pep_lint/test_pep_number.py using detected encoding Windows-1254. Traceback (most recent call last): File "/Users/corneliusromer/micromamba/envs/codespell/bin/codespell", line 8, in <module> sys.exit(_script_main()) ^^^^^^^^^^^^^^ File "/Users/corneliusromer/code/codespell/codespell_lib/_codespell.py", line 1103, in _script_main return main(*sys.argv[1:]) ^^^^^^^^^^^^^^^^^^^ File "/Users/corneliusromer/code/codespell/codespell_lib/_codespell.py", line 1300, in main bad_count += parse_file( ^^^^^^^^^^^ File "/Users/corneliusromer/code/codespell/codespell_lib/_codespell.py", line 945, in parse_file lines, encoding = file_opener.open(filename) ^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/Users/corneliusromer/code/codespell/codespell_lib/_codespell.py", line 232, in open return self.open_with_chardet(filename) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/Users/corneliusromer/code/codespell/codespell_lib/_codespell.py", line 246, in open_with_chardet lines = self.get_lines(f) ^^^^^^^^^^^^^^^^^ File "/Users/corneliusromer/code/codespell/codespell_lib/_codespell.py", line 303, in get_lines lines = f.readlines() ^^^^^^^^^^^^^ File "/Users/corneliusromer/micromamba/envs/codespell/lib/python3.12/encodings/cp1254.py", line 23, in decode return codecs.charmap_decode(input,self.errors,decoding_table)[0] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ UnicodeDecodeError: 'charmap' codec can't decode byte 0x90 in position 1349: character maps to <undefined> ``` --- codespell_lib/_codespell.py | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git 
a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index c7cc63bcfe..32bd61ad69 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -227,12 +227,12 @@ def init_chardet(self) -> None: self.encdetector = UniversalDetector() - def open(self, filename: str) -> Tuple[List[str], str]: + def open(self, filename: str) -> Tuple[List[str], str | None]: if self.use_chardet: return self.open_with_chardet(filename) return self.open_with_internal(filename) - def open_with_chardet(self, filename: str) -> Tuple[List[str], str]: + def open_with_chardet(self, filename: str) -> Tuple[List[str], str | None]: self.encdetector.reset() with open(filename, "rb") as fb: for line in fb: @@ -241,26 +241,30 @@ def open_with_chardet(self, filename: str) -> Tuple[List[str], str]: break self.encdetector.close() encoding = self.encdetector.result["encoding"] - + if not encoding: + print( + f"WARNING: Chardet failed to detect encoding for file {filename}.", + file=sys.stderr, + ) try: - f = open(filename, encoding=encoding, newline="") + with open(filename, encoding=encoding, newline="") as f: + lines = self.get_lines(f) except UnicodeDecodeError: - print(f"ERROR: Could not detect encoding: {filename}", file=sys.stderr) + error_msg = ( + f"Failed to decode file {filename} using detected " + f"encoding {encoding}." + ) + print(error_msg, file=sys.stderr) raise except LookupError: - print( - f"ERROR: Don't know how to handle encoding {encoding}: {filename}", - file=sys.stderr, - ) + error_msg = f"Unknown encoding {encoding} detected for file {filename}." 
+ print(error_msg, file=sys.stderr) raise - else: - lines = self.get_lines(f) - f.close() - return lines, f.encoding + return lines, encoding def open_with_internal(self, filename: str) -> Tuple[List[str], str]: - encoding = None + encoding: str first_try = True for encoding in ("utf-8", "iso-8859-1"): if first_try: @@ -887,10 +891,10 @@ def parse_file( bad_count = 0 lines = None changed = False + encoding: str | None = "utf-8" if filename == "-": f = sys.stdin - encoding = "utf-8" lines = f.readlines() else: if options.check_filenames: From 6d27fefe802959487c6a033c452fa170e035481f Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Sun, 18 Aug 2024 19:43:57 +0200 Subject: [PATCH 2/7] Make it python < 3.10 compatible with `Optional[]` instead of `| None` --- codespell_lib/_codespell.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index 32bd61ad69..474b12f63e 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -227,12 +227,12 @@ def init_chardet(self) -> None: self.encdetector = UniversalDetector() - def open(self, filename: str) -> Tuple[List[str], str | None]: + def open(self, filename: str) -> Tuple[List[str], Optional[str]]: if self.use_chardet: return self.open_with_chardet(filename) return self.open_with_internal(filename) - def open_with_chardet(self, filename: str) -> Tuple[List[str], str | None]: + def open_with_chardet(self, filename: str) -> Tuple[List[str], Optional[str]]: self.encdetector.reset() with open(filename, "rb") as fb: for line in fb: From f74ceda8e405c34242bf7e39efac63d4c99b3351 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Sun, 18 Aug 2024 23:29:37 +0200 Subject: [PATCH 3/7] Alternative to address @DimitriPapadopoulos' concerns re too many changes. 
We require the type info, otherwise mypy fails --- codespell_lib/_codespell.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index 474b12f63e..7caa7e2488 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -873,7 +873,7 @@ def apply_uri_ignore_words( return check_matches -def parse_file( +def parse_file( # noqa: PLR0915 filename: str, colors: TermColors, summary: Optional[Summary], @@ -891,9 +891,10 @@ def parse_file( bad_count = 0 lines = None changed = False - encoding: str | None = "utf-8" + encoding: Optional[str] if filename == "-": + encoding = "utf-8" f = sys.stdin lines = f.readlines() else: From 53ce7ea35199840419b6e142b28bff5a4c41ecfc Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Sun, 18 Aug 2024 23:30:32 +0200 Subject: [PATCH 4/7] Reduce diff size --- codespell_lib/_codespell.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index 7caa7e2488..7b7fe1e38e 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -894,8 +894,8 @@ def parse_file( # noqa: PLR0915 encoding: Optional[str] if filename == "-": - encoding = "utf-8" f = sys.stdin + encoding = "utf-8" lines = f.readlines() else: if options.check_filenames: From 283d0cd7840a3968f3fd8c58db6385db39568266 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Sun, 18 Aug 2024 23:31:21 +0200 Subject: [PATCH 5/7] Remove unnecessary type declaration --- codespell_lib/_codespell.py | 1 - 1 file changed, 1 deletion(-) diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index 7b7fe1e38e..bb4d1ec4c3 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -264,7 +264,6 @@ def open_with_chardet(self, filename: str) -> Tuple[List[str], Optional[str]]: return lines, encoding def open_with_internal(self, filename: str) -> Tuple[List[str], str]: - encoding: str 
first_try = True for encoding in ("utf-8", "iso-8859-1"): if first_try: From 3159f7135e060ee9737e9ed7753423f6a68a4e5c Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Sun, 18 Aug 2024 23:56:01 +0200 Subject: [PATCH 6/7] Cleaner formatting --- codespell_lib/_codespell.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index bb4d1ec4c3..22af663ff5 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -249,15 +249,12 @@ def open_with_chardet(self, filename: str) -> Tuple[List[str], Optional[str]]: try: with open(filename, encoding=encoding, newline="") as f: lines = self.get_lines(f) - except UnicodeDecodeError: - error_msg = ( - f"Failed to decode file {filename} using detected " - f"encoding {encoding}." - ) + except LookupError: # Raised by open() if encoding is unknown + error_msg = f"ERROR: Chardet returned unknown encoding for: {filename}." print(error_msg, file=sys.stderr) raise - except LookupError: - error_msg = f"Unknown encoding {encoding} detected for file {filename}." 
+ except UnicodeDecodeError: # Raised by self.get_lines() if decoding fails + error_msg = f"ERROR: Failed decoding file: {filename}" print(error_msg, file=sys.stderr) raise From c01a2bd6e8cfd9c50ebece42f433c90bada3dcc6 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Mon, 19 Aug 2024 01:41:21 +0200 Subject: [PATCH 7/7] Add tests --- codespell_lib/tests/test_basic.py | 45 ++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 4 deletions(-) diff --git a/codespell_lib/tests/test_basic.py b/codespell_lib/tests/test_basic.py index 74e10404e1..eace30ab62 100644 --- a/codespell_lib/tests/test_basic.py +++ b/codespell_lib/tests/test_basic.py @@ -569,14 +569,51 @@ def test_encoding( assert "WARNING: Binary file" in stderr -def test_unknown_encoding_chardet( +def test_chardet_exceptions( tmp_path: Path, capsys: pytest.CaptureFixture[str], ) -> None: - """Test opening a file with unknown encoding using chardet""" + """Test encoding handling with chardet exceptions.""" fname = tmp_path / "tmp" - fname.touch() - assert cs.main("--hard-encoding-detection", fname) == 0 + fname.write_bytes("naïve\n".encode()) + with mock.patch( + "chardet.universaldetector.UniversalDetector" + ) as mock_detector_class: + # Configure the mock to simulate an incorrect encoding detection + mock_detector = mock.MagicMock() + mock_detector.result = {"encoding": None} + mock_detector.done = True + mock_detector_class.return_value = mock_detector + + # Simulate chardet not detecting any encoding + result = cs.main("-e", fname, std=True, count=False) + assert isinstance(result, tuple) + code, stdout, stderr = result + assert code == 0 + assert not stdout + assert "WARNING: Chardet failed to detect encoding" in stderr + assert str(fname) in stderr + + # Simulate chardet falsely detecting utf-8, instead of the correct iso-8859-1 + mock_detector.result = {"encoding": "utf-8"} # Simulate wrong encoding detected + mock_detector_class.return_value = mock_detector + fname.write_bytes(b"Speling 
error, non-ASCII: h\xe9t\xe9rog\xe9n\xe9it\xe9\n") + with pytest.raises(UnicodeDecodeError) as exc_info_unicode: + cs.main("-e", fname, std=True, count=False) + stderr = capsys.readouterr().err + assert "ERROR: Failed decoding file:" in stderr + assert str(fname) in stderr + assert "utf-8" in str(exc_info_unicode.value) + + # Simulate chardet detecting non-existent encoding + mock_detector.result = {"encoding": "UTF-doesnotexist"} + mock_detector_class.return_value = mock_detector + with pytest.raises(LookupError) as exc_info_lookup: + cs.main("-e", fname, std=True, count=False) + stderr = capsys.readouterr().err + assert "ERROR: Chardet returned unknown encoding" in stderr + assert str(fname) in stderr + assert "UTF-doesnotexist" in str(exc_info_lookup.value) def test_ignore(