From 7c3df9c23f25e0a516ac2856962a1d13538ef8e6 Mon Sep 17 00:00:00 2001
From: Cornelius Roemer <cornelius.roemer@gmail.com>
Date: Sun, 18 Aug 2024 19:39:43 +0200
Subject: [PATCH 1/7] feat: report file name of file that chardet fails to read

resolves #3519

Tested and it works now, reporting the file name:

```
codespell --write-changes -i3 -C 5 -H -f -e --count -s --builtin clear,rare,names
Failed to decode file ./pep_sphinx_extensions/tests/pep_lint/test_pep_number.py using detected encoding Windows-1254.
Traceback (most recent call last):
  File "/Users/corneliusromer/micromamba/envs/codespell/bin/codespell", line 8, in <module>
    sys.exit(_script_main())
             ^^^^^^^^^^^^^^
  File "/Users/corneliusromer/code/codespell/codespell_lib/_codespell.py", line 1103, in _script_main
    return main(*sys.argv[1:])
           ^^^^^^^^^^^^^^^^^^^
  File "/Users/corneliusromer/code/codespell/codespell_lib/_codespell.py", line 1300, in main
    bad_count += parse_file(
                 ^^^^^^^^^^^
  File "/Users/corneliusromer/code/codespell/codespell_lib/_codespell.py", line 945, in parse_file
    lines, encoding = file_opener.open(filename)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/corneliusromer/code/codespell/codespell_lib/_codespell.py", line 232, in open
    return self.open_with_chardet(filename)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/corneliusromer/code/codespell/codespell_lib/_codespell.py", line 246, in open_with_chardet
    lines = self.get_lines(f)
            ^^^^^^^^^^^^^^^^^
  File "/Users/corneliusromer/code/codespell/codespell_lib/_codespell.py", line 303, in get_lines
    lines = f.readlines()
            ^^^^^^^^^^^^^
  File "/Users/corneliusromer/micromamba/envs/codespell/lib/python3.12/encodings/cp1254.py", line 23, in decode
    return codecs.charmap_decode(input,self.errors,decoding_table)[0]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeDecodeError: 'charmap' codec can't decode byte 0x90 in position 1349: character maps to <undefined>
```
---
 codespell_lib/_codespell.py | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)
diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
index c7cc63bcfe..32bd61ad69 100644
--- a/codespell_lib/_codespell.py
+++ b/codespell_lib/_codespell.py
@@ -227,12 +227,12 @@ def init_chardet(self) -> None:
 
         self.encdetector = UniversalDetector()
 
-    def open(self, filename: str) -> Tuple[List[str], str]:
+    def open(self, filename: str) -> Tuple[List[str], str | None]:
         if self.use_chardet:
             return self.open_with_chardet(filename)
         return self.open_with_internal(filename)
 
-    def open_with_chardet(self, filename: str) -> Tuple[List[str], str]:
+    def open_with_chardet(self, filename: str) -> Tuple[List[str], str | None]:
         self.encdetector.reset()
         with open(filename, "rb") as fb:
             for line in fb:
@@ -241,26 +241,30 @@ def open_with_chardet(self, filename: str) -> Tuple[List[str], str]:
                     break
         self.encdetector.close()
         encoding = self.encdetector.result["encoding"]
-
+        if not encoding:
+            print(
+                f"WARNING: Chardet failed to detect encoding for file {filename}.",
+                file=sys.stderr,
+            )
         try:
-            f = open(filename, encoding=encoding, newline="")
+            with open(filename, encoding=encoding, newline="") as f:
+                lines = self.get_lines(f)
         except UnicodeDecodeError:
-            print(f"ERROR: Could not detect encoding: {filename}", file=sys.stderr)
+            error_msg = (
+                f"Failed to decode file {filename} using detected "
+                f"encoding {encoding}."
+            )
+            print(error_msg, file=sys.stderr)
             raise
         except LookupError:
-            print(
-                f"ERROR: Don't know how to handle encoding {encoding}: {filename}",
-                file=sys.stderr,
-            )
+            error_msg = f"Unknown encoding {encoding} detected for file {filename}."
+            print(error_msg, file=sys.stderr)
             raise
-        else:
-            lines = self.get_lines(f)
-            f.close()
 
-        return lines, f.encoding
+        return lines, encoding
 
     def open_with_internal(self, filename: str) -> Tuple[List[str], str]:
-        encoding = None
+        encoding: str
         first_try = True
         for encoding in ("utf-8", "iso-8859-1"):
             if first_try:
@@ -887,10 +891,10 @@ def parse_file(
     bad_count = 0
     lines = None
     changed = False
+    encoding: str | None = "utf-8"
 
     if filename == "-":
         f = sys.stdin
-        encoding = "utf-8"
         lines = f.readlines()
     else:
         if options.check_filenames:

From 6d27fefe802959487c6a033c452fa170e035481f Mon Sep 17 00:00:00 2001
From: Cornelius Roemer <cornelius.roemer@gmail.com>
Date: Sun, 18 Aug 2024 19:43:57 +0200
Subject: [PATCH 2/7] Make it compatible with Python < 3.10 by using
 `Optional[]` instead of `| None`

---
 codespell_lib/_codespell.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
index 32bd61ad69..474b12f63e 100644
--- a/codespell_lib/_codespell.py
+++ b/codespell_lib/_codespell.py
@@ -227,12 +227,12 @@ def init_chardet(self) -> None:
 
         self.encdetector = UniversalDetector()
 
-    def open(self, filename: str) -> Tuple[List[str], str | None]:
+    def open(self, filename: str) -> Tuple[List[str], Optional[str]]:
         if self.use_chardet:
             return self.open_with_chardet(filename)
         return self.open_with_internal(filename)
 
-    def open_with_chardet(self, filename: str) -> Tuple[List[str], str | None]:
+    def open_with_chardet(self, filename: str) -> Tuple[List[str], Optional[str]]:
         self.encdetector.reset()
         with open(filename, "rb") as fb:
             for line in fb:

From f74ceda8e405c34242bf7e39efac63d4c99b3351 Mon Sep 17 00:00:00 2001
From: Cornelius Roemer <cornelius.roemer@gmail.com>
Date: Sun, 18 Aug 2024 23:29:37 +0200
Subject: [PATCH 3/7] Alternative to address @DimitriPapadopoulos' concerns re
 too many changes.

We require the type info, otherwise mypy fails
---
 codespell_lib/_codespell.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
index 474b12f63e..7caa7e2488 100644
--- a/codespell_lib/_codespell.py
+++ b/codespell_lib/_codespell.py
@@ -873,7 +873,7 @@ def apply_uri_ignore_words(
     return check_matches
 
 
-def parse_file(
+def parse_file(  # noqa: PLR0915
     filename: str,
     colors: TermColors,
     summary: Optional[Summary],
@@ -891,9 +891,10 @@ def parse_file(
     bad_count = 0
     lines = None
     changed = False
-    encoding: str | None = "utf-8"
+    encoding: Optional[str]
 
     if filename == "-":
+        encoding = "utf-8"
         f = sys.stdin
         lines = f.readlines()
     else:

From 53ce7ea35199840419b6e142b28bff5a4c41ecfc Mon Sep 17 00:00:00 2001
From: Cornelius Roemer <cornelius.roemer@gmail.com>
Date: Sun, 18 Aug 2024 23:30:32 +0200
Subject: [PATCH 4/7] Reduce diff size

---
 codespell_lib/_codespell.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
index 7caa7e2488..7b7fe1e38e 100644
--- a/codespell_lib/_codespell.py
+++ b/codespell_lib/_codespell.py
@@ -894,8 +894,8 @@ def parse_file(  # noqa: PLR0915
     encoding: Optional[str]
 
     if filename == "-":
-        encoding = "utf-8"
         f = sys.stdin
+        encoding = "utf-8"
         lines = f.readlines()
     else:
         if options.check_filenames:

From 283d0cd7840a3968f3fd8c58db6385db39568266 Mon Sep 17 00:00:00 2001
From: Cornelius Roemer <cornelius.roemer@gmail.com>
Date: Sun, 18 Aug 2024 23:31:21 +0200
Subject: [PATCH 5/7] Remove unnecessary type declaration

---
 codespell_lib/_codespell.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
index 7b7fe1e38e..bb4d1ec4c3 100644
--- a/codespell_lib/_codespell.py
+++ b/codespell_lib/_codespell.py
@@ -264,7 +264,6 @@ def open_with_chardet(self, filename: str) -> Tuple[List[str], Optional[str]]:
         return lines, encoding
 
     def open_with_internal(self, filename: str) -> Tuple[List[str], str]:
-        encoding: str
         first_try = True
         for encoding in ("utf-8", "iso-8859-1"):
             if first_try:

From 3159f7135e060ee9737e9ed7753423f6a68a4e5c Mon Sep 17 00:00:00 2001
From: Cornelius Roemer <cornelius.roemer@gmail.com>
Date: Sun, 18 Aug 2024 23:56:01 +0200
Subject: [PATCH 6/7] Cleaner formatting

---
 codespell_lib/_codespell.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
index bb4d1ec4c3..22af663ff5 100644
--- a/codespell_lib/_codespell.py
+++ b/codespell_lib/_codespell.py
@@ -249,15 +249,12 @@ def open_with_chardet(self, filename: str) -> Tuple[List[str], Optional[str]]:
         try:
             with open(filename, encoding=encoding, newline="") as f:
                 lines = self.get_lines(f)
-        except UnicodeDecodeError:
-            error_msg = (
-                f"Failed to decode file {filename} using detected "
-                f"encoding {encoding}."
-            )
+        except LookupError:  # Raised by open() if encoding is unknown
+            error_msg = f"ERROR: Chardet returned unknown encoding for: {filename}."
             print(error_msg, file=sys.stderr)
             raise
-        except LookupError:
-            error_msg = f"Unknown encoding {encoding} detected for file {filename}."
+        except UnicodeDecodeError:  # Raised by self.get_lines() if decoding fails
+            error_msg = f"ERROR: Failed decoding file: {filename}"
             print(error_msg, file=sys.stderr)
             raise
 

From c01a2bd6e8cfd9c50ebece42f433c90bada3dcc6 Mon Sep 17 00:00:00 2001
From: Cornelius Roemer <cornelius.roemer@gmail.com>
Date: Mon, 19 Aug 2024 01:41:21 +0200
Subject: [PATCH 7/7] Add tests

---
 codespell_lib/tests/test_basic.py | 45 ++++++++++++++++++++++++++++---
 1 file changed, 41 insertions(+), 4 deletions(-)

diff --git a/codespell_lib/tests/test_basic.py b/codespell_lib/tests/test_basic.py
index 74e10404e1..eace30ab62 100644
--- a/codespell_lib/tests/test_basic.py
+++ b/codespell_lib/tests/test_basic.py
@@ -569,14 +569,51 @@ def test_encoding(
     assert "WARNING: Binary file" in stderr
 
 
-def test_unknown_encoding_chardet(
+def test_chardet_exceptions(
     tmp_path: Path,
     capsys: pytest.CaptureFixture[str],
 ) -> None:
-    """Test opening a file with unknown encoding using chardet"""
+    """Test encoding handling with chardet exceptions."""
     fname = tmp_path / "tmp"
-    fname.touch()
-    assert cs.main("--hard-encoding-detection", fname) == 0
+    fname.write_bytes("naïve\n".encode())
+    with mock.patch(
+        "chardet.universaldetector.UniversalDetector"
+    ) as mock_detector_class:
+        # Configure the mock to simulate an incorrect encoding detection
+        mock_detector = mock.MagicMock()
+        mock_detector.result = {"encoding": None}
+        mock_detector.done = True
+        mock_detector_class.return_value = mock_detector
+
+        # Simulate chardet not detecting any encoding
+        result = cs.main("-e", fname, std=True, count=False)
+        assert isinstance(result, tuple)
+        code, stdout, stderr = result
+        assert code == 0
+        assert not stdout
+        assert "WARNING: Chardet failed to detect encoding" in stderr
+        assert str(fname) in stderr
+
+        # Simulate chardet falsely detecting utf-8, instead of the correct iso-8859-1
+        mock_detector.result = {"encoding": "utf-8"}  # Simulate wrong encoding detected
+        mock_detector_class.return_value = mock_detector
+        fname.write_bytes(b"Speling error, non-ASCII: h\xe9t\xe9rog\xe9n\xe9it\xe9\n")
+        with pytest.raises(UnicodeDecodeError) as exc_info_unicode:
+            cs.main("-e", fname, std=True, count=False)
+        stderr = capsys.readouterr().err
+        assert "ERROR: Failed decoding file:" in stderr
+        assert str(fname) in stderr
+        assert "utf-8" in str(exc_info_unicode.value)
+
+        # Simulate chardet detecting non-existent encoding
+        mock_detector.result = {"encoding": "UTF-doesnotexist"}
+        mock_detector_class.return_value = mock_detector
+        with pytest.raises(LookupError) as exc_info_lookup:
+            cs.main("-e", fname, std=True, count=False)
+        stderr = capsys.readouterr().err
+        assert "ERROR: Chardet returned unknown encoding" in stderr
+        assert str(fname) in stderr
+        assert "UTF-doesnotexist" in str(exc_info_lookup.value)
 
 
 def test_ignore(