Skip to content

Commit 933b99f

Browse files
committed
Handle CRLF in content cleaning layer
1 parent 598607b commit 933b99f

File tree

2 files changed

+85
-26
lines changed

2 files changed

+85
-26
lines changed

lib/json_remedy/layer1/content_cleaning.ex

Lines changed: 61 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -455,17 +455,32 @@ defmodule JsonRemedy.Layer1.ContentCleaning do
455455
nil ->
456456
{input, []}
457457

458-
{start_pos, end_pos, comment_text} ->
458+
{start_pos, end_pos, _comment_text} ->
459459
if not comment_inside_string?(input, start_pos) do
460-
before = String.slice(input, 0, start_pos)
461-
after_comment = String.slice(input, end_pos + 2, String.length(input))
460+
before =
461+
if start_pos > 0 do
462+
binary_part(input, 0, start_pos)
463+
else
464+
""
465+
end
466+
467+
comment_length = end_pos - start_pos + 2
468+
after_start = end_pos + 2
469+
470+
after_comment =
471+
if after_start >= byte_size(input) do
472+
""
473+
else
474+
binary_part(input, after_start, byte_size(input) - after_start)
475+
end
476+
462477
result = before <> after_comment
463478

464479
repair = %{
465480
layer: :content_cleaning,
466481
action: "removed block comment",
467482
position: start_pos,
468-
original: comment_text,
483+
original: binary_part(input, start_pos, comment_length),
469484
replacement: ""
470485
}
471486

@@ -531,18 +546,20 @@ defmodule JsonRemedy.Layer1.ContentCleaning do
531546
defp find_matching_block_comment_end(_input, _pos, 0), do: nil
532547

533548
defp find_substring_position(string, substring, start_offset) do
534-
# Search from the start_offset position
535-
search_string = String.slice(string, start_offset, String.length(string))
536-
537-
case String.split(search_string, substring, parts: 2) do
538-
[before, _after] ->
539-
start_offset + byte_size(before)
549+
total_size = byte_size(string)
540550

541-
[_single_part] ->
551+
cond do
552+
start_offset >= total_size ->
542553
nil
543554

544-
_ ->
545-
nil
555+
true ->
556+
slice_size = total_size - start_offset
557+
slice = binary_part(string, start_offset, slice_size)
558+
559+
case :binary.match(slice, substring) do
560+
{match_start, _length} -> start_offset + match_start
561+
:nomatch -> nil
562+
end
546563
end
547564
end
548565

@@ -643,11 +660,17 @@ defmodule JsonRemedy.Layer1.ContentCleaning do
643660
end
644661

645662
defp extract_balanced_content(input, start_pos, open_char, close_char) do
646-
substring = String.slice(input, start_pos, String.length(input))
663+
total_size = byte_size(input)
647664

648-
case find_balanced_end(substring, open_char, close_char) do
649-
nil -> nil
650-
end_pos -> String.slice(substring, 0, end_pos + 1)
665+
if start_pos >= total_size do
666+
nil
667+
else
668+
substring = binary_part(input, start_pos, total_size - start_pos)
669+
670+
case find_balanced_end(substring, open_char, close_char) do
671+
nil -> nil
672+
end_pos -> binary_part(substring, 0, end_pos + byte_size(close_char))
673+
end
651674
end
652675
end
653676

@@ -667,22 +690,25 @@ defmodule JsonRemedy.Layer1.ContentCleaning do
667690

668691
defp find_balanced_end(<<char::utf8, rest::binary>>, open, close, pos, balance, false)
669692
when <<char::utf8>> == open do
670-
find_balanced_end(rest, open, close, pos + 1, balance + 1, false)
693+
char_size = byte_size(<<char::utf8>>)
694+
find_balanced_end(rest, open, close, pos + char_size, balance + 1, false)
671695
end
672696

673697
defp find_balanced_end(<<char::utf8, rest::binary>>, open, close, pos, balance, false)
674698
when <<char::utf8>> == close do
699+
char_size = byte_size(<<char::utf8>>)
675700
new_balance = balance - 1
676701

677702
if new_balance == 0 do
678703
pos
679704
else
680-
find_balanced_end(rest, open, close, pos + 1, new_balance, false)
705+
find_balanced_end(rest, open, close, pos + char_size, new_balance, false)
681706
end
682707
end
683708

684-
defp find_balanced_end(<<_char::utf8, rest::binary>>, open, close, pos, balance, in_string) do
685-
find_balanced_end(rest, open, close, pos + 1, balance, in_string)
709+
defp find_balanced_end(<<char::utf8, rest::binary>>, open, close, pos, balance, in_string) do
710+
char_size = byte_size(<<char::utf8>>)
711+
find_balanced_end(rest, open, close, pos + char_size, balance, in_string)
686712
end
687713

688714
# Check if a string is valid JSON (not just starts with valid char)
@@ -715,12 +741,13 @@ defmodule JsonRemedy.Layer1.ContentCleaning do
715741
# Find where the JSON structure starts
716742
json_start =
717743
case String.split(input, open_char, parts: 2) do
718-
[prefix, _] -> String.length(prefix)
744+
[prefix, _] -> byte_size(prefix)
719745
_ -> 0
720746
end
721747

722-
# Extract from the JSON start to find the balanced end
723-
substring_from_json = String.slice(input, json_start, String.length(input))
748+
total_size = byte_size(input)
749+
substring_size = total_size - json_start
750+
substring_from_json = binary_part(input, json_start, substring_size)
724751

725752
case find_balanced_end(substring_from_json, open_char, close_char) do
726753
nil ->
@@ -732,7 +759,15 @@ defmodule JsonRemedy.Layer1.ContentCleaning do
732759
json_end = json_start + end_pos + 1
733760

734761
# Check if there's non-whitespace content after JSON ends
735-
after_json = String.slice(input, json_end, String.length(input))
762+
after_size = max(total_size - json_end, 0)
763+
764+
after_json =
765+
if after_size > 0 do
766+
binary_part(input, json_end, after_size)
767+
else
768+
""
769+
end
770+
736771
after_json_trimmed = String.trim(after_json)
737772

738773
cond do
@@ -747,7 +782,7 @@ defmodule JsonRemedy.Layer1.ContentCleaning do
747782

748783
true ->
749784
# Extract only the JSON portion, remove wrapper text
750-
json_content = String.slice(input, 0, json_end)
785+
json_content = binary_part(input, 0, json_end)
751786

752787
repair = %{
753788
layer: :content_cleaning,

test/unit/layer1_content_cleaning_test.exs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,30 @@ defmodule JsonRemedy.Layer1.ContentCleaningTest do
263263
# Should have multiple repairs logged
264264
assert length(context.repairs) >= 2
265265
end
266+
267+
test "handles windows newlines across code fences and comments" do
268+
input =
269+
"Here's your data:\r\n```json\r\n// User data\r\n{\r\n \"name\": \"Alice\",\r\n /* age field */\r\n \"age\": 30\r\n}\r\n```\r\nHope this helps!\r\n"
270+
271+
{:ok, result, _context} = ContentCleaning.process(input, %{repairs: [], options: []})
272+
273+
assert String.contains?(result, "\"name\": \"Alice\"")
274+
refute String.contains?(result, "```")
275+
refute String.contains?(result, "//")
276+
refute String.contains?(result, "/*")
277+
refute String.contains?(result, "Hope this helps!")
278+
end
279+
280+
test "removes trailing wrapper text with windows newlines" do
281+
input = "[\r\n {\"id\": 1}\r\n]\r\n1 Volume(s) created\r\n"
282+
283+
{:ok, result, _context} = ContentCleaning.process(input, %{repairs: [], options: []})
284+
285+
trimmed = String.trim(result)
286+
assert String.starts_with?(trimmed, "[")
287+
assert String.ends_with?(trimmed, "]")
288+
refute String.contains?(result, "1 Volume(s) created")
289+
end
266290
end
267291

268292
describe "LayerBehaviour implementation" do

0 commit comments

Comments
 (0)