Skip to content

Commit

Permalink
Fix problems with umlauts and non-normalized characters
Browse files (browse the repository at this point in the history)
  • Loading branch information
MMore committed May 9, 2022
1 parent e60af00 commit 734d326
Show file tree
Hide file tree
Showing 4 changed files with 11 additions and 15 deletions.
13 changes: 4 additions & 9 deletions lib/full_name_splitter.ex
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,11 @@ defmodule FullNameSplitter do
def split(nil), do: {nil, nil}

def split(full_name) do
{:ok, tokens, _} = full_name |> to_charlist() |> :full_name_lexer.string()

{first_name, last_name} =
try do
{:ok, tokens, _} =
full_name |> String.normalize(:nfd) |> to_charlist() |> :full_name_lexer.string()

{:ok, result} = :full_name_parser.parse(tokens)
result
rescue
Expand All @@ -23,12 +24,6 @@ defmodule FullNameSplitter do
{other_names, last_name}
end

{convert_invalid_binary(first_name), convert_invalid_binary(last_name)}
end

defp convert_invalid_binary(nil), do: nil

defp convert_invalid_binary(binary) do
:iconv.convert("iso8859-15", "utf-8", binary)
{first_name, last_name}
end
end
3 changes: 1 addition & 2 deletions mix.exs
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,7 @@ defmodule FullNameSplitter.MixProject do
# Run "mix help deps" to learn about dependencies.
defp deps do
[
{:mix_test_watch, "~> 1.0", only: :dev, runtime: false},
{:iconv, "~> 1.0"}
{:mix_test_watch, "~> 1.0", only: :dev, runtime: false}
]
end
end
1 change: 0 additions & 1 deletion mix.lock
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
%{
"file_system": {:hex, :file_system, "0.2.10", "fb082005a9cd1711c05b5248710f8826b02d7d1784e7c3451f9c1231d4fc162d", [:mix], [], "hexpm", "41195edbfb562a593726eda3b3e8b103a309b733ad25f3d642ba49696bf715dc"},
"iconv": {:hex, :iconv, "1.0.13", "98e723a14a1942e94cb32837ee875d145a7f2266f2c79816a54f1aa77ebff495", [:rebar3], [], "hexpm", "bcb37d52da24161f2c2beeb9234b62e76562919149cc2fc0d3537ac0cef5b977"},
"mix_test_watch": {:hex, :mix_test_watch, "1.1.0", "330bb91c8ed271fe408c42d07e0773340a7938d8a0d281d57a14243eae9dc8c3", [:mix], [{:file_system, "~> 0.2.1 or ~> 0.3", [hex: :file_system, repo: "hexpm", optional: false]}], "hexpm", "52b6b1c476cbb70fd899ca5394506482f12e5f6b0d6acff9df95c7f1e0812ec3"},
}
9 changes: 6 additions & 3 deletions test/full_name_splitter_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,14 @@ defmodule FullNameSplitterTest do
end

test "splits umlauts" do
assert FullNameSplitter.split("Jàáâã Martíñ Müller") == {"Jàáâã Martíñ", "Müller"}
assert FullNameSplitter.split("Jake Ciepły") == {"Jake", "Ciepły"}
assert FullNameSplitter.split("Jacob Müller") == {"Jacob", "Müller"}
assert FullNameSplitter.split("Cadro Ćwikliński") == {"Cadro", "Ćwikliński"}
end

test "splits unrecognized names in a simply way as fallback" do
assert FullNameSplitter.split("Maria del Carmen Menendez") == {"Maria del Carmen", "Menendez"}
test "splits unrecognized or invalid input in a simply way as fallback" do
assert FullNameSplitter.split("Maria Martín del Carmen Menendez") ==
{"Maria Martín del Carmen", "Menendez"}
end

test "ignores whitespaces while splitting" do
Expand Down

0 comments on commit 734d326

Please sign in to comment.