Skip to content

Commit

Permalink
phone-search analyzer: don't emit extension & unformatted input
Browse files Browse the repository at this point in the history
if these tokens are emitted it meant that phone numbers with other
international dialling prefixes still matched.

e.g. searching for `+1 1234` would also match a number stored as
`+2 1234`, which was wrong.

the tokens still need to be emited for the `phone` analyzer, e.g. when
the user only enters the extension / local number it should still match,
the same is with the other ngrams: these are needed for
search-as-you-type style queries where the user input needs to match
against partial phone numbers.

Signed-off-by: Ralph Ursprung <[email protected]>
  • Loading branch information
rursprung committed Jan 10, 2025
1 parent 107596e commit ff3c8da
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 15 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Fix Shallow copy snapshot failures on closed index ([#16868](https://github.com/opensearch-project/OpenSearch/pull/16868))
- Fix multi-value sort for unsigned long ([#16732](https://github.com/opensearch-project/OpenSearch/pull/16732))
- The `phone-search` analyzer no longer emits the international calling code as a token ([#16993](https://github.com/opensearch-project/OpenSearch/pull/16993))
- The `phone-search` analyzer no longer emits extension numbers and unformatted input as a token ([#16993](https://github.com/opensearch-project/OpenSearch/pull/16993))

### Security

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,18 +128,23 @@ private Set<String> getTokens() throws IOException {
countryCode = Optional.of(String.valueOf(numberProto.getCountryCode()));
input = String.valueOf(numberProto.getNationalNumber());

// add full number as tokens
tokens.add(countryCode.get() + input);

if (addNgrams) {
// Consider the country code as an ngram - it makes no sense in the search analyzer as it'd match all values with the
// same country code
tokens.add(countryCode.get());
}
// Add extension, and the number as tokens
tokens.add(countryCode.get() + input);
if (!Strings.isEmpty(numberProto.getExtension())) {
tokens.add(numberProto.getExtension());

// Add extension without country code (not done for search analyzer as that might match numbers in other countries as
// well!)
if (!Strings.isEmpty(numberProto.getExtension())) {
tokens.add(numberProto.getExtension());
}
// Add unformatted input (most likely the same as the extension now since the prefix has been removed)
tokens.add(input);
}

tokens.add(input);
}
} catch (final NumberParseException | StringIndexOutOfBoundsException e) {
// Libphone didn't like it, no biggie. We'll just ngram the number as it is.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,11 +87,7 @@ public void testEuropeDetailled() throws IOException {
* Test for all tokens which are emitted by the "phone" analyzer.
*/
public void testEuropeDetailledSearch() throws IOException {
assertTokensAreInAnyOrder(
phoneSearchAnalyzer,
"tel:+441344840400",
Arrays.asList("tel:+441344840400", "tel:", "441344840400", "1344840400")
);
assertTokensAreInAnyOrder(phoneSearchAnalyzer, "tel:+441344840400", Arrays.asList("tel:+441344840400", "tel:", "441344840400"));
}

public void testEurope() throws IOException {
Expand Down Expand Up @@ -189,21 +185,21 @@ public void testLocalNumberWithCH() throws IOException {
}

public void testSearchInternationalPrefixWithZZ() throws IOException {
assertTokensInclude(phoneSearchAnalyzer, "+41583161010", Arrays.asList("41583161010", "583161010"));
assertTokensAreInAnyOrder(phoneSearchAnalyzer, "+41583161010", Arrays.asList("+41583161010", "41583161010"));
}

public void testSearchInternationalPrefixWithCH() throws IOException {
assertTokensInclude(phoneSearchCHAnalyzer, "+41583161010", Arrays.asList("41583161010", "583161010"));
assertTokensAreInAnyOrder(phoneSearchCHAnalyzer, "+41583161010", Arrays.asList("+41583161010", "41583161010"));
}

public void testSearchNationalPrefixWithCH() throws IOException {
// + is equivalent to 00 in Switzerland
assertTokensInclude(phoneSearchCHAnalyzer, "0041583161010", Arrays.asList("41583161010", "583161010"));
assertTokensAreInAnyOrder(phoneSearchCHAnalyzer, "0041583161010", Arrays.asList("0041583161010", "41583161010"));
}

public void testSearchLocalNumberWithCH() throws IOException {
// when omitting the international prefix swiss numbers must start with '0'
assertTokensInclude(phoneSearchCHAnalyzer, "0583161010", Arrays.asList("41583161010", "583161010"));
assertTokensAreInAnyOrder(phoneSearchCHAnalyzer, "0583161010", Arrays.asList("0583161010", "41583161010"));
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,18 @@
index:
index: test
id: 3
# number not used in the examples below, just present to make sure that it's never matched
body: { "phone": "+41 12 345 67 89", "phone-ch": "012 345 67 89" }
- do:
index:
index: test
id: 4
# germany has a different phone number length, but for this test we ignore it and pretend they're the same
body: { "phone": "+49 58 316 10 10", "phone-ch": "+49 58 316 10 10" }
- do:
index:
index: test
id: 5
body: { "phone": "+1-888-280-4331", "phone-ch": "+1-888-280-4331" }
- do:
indices.refresh: {}
Expand Down Expand Up @@ -89,6 +101,28 @@
"phone-ch": "0583161010"
- match: { hits.total: 1 }

# search-as-you-type style query
- do:
search:
rest_total_hits_as_int: true
index: test
body:
query:
match:
"phone": "+4158316"
- match: { hits.total: 2 }

# search-as-you-type style query
- do:
search:
rest_total_hits_as_int: true
index: test
body:
query:
match:
"phone-ch": "058316"
- match: { hits.total: 2 }

# international format in document & search will always work
- do:
search:
Expand Down

0 comments on commit ff3c8da

Please sign in to comment.