From 31f2bf8542d16f70dd43335b3f28e79be0f6a39a Mon Sep 17 00:00:00 2001 From: "mergify[bot]" <37929162+mergify[bot]@users.noreply.github.com> Date: Thu, 7 Dec 2023 16:16:41 +0100 Subject: [PATCH] [DOCS] Puts lang ident example back. (#2608) (#2611) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry picked from commit 94237ee60b712ffdee56900453773679730325e6) Co-authored-by: István Zoltán Szabó --- .../stack/ml/nlp/ml-nlp-lang-ident.asciidoc | 116 +++++++++++++++++- 1 file changed, 114 insertions(+), 2 deletions(-) diff --git a/docs/en/stack/ml/nlp/ml-nlp-lang-ident.asciidoc b/docs/en/stack/ml/nlp/ml-nlp-lang-ident.asciidoc index 0b1878dcb..c6650e695 100644 --- a/docs/en/stack/ml/nlp/ml-nlp-lang-ident.asciidoc +++ b/docs/en/stack/ml/nlp/ml-nlp-lang-ident.asciidoc @@ -24,9 +24,10 @@ language traditionally uses. These languages are marked in the supported languages table (see below) with the `Latn` subtag. {lang-ident-cap} supports Unicode input. + [discrete] [[ml-lang-ident-supported-languages]] -=== Supported languages +== Supported languages The table below contains the ISO codes and the English names of the languages that {lang-ident} supports. If a language has a 2-letter `ISO 639-1` code, the @@ -82,8 +83,119 @@ script. //// + +[discrete] +[[ml-lang-ident-example]] +== Example of {lang-ident} + +In the following example, we feed the {lang-ident} trained model a short +Hungarian text that contains diacritics and a couple of English words. The +model identifies the text correctly as Hungarian with high probability. + +[source,js] +---------------------------------- +POST _ingest/pipeline/_simulate +{ + "pipeline":{ + "processors":[ + { + "inference":{ + "model_id":"lang_ident_model_1", <1> + "inference_config":{ + "classification":{ + "num_top_classes":5 <2> + } + }, + "field_map":{ + } + } + } + ] + }, + "docs":[ + { + "_source":{ <3> + "text":"Sziasztok! Ez egy rövid magyar szöveg. 
Nézzük, vajon sikerül-e azonosítania a language identification funkciónak? Annak ellenére is sikerülni fog, hogy a szöveg két angol szót is tartalmaz." + } + } + ] +} +---------------------------------- +//NOTCONSOLE + +<1> ID of the {lang-ident} trained model. +<2> Specifies the number of languages to report in descending order of +probability. +<3> The source object that contains the text to identify. + + +In the example above, the `num_top_classes` value indicates that only the top +five languages (that is to say, the ones with the highest probability) are +reported. + +The request returns the following response: + +[source,js] +---------------------------------- +{ + "docs" : [ + { + "doc" : { + "_index" : "_index", + "_type" : "_doc", + "_id" : "_id", + "_source" : { + "text" : "Sziasztok! Ez egy rövid magyar szöveg. Nézzük, vajon sikerül-e azonosítania a language identification funkciónak? Annak ellenére is sikerülni fog, hogy a szöveg két angol szót is tartalmaz.", + "ml" : { + "inference" : { + "top_classes" : [ <1> + { + "class_name" : "hu", + "class_probability" : 0.9999936063740517, + "class_score" : 0.9999936063740517 + }, + { + "class_name" : "lv", + "class_probability" : 2.5020248433413966E-6, + "class_score" : 2.5020248433413966E-6 + }, + { + "class_name" : "is", + "class_probability" : 1.0150420723037688E-6, + "class_score" : 1.0150420723037688E-6 + }, + { + "class_name" : "ga", + "class_probability" : 6.67935962773335E-7, + "class_score" : 6.67935962773335E-7 + }, + { + "class_name" : "tr", + "class_probability" : 5.591166324774555E-7, + "class_score" : 5.591166324774555E-7 + } + ], + "predicted_value" : "hu", <2> + "model_id" : "lang_ident_model_1" + } + } + }, + "_ingest" : { + "timestamp" : "2020-01-22T14:25:14.644912Z" + } + } + } + ] +} +---------------------------------- +//NOTCONSOLE + +<1> Contains scores for the most probable languages. +<2> The ISO identifier of the language with the highest probability. 
+ + [discrete] [[ml-lang-ident-readings]] -=== Further reading +== Further reading * {blog-ref}multilingual-search-using-language-identification-in-elasticsearch[Multilingual search using {lang-ident} in {es}]