From 60bce1f6061ce021ba1072e0322b3df04743ef3f Mon Sep 17 00:00:00 2001 From: Sofian El Guetibi Date: Thu, 5 May 2022 15:07:09 +0200 Subject: [PATCH 01/24] [fix #122] Fix syntax error in Makefile Add missing parenthesis around variable BIN_DIR when it is used. --- src/build/Makefile | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/build/Makefile b/src/build/Makefile index 3f2eccca..b6eec6d4 100644 --- a/src/build/Makefile +++ b/src/build/Makefile @@ -2587,18 +2587,18 @@ no_o_files: .libunitex: no_o_files $(OBJS) Test_lib.o $(RM) Main_*.o - $(CC) *.o $(OPTIONS) $(BUILD_LIB_OPTIONS) -shared -o $BIN_DIR/libunitex.$(DYLIBEXT) - $(CC) Test_lib.o -L$BIN_DIR -Wl,-rpath,. -lunitex $(OPTIONS) $(TEST_LIB_OPTIONS) -o $BIN_DIR/Test_lib + $(CC) *.o $(OPTIONS) $(BUILD_LIB_OPTIONS) -shared -o $(BIN_DIR)/libunitex.$(DYLIBEXT) + $(CC) Test_lib.o -L$(BIN_DIR) -Wl,-rpath,. -lunitex $(OPTIONS) $(TEST_LIB_OPTIONS) -o $(BIN_DIR)/Test_lib .libunitexjni: no_o_files $(UNITEX_JNI_OBJS) $(OBJS) Test_lib.o $(RM) Main_*.o - $(CC) *.o $(OPTIONS) $(BUILD_LIB_OPTIONS) -shared -o $BIN_DIR/libUnitexJni.$(DYLIBEXT) - jar cvf $BIN_DIR/UnitexJni.jar -C ../UnitexLibAndJni/ fr/umlv/unitex/jni/UnitexJni.class - $(CC) Test_lib.o -L$BIN_DIR -Wl,-rpath,. -lUnitexJni $(OPTIONS) $(TEST_LIB_OPTIONS) -o $BIN_DIR/Test_lib + $(CC) *.o $(OPTIONS) $(BUILD_LIB_OPTIONS) -shared -o $(BIN_DIR)/libUnitexJni.$(DYLIBEXT) + jar cvf $(BIN_DIR)/UnitexJni.jar -C ../UnitexLibAndJni/ fr/umlv/unitex/jni/UnitexJni.class + $(CC) Test_lib.o -L$(BIN_DIR) -Wl,-rpath,. -lUnitexJni $(OPTIONS) $(TEST_LIB_OPTIONS) -o $(BIN_DIR)/Test_lib .libunitexstatic: $(OBJS) $(RM) Main_*.o - $(AR) rcs $BIN_DIR/libunitex.a *.o + $(AR) rcs $(BIN_DIR)/libunitex.a *.o touch .libunitexstatic else @@ -2608,18 +2608,18 @@ no_o_files: .libunitex: no_o_files $(LIBTRE) $(LIBLUAJIT) $(OBJS) Test_lib.o $(RM) Main_*.o - $(CC) *.o -shared -Wl,--export-all-symbols -L"." 
$(BUILD_LIB_OPTIONS) $(TRE_LINK_LIB) $(ADDITIONAL_LIB1) $(ADDITIONAL_LIB2) -o $BIN_DIR/unitex.dll - $(CC) Test_lib.o $(OPTIONS) $(TEST_LIB_OPTIONS) $BIN_DIR/unitex.dll -o $BIN_DIR/Test_lib.exe + $(CC) *.o -shared -Wl,--export-all-symbols -L"." $(BUILD_LIB_OPTIONS) $(TRE_LINK_LIB) $(ADDITIONAL_LIB1) $(ADDITIONAL_LIB2) -o $(BIN_DIR)/unitex.dll + $(CC) Test_lib.o $(OPTIONS) $(TEST_LIB_OPTIONS) $(BIN_DIR)/unitex.dll -o $(BIN_DIR)/Test_lib.exe .libunitexjni: no_o_files $(UNITEX_JNI_OBJS) $(UNITEXLIB_IO_OBJS) $(YAML_LINK_OBJS) $(UNITEXLIB_PACK_RUNLOG_OBJS) $(LIBTRE) $(LIBLUAJIT) $(OBJS) Test_lib.o -$(RM) Main_*.o - $(CC) *.o -shared -Wl,--export-all-symbols,--kill-at -L"." $(BUILD_LIB_OPTIONS) $(TRE_LINK_LIB) $(ADDITIONAL_LIB1) $(ADDITIONAL_LIB2) -o $BIN_DIR/UnitexJni.dll - jar cvf $BIN_DIR/UnitexJni.jar -C ../UnitexLibAndJni/ fr/umlv/unitex/jni/UnitexJni.class - $(CC) Test_lib.o $(OPTIONS) $(TEST_LIB_OPTIONS) $BIN_DIR/UnitexJni.dll -o $BIN_DIR/Test_lib.exe + $(CC) *.o -shared -Wl,--export-all-symbols,--kill-at -L"." 
$(BUILD_LIB_OPTIONS) $(TRE_LINK_LIB) $(ADDITIONAL_LIB1) $(ADDITIONAL_LIB2) -o $(BIN_DIR)/UnitexJni.dll + jar cvf $(BIN_DIR)/UnitexJni.jar -C ../UnitexLibAndJni/ fr/umlv/unitex/jni/UnitexJni.class + $(CC) Test_lib.o $(OPTIONS) $(TEST_LIB_OPTIONS) $(BIN_DIR)/UnitexJni.dll -o $(BIN_DIR)/Test_lib.exe .libunitexstatic: $(UNITEXLIB_IO_OBJS) $(YAML_LINK_OBJS) $(UNITEXLIB_PACK_RUNLOG_OBJS) $(OBJS) $(RM) Main_*.o - $(AR) rcs $BIN_DIR/libunitex.a *.o + $(AR) rcs $(BIN_DIR)/libunitex.a *.o touch .libunitexstatic endif From c320d3a9c19455f13e86a08aa25e5176ea298d92 Mon Sep 17 00:00:00 2001 From: Sofian El Guetibi Date: Tue, 5 Jul 2022 17:24:16 +0200 Subject: [PATCH 02/24] [feature #51] Add a parser for the configuration file --- src/Multi2Delaf.cpp | 519 ++++++++++++++++++++++++++++++++++++++++++++ src/Multi2Delaf.h | 139 ++++++++++++ 2 files changed, 658 insertions(+) create mode 100644 src/Multi2Delaf.cpp create mode 100644 src/Multi2Delaf.h diff --git a/src/Multi2Delaf.cpp b/src/Multi2Delaf.cpp new file mode 100644 index 00000000..9bb95722 --- /dev/null +++ b/src/Multi2Delaf.cpp @@ -0,0 +1,519 @@ +/** + * Unitex + * + * Copyright (C) 2001-2021 Université Paris-Est Marne-la-Vallée + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
+ * + */ + +#include "Multi2Delaf.h" + +#include +#include +#include + +#include "File.h" +#include "Pattern.h" +#include "Unicode.h" + + +#ifndef HAS_UNITEX_NAMESPACE +#define HAS_UNITEX_NAMESPACE 1 +#endif + + +#define INPUTSIZEBUFFER 4096 + +namespace unitex { + + +/*================================================================= + * ConfigCommand class method + *================================================================= */ + +/** + * Tokenize a config command. + * Assumes that str ends by '\0'. + * Assumes that empty space are removed from the begin, + * using ConfigLine::advances_to_next_no_blank_char. + * Raise a fatal error in case of malformed command. + */ +std::unique_ptr ConfigCommand::tokenize_config_command( + unichar* str, const char* config_filename) { + unichar* lemma = nullptr; + unichar* part_of_speech = nullptr; + struct list_ustring* semantic_codes = nullptr; + struct list_ustring* inflectional_codes = nullptr; + unichar* ptr = str; + if (str[0] == ',') { + lemma = tokenize_lemma(&ptr, config_filename); + } + if (ptr[0] == '.') { + part_of_speech = tokenize_part_of_speech(&ptr, config_filename); + } + if (ptr[0] == '+') { + semantic_codes = tokenize_semantic_codes(&ptr, config_filename); + } + if (ptr[0] == ':') { + inflectional_codes = tokenize_inflectional_codes(ptr, config_filename); + } + return std::make_unique(lemma, part_of_speech, semantic_codes, + inflectional_codes); +} + +ConfigCommand::ConfigCommand(unichar* lemma, unichar* part_of_speech, + struct list_ustring* semantic_codes, + struct list_ustring* inflectional_codes) + : _lemma{lemma}, + _part_of_speech{part_of_speech}, + _semantic_codes{semantic_codes}, + _inflectional_codes{inflectional_codes} { +} + +ConfigCommand::~ConfigCommand() { + if (_lemma) { + free(_lemma); + } + if (_part_of_speech) { + free(_part_of_speech); + } + if (_semantic_codes) { + free_list_ustring(_semantic_codes); + } + if (_inflectional_codes) { + free_list_ustring(_inflectional_codes); + } +} + 
+unichar* ConfigCommand::get_lemma() const { + return _lemma; +} + +unichar* ConfigCommand::get_part_of_speech() const { + return _part_of_speech; +} + +/** + * Return a new allocated unicode string describing the lemma. + * Set *ptr to the next unread character. + * Assumes that (*ptr)[0] == ','. + */ +unichar* ConfigCommand::tokenize_lemma(unichar** ptr, + const char* config_filename) { + unichar* line = *ptr; + // try to read ,,copy + if (line[0] == ',' && line[1] == ',') { + if (!u_starts_with(line + 1, COMMA_COPY)) { + fatal_error("Double ',' in file: %s, line: `%S`\n", config_filename, + line); + } + *ptr = line + 1 + strlen(COMMA_COPY); + if (**ptr != '\0' && **ptr != '.' && **ptr != '+' && **ptr != ':') { + fatal_error("Double ',' in file: %s, line: `%S`\n", config_filename, + line); + } + return u_strdup(COMMA_COPY); + } + int i = 1; + while (line[i] != '\0' && line[i] != '.' && line[i] != '+' && + line[i] != ':') { + i++; + } + *ptr = line + i; + return u_strndup(line + 1, i - 1); +} + +/** + * Return a new allocated unicode string describing the part of speech. + * Set *ptr to the next unread character. + * Assumes that (*ptr)[0] == '.'. + */ +unichar* ConfigCommand::tokenize_part_of_speech(unichar** ptr, + const char* config_filename) { + unichar* line = *ptr; + // try to read ..copy + if (line[0] == '.' && line[1] == '.') { + if (!u_starts_with(line + 1, DOT_COPY)) { + fatal_error("Double '.' in file: %s, line: '%S'\n", config_filename, + line); + } + *ptr = line + 1 + strlen(DOT_COPY); + if (**ptr != '\0' && **ptr != '+' && **ptr != ':') { + fatal_error("Double '.' in file: %s, line: '%S'\n", config_filename, + line); + } + return u_strdup(DOT_COPY); + } + int i = 1; + while (line[i] != '\0' && line[i] != '+' && line[i] != ':') { + i++; + } + *ptr = line + i; + return u_strndup(line + 1, i - 1); +} + +/** + * Return a new allocated unicode string describing a semantic code. + * Set *ptr to the next unread character. + * Assumes that (*ptr)[0] == '+'. 
+ */ +unichar* ConfigCommand::tokenize_one_semantic_code( + unichar** ptr, const char* config_filename) { + unichar* line = *ptr; + // try to read ++copy + if (line[0] == '+' && line[1] == '+') { + if (!u_starts_with(line + 1, PLUS_COPY)) { + fatal_error("Double '+' in file: %s, line: '%S'\n", config_filename, + line); + } + *ptr = line + 1 + strlen(PLUS_COPY); + if (**ptr != '\0' && **ptr != '+' && **ptr != ':') { + fatal_error("Double '+' in file: %s, line: '%S'\n", config_filename, + line); + } + return u_strdup(PLUS_COPY); + } + int i = 1; + while (line[i] != '\0' && line[i] != '+' && line[i] != ':') { + i++; + } + *ptr = line + i; + return u_strndup(line + 1, i - 1); +} + +/** + * Return a new allocated list of semantic codes. + * Set *ptr to the next unread character. + * Assumes that (*ptr)[0] == '+'. + */ +struct list_ustring* ConfigCommand::tokenize_semantic_codes( + unichar** ptr, const char* config_filename) { + unichar* line = *ptr; + unichar* next_code = line; + struct list_ustring* codes = nullptr; + while (*next_code != '\0' && *next_code != ':') { + if (*next_code == '+') { + unichar* new_code = + tokenize_one_semantic_code(&next_code, config_filename); + if (codes == nullptr) { + codes = new_list_ustring(new_code); + } else if (!is_in_list(new_code, codes)) { + insert_at_end_of_list(new_code, codes); + } + free(new_code); + } + } + *ptr = next_code; + return codes; +} + +/** + * Return a new allocated unicode string describing an inflectional code. + * Set *ptr to the next unread character. + * Assumes that (*ptr)[0] == ':'. 
+ */ +unichar* ConfigCommand::tokenize_one_inflectional_code( + unichar** ptr, const char* config_filename) { + unichar* line = *ptr; + // try to read ::copy + if (line[0] == ':' && line[1] == ':') { + if (!u_starts_with(line + 1, COLUMN_COPY)) { + fatal_error("Double ':' in file: %s, line: '%S'\n", config_filename, + line); + } + *ptr = line + 1 + strlen(COLUMN_COPY); + if (**ptr != '\0' && **ptr != ':') { + fatal_error("Double ':' in file: %s, line: '%S'\n", config_filename, + line); + } + return u_strdup(COLUMN_COPY); + } + int i = 1; + while (line[i] != '\0' && line[i] != ':') { + i++; + } + *ptr = line + i; + return u_strndup(line + 1, i - 1); +} + +/** + * Return a new allocated list of inflectional codes. + * Assumes that str[0] == ':'. + */ +struct list_ustring* ConfigCommand::tokenize_inflectional_codes( + unichar* str, const char* config_filename) { + struct list_ustring* codes = nullptr; + unichar* next_code = str; + while (*next_code != '\0') { + if (*next_code == ':') { + unichar* new_code = + tokenize_one_inflectional_code(&next_code, config_filename); + if (codes == nullptr) { + codes = new_list_ustring(new_code); + } else if (!is_in_list(new_code, codes)) { + insert_at_end_of_list(new_code, codes); + } + free(new_code); + } + } + return codes; +} + +/*================================================================= + * ConfigLine class method + *================================================================= */ + +/** + * Tokenize a config line. + * Returns a std::unique_ptr if there is a well-formed line. + * otherwise returns nullptr. + * Raises a fatal error in case of malformed line. 
+ */ +std::unique_ptr ConfigLine::tokenize_config_line( + unichar* line, const char* config_filename) { + unichar* nextNoEmptyUnichar = line; + unichar patternToken[INPUTSIZEBUFFER] = {0}; + if (advance_to_next_no_blank_char(&nextNoEmptyUnichar)) { + return nullptr; // skip empty line + } + if (nextNoEmptyUnichar[0] == '#') { + return nullptr; // skip comment line + } + if (recognize_pattern_token(&nextNoEmptyUnichar, patternToken)) { + fatal_error( + "Lexical mask must be enclosed in < >, like in file: %s, " + "line: '%S'\n", + config_filename, line); + } + // build pattern + struct pattern* pattern = build_pattern(patternToken, nullptr, 0, nullptr); + if (advance_to_next_no_blank_char(&nextNoEmptyUnichar)) { + // if there is no command, we skip the current line like comment line + return nullptr; + } + // build nb_required_tag + int nb_required_tag = NOT_SPECIFIED; + if (nextNoEmptyUnichar[0] == '{') { + if (tokenize_nb_required_tag(&nextNoEmptyUnichar, &nb_required_tag)) { + fatal_error("Braces must contain number in file: %s, line: '%S'\n", + config_filename, line); + } + } + if (advance_to_next_no_blank_char(&nextNoEmptyUnichar)) { + // if there is no command, we skip the current line like comment line + return nullptr; + } + // build config command + auto config_commang = ConfigCommand::tokenize_config_command( + nextNoEmptyUnichar, config_filename); + if (u_strcmp(config_commang->get_lemma(), ConfigCommand::COMMA_COPY) == 0 && + nb_required_tag != NOT_SPECIFIED && nb_required_tag != 1) { + fatal_error( + "Command ,,copy is incompatible with an integer enclosed in curly " + "braces, except for {1} in file: %s, line: '%S'\n", + config_filename, line); + } + if (u_strcmp(config_commang->get_part_of_speech(), ConfigCommand::DOT_COPY) == + 0 && + nb_required_tag != NOT_SPECIFIED && nb_required_tag != 1) { + fatal_error( + "Command ..copy is incompatible with an integer enclosed in curly " + "braces, except for {1}in file:%s, line: '%S'\n", + config_filename, 
line); + } + return std::make_unique(pattern, nb_required_tag, + std::move(config_commang)); +} + +ConfigLine::ConfigLine(struct pattern* pattern, int nb_required_tag, + std::shared_ptr config_command) + : _pattern{pattern}, + _nb_required_tag{nb_required_tag}, + _config_command{std::move(config_command)} { +} + +ConfigLine::~ConfigLine() { + free_pattern(_pattern); +} + +/** + * Set *str to the next no blank character. + * Assumes that *str ended with '\0'. + * Returns 1 if *str ends with blank char, otherwise returns 0. + */ +int ConfigLine::advance_to_next_no_blank_char(unichar** str) { + int i = 0; + while ((*str)[i] != '\0' && ((*str)[i] == ' ' || (*str)[i] == '\t')) { + i++; + } + if ((*str)[i] == '\0') { + return 1; + } + *str = *str + i; + return 0; +} + +/** + * Return 1 if error occurs, otherwise 0. + * Assumes that pattern is suround by < >. + * Assumes that empty space are removed from the begin, + * using ConfigLine::advances_to_next_no_blank_char. + */ +int ConfigLine::recognize_pattern_token(unichar** ptr, unichar* res) { + unichar* line = *ptr; + int index_line = 0; + int index_res = 0; + if (line[index_line] != '<') { + return 1; + } + index_line++; + while (line[index_line] != '\0') { + if (line[index_line] != '>') { + res[index_res] = line[index_line]; + index_res++; + index_line++; + } else { + res[index_res] = '\0'; + *ptr = line + index_line + 1; + return 0; + } + } + return 1; +} + +/** + * Return 1 if error occurs, otherwise 0. + * Assumes that empty space are removed from the begin, + * using ConfigLine::advances_to_next_no_blank_char. 
+ */
+int ConfigLine::tokenize_nb_required_tag(unichar** ptr, int* res) {
+  unichar* line = *ptr;
+  int index = 0;
+  int index_buffer = 0;
+  unichar buffer[INPUTSIZEBUFFER] = {0};
+  if (line[0] != '{') {
+    return 1;
+  }
+  index++;
+  while (line[index] != '\0') {
+    if (line[index] != '}') {
+      if (line[index] < '0' || line[index] > '9') {
+        return 1;
+      }
+      buffer[index_buffer] = line[index];
+      index++;
+      index_buffer++;
+    } else {
+      if (index == 1) {
+        // {} is a syntax error
+        return 1;
+      }
+      buffer[index_buffer] = '\0';  // terminate the digits; res is one int
+      *res = u_parse_int(buffer, nullptr);
+      *ptr = line + index + 1;
+      return 0;
+    }
+  }
+  return 1;
+}
+
+
+/*=================================================================
+ * Multi2Delaf class method
+ *================================================================= */
+
+Multi2Delaf::Multi2Delaf(const char* config_filename)
+    : _config_filename{config_filename} {
+}
+
+/**
+ * Read the configuration file that specifies how to transcode the
+ * multidelaf string into a delaf tag.
+ * Raises a fatal error in case of malformed file.
+ */
+void Multi2Delaf::parse_config_file() {
+  U_FILE* config_file = u_fopen(&_vec, _config_filename, U_READ);
+  if (config_file == nullptr) {
+    fatal_error("Cannot open configuration file %s\n", _config_filename);
+  }
+  load_config_file(config_file);
+  u_fclose(config_file);
+}
+
+/**
+ * Read a line from the configuration file, and save the result in buffer.
+ * Returns EOF if end of file occurs, otherwise the number of unichar readed.
+ */ +int Multi2Delaf::read_line_config_file(U_FILE* config_file, unichar* buffer, + int size_buffer) { + int c = 0; + int i = 0; + while (i < size_buffer - 1 && EOF != (c = u_fgetc(config_file))) { + if (c == '\r') { + if ('\n' == (c = u_fgetc(config_file))) { + buffer[i] = '\0'; + return i; + } + unichar rest_of_line[INPUTSIZEBUFFER] = {0}; + int j = 0; + while (j < INPUTSIZEBUFFER - 1 && EOF != (c = u_fgetc(config_file))) { + if (c != '\n') { + rest_of_line[j] = c; + j++; + } else { + rest_of_line[j] = '\0'; + u_fprintf(U_STDERR, "\\r not followed by a \\n, '%S' is ignored\n", + rest_of_line); + return i + j; + } + } + } + if (c == '\n') { + buffer[i] = '\0'; + return i; + } + buffer[i] = c; + i++; + } + buffer[i] = '\0'; + return EOF; +} + +/** + * Load the configuration file. + * Raises a fatal error in case of malformed file. + */ +void Multi2Delaf::load_config_file(U_FILE* config_file) { + unichar line[INPUTSIZEBUFFER] = {0}; + int eof = 0; + while (EOF != + (eof = read_line_config_file(config_file, line, INPUTSIZEBUFFER))) { + auto config_line = ConfigLine::tokenize_config_line( + line, filename_without_path(_config_filename)); + if (config_line != nullptr) { + _config_lines.emplace_back(std::move(config_line)); + } + } + // the last line is potentially a config line + auto config_line = ConfigLine::tokenize_config_line( + line, filename_without_path(_config_filename)); + if (config_line != nullptr) { + _config_lines.emplace_back(std::move(config_line)); + } +} + + +} // namespace unitex diff --git a/src/Multi2Delaf.h b/src/Multi2Delaf.h new file mode 100644 index 00000000..82def9af --- /dev/null +++ b/src/Multi2Delaf.h @@ -0,0 +1,139 @@ +/** + * Unitex + * + * Copyright (C) 2001-2021 Université Paris-Est Marne-la-Vallée + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at 
your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. + * + */ + +#ifndef MULTI2DELAFH +#define MULTI2DELAFH + +#ifndef HAS_UNITEX_NAMESPACE +#define HAS_UNITEX_NAMESPACE 1 +#endif + +#include +#include + +#include "Pattern.h" +#include "Unicode.h" + +namespace unitex { + +/** + * Describes a command in the file that configures the parser for multidelaf strings. + * A multidelaf string is an output of Explore graph paths made of a sequence of + * Delaf tags. + */ +class ConfigCommand { + public: + static constexpr const char* const COMMA_COPY = ",copy"; + static constexpr const char* const DOT_COPY = ".copy"; + static constexpr const char* const PLUS_COPY = "+copy"; + static constexpr const char* const COLUMN_COPY = ":copy"; + + static std::unique_ptr tokenize_config_command( + unichar* str, const char* config_filename); + + ConfigCommand(unichar* lemma, unichar* part_of_speech, + struct list_ustring* semantic_codes, + struct list_ustring* inflectional_codes); + // Not copyable or movable + ConfigCommand(const ConfigCommand&) = delete; + ConfigCommand(ConfigCommand&&) = delete; + ConfigCommand& operator=(const ConfigCommand& other) = delete; + ConfigCommand& operator=(ConfigCommand&& other) = delete; + ~ConfigCommand(); + unichar* get_lemma() const; + unichar* get_part_of_speech() const; + + private: + static unichar* tokenize_lemma(unichar** ptr, const char* config_filename); + static unichar* tokenize_part_of_speech(unichar** ptr, + const char* config_filename); + static unichar* 
tokenize_one_semantic_code(unichar** ptr, + const char* config_filename); + static struct list_ustring* tokenize_semantic_codes( + unichar** ptr, const char* config_filename); + static unichar* tokenize_one_inflectional_code(unichar** ptr, + const char* config_filename); + static struct list_ustring* tokenize_inflectional_codes( + unichar* str, const char* config_filename); + unichar* _lemma; + unichar* _part_of_speech; + struct list_ustring* _semantic_codes; + struct list_ustring* _inflectional_codes; +}; + +/** + * Describes a line in the file that configures the parser for multidelaf strings. + * A multidelaf string is an output of Explore graph paths made of a sequence of + * Delaf tags. + */ +class ConfigLine { + public: + static constexpr int NOT_SPECIFIED = -1; // nb_required_tag default value + + static std::unique_ptr tokenize_config_line( + unichar* line, const char* config_filename); + + ConfigLine(struct pattern* pattern, int nb_required_tag, + std::shared_ptr _config_command); + // Not copyable or movable + ConfigLine(const ConfigLine&) = delete; + ConfigLine(ConfigLine&&) = delete; + ConfigLine& operator=(const ConfigLine& other) = delete; + ConfigLine& operator=(ConfigLine&& other) = delete; + ~ConfigLine(); + static int advance_to_next_no_blank_char(unichar** str); + + private: + static int recognize_pattern_token(unichar** ptr, unichar* res); + static int tokenize_nb_required_tag(unichar** ptr, int* res); + struct pattern* _pattern; // lexical mask + // if _nb_required_tag equals to NOT_SPECIFIED, the number of delaf tags that must match the pattern is not specified + // else if _nb_required_tag equals to 0, the pattern must not match any delaf tag + // otherwise the numer of delaf tag that must match the pattern + const int _nb_required_tag; + const std::shared_ptr _config_command; +}; + +/** + * Multi2Delaf class + */ +class Multi2Delaf { + public: + Multi2Delaf(const char* config_filename); + // Not copyable or movable + Multi2Delaf(const 
Multi2Delaf&) = delete; + Multi2Delaf(Multi2Delaf&&) = delete; + Multi2Delaf& operator=(const Multi2Delaf& other) = delete; + Multi2Delaf& operator=(Multi2Delaf&& other) = delete; + void parse_config_file(); + + private: + static int read_line_config_file(U_FILE* config_file, unichar* buffer, + int size_buffer); + void load_config_file(U_FILE* config_file); + const VersatileEncodingConfig _vec = VEC_DEFAULT; + std::vector> _config_lines; + const char* _config_filename; +}; + +} // namespace unitex + +#endif From 8d7d616f9e0beb7706be3b4e28af7bc724840d63 Mon Sep 17 00:00:00 2001 From: Sofian El Guetibi Date: Fri, 15 Jul 2022 18:57:45 +0200 Subject: [PATCH 03/24] [feature/multi2delaf] Add the translate function --- src/Multi2Delaf.cpp | 385 ++++++++++++++++++++++++++++++++++++++++++++ src/Multi2Delaf.h | 24 +++ 2 files changed, 409 insertions(+) diff --git a/src/Multi2Delaf.cpp b/src/Multi2Delaf.cpp index 9bb95722..1ddcf4a6 100644 --- a/src/Multi2Delaf.cpp +++ b/src/Multi2Delaf.cpp @@ -21,10 +21,12 @@ #include "Multi2Delaf.h" +#include #include #include #include +#include "DELA.h" #include "File.h" #include "Pattern.h" #include "Unicode.h" @@ -106,6 +108,14 @@ unichar* ConfigCommand::get_part_of_speech() const { return _part_of_speech; } +struct list_ustring* ConfigCommand::get_semantic_codes() const { + return _semantic_codes; +} + +struct list_ustring* ConfigCommand::get_inflectional_codes() const { + return _inflectional_codes; +} + /** * Return a new allocated unicode string describing the lemma. * Set *ptr to the next unread character. @@ -349,6 +359,18 @@ ConfigLine::~ConfigLine() { free_pattern(_pattern); } +struct pattern* ConfigLine::get_pattern() const { + return _pattern; +} + +int ConfigLine::get_nb_required_tag() const { + return _nb_required_tag; +} + +std::shared_ptr ConfigLine::get_config_command() const { + return _config_command; +} + /** * Set *str to the next no blank character. * Assumes that *str ended with '\0'. 
@@ -453,6 +475,43 @@ void Multi2Delaf::parse_config_file() { u_fclose(config_file); } +/** + * Translate a multidelaf string to a delaf tag using the config file. + * Make the translation in place in the buffer. + * Supposes that the config file is already readed. + */ +void Multi2Delaf::translate_multidelaf_to_delaf(const unichar* inflected_input, + unichar* buffer) const { + unichar* ptr = buffer; + auto delaf_tags = std::vector(); + struct dela_entry* new_tag = nullptr; + while (nullptr != (new_tag = tokenize_delaf_tag(&ptr))) { + delaf_tags.push_back(new_tag); + } + unichar* inflected = escape_inflected_input(inflected_input); + unichar* lemma = retrieve_lemma(delaf_tags, buffer); + unichar* part_of_speech = retrieve_part_of_speech(delaf_tags, buffer); + unichar* semantic_codes = retrieve_semantic_codes(delaf_tags); + unichar* inflectional_codes = retrieve_inflectional_codes(delaf_tags); + if (u_strlen(inflected) + u_strlen(lemma) + u_strlen(part_of_speech) + + u_strlen(semantic_codes) + u_strlen(inflectional_codes) + 2 >= + INPUTSIZEBUFFER) { + fatal_error( + "internal err(Multi2Delaf::translate_multidelaf_to_delaf): buffer is " + "not big enough\n"); + } + u_sprintf(buffer, "%S,%S.%S%S%S", inflected, lemma, part_of_speech, + semantic_codes, inflectional_codes); + free(lemma); + free(inflected); + free(part_of_speech); + free(semantic_codes); + free(inflectional_codes); + for (const auto& tag : delaf_tags) { + free_dela_entry(tag); + } +} + /** * Read a line from the configuration file, and save the result in buffer. * Returns EOF if end of file occurs, otherwise the number of unichar readed. @@ -515,5 +574,331 @@ void Multi2Delaf::load_config_file(U_FILE* config_file) { } } +/** + * Return the first delaf tag at the address *ptr. + * Return nullptr to indicate that there is no more tag. + * Set *ptr to the next unread character. + * Raises a fatal error if the delaf tag is not enclosed in curly braces. 
+ */
+struct dela_entry* Multi2Delaf::tokenize_delaf_tag(unichar** ptr) {
+  unichar* line = *ptr;
+  unichar* next_no_blank_char = line;
+  if (ConfigLine::advance_to_next_no_blank_char(&next_no_blank_char)) {
+    return nullptr;  // end of the line, no more dela_entry
+  }
+  if (next_no_blank_char[0] != '{') {
+    fatal_error("Delaf tag must be enclosed in curly braces, line: '%S'\n '",
+                line);
+  }
+  int i = 1;
+  while (next_no_blank_char[i] != '\0' && next_no_blank_char[i] != '}') {
+    i++;
+  }
+  if (next_no_blank_char[i] == '\0') {
+    fatal_error("Delaf tag must be enclosed in curly braces, line: '%S'\n '",
+                line);
+  }
+  unichar* token_dela_entry = u_strndup(next_no_blank_char + 1, i - 1);
+  struct dela_entry* tag = tokenize_DELAF_line(token_dela_entry);
+  free(token_dela_entry);
+  *ptr = next_no_blank_char + i + 1;  // i counts from the '{'; + 1 skips '}'
+  return tag;
+}
+
+/**
+ * Return the number of delaf tags that match the pattern.
+ */
+int Multi2Delaf::nb_delaf_tag_that_match_pattern(
+    const std::vector& delaf_tags,
+    const struct pattern* pattern) {
+  return std::count_if(delaf_tags.begin(), delaf_tags.end(),
+                       [&pattern](const auto& tag) {
+                         return is_entry_compatible_with_pattern(tag, pattern);
+                       });
+}
+
+/**
+ * Return a new allocated copy of input with '=', '.', ',' backslash-escaped.
+ */
+unichar* Multi2Delaf::escape_inflected_input(const unichar* input) {
+  unichar buffer[INPUTSIZEBUFFER] = {0};
+  int i_input = 0;
+  int i_buffer = 0;
+  while (input[i_input] != '\0' && i_buffer < INPUTSIZEBUFFER - 1) {
+    if (input[i_input] == ' ' && input[i_input + 1] == '\0') {
+      buffer[i_buffer] = '\0';
+      return u_strdup(buffer);
+    }
+    if (input[i_input] == '=' || input[i_input] == '.' ||
+        input[i_input] == ',') {
+      buffer[i_buffer] = '\\';
+      i_buffer++;
+    }
+    buffer[i_buffer] = input[i_input];
+    i_buffer++;
+    i_input++;
+  }
+  buffer[std::min(i_buffer, INPUTSIZEBUFFER - 1)] = '\0';
+  return u_strdup(buffer);
+}
+
+/**
+ * Retrieve the lemma according to the specification:
+ * We look every line of _config_lines and keep the lemma of the first-one matching containing a lemma.
+ * If no lemma is corresponding, raises a fatal_error().
+ * Return a new allocated unicode string describing the lemma.
+ */
+unichar* Multi2Delaf::retrieve_lemma(
+    const std::vector& delaf_tags,
+    const unichar* multidelaf_string) const {
+  for (const auto& current_line : _config_lines) {
+    if (current_line->get_config_command()->get_lemma() == nullptr) {
+      continue;
+    }
+    for (const auto& tag : delaf_tags) {
+      if (is_entry_compatible_with_pattern(tag, current_line->get_pattern())) {
+        if (u_strcmp(current_line->get_config_command()->get_lemma(),
+                     ConfigCommand::COMMA_COPY) == 0) {
+          if (nb_delaf_tag_that_match_pattern(
+                  delaf_tags, current_line->get_pattern()) != 1) {
+            fatal_error(
+                "Command ,,copy can be interpreted for several delaf line: "
+                "%S\n",
+                multidelaf_string);
+          }
+          return u_strdup(tag->lemma);
+        }
+        if (current_line->get_nb_required_tag() == 1 ||
+            current_line->get_nb_required_tag() == ConfigLine::NOT_SPECIFIED) {
+          return u_strdup(current_line->get_config_command()->get_lemma());
+        }
+        if (current_line->get_nb_required_tag() ==
+            nb_delaf_tag_that_match_pattern(delaf_tags,
+                                            current_line->get_pattern())) {
+          return u_strdup(current_line->get_config_command()->get_lemma());
+        }
+      }
+    }
+    if (current_line->get_nb_required_tag() == 0) {
+      return u_strdup(current_line->get_config_command()->get_lemma());
+    }
+  }
+  fatal_error("No lemma is provided for this multidelaf string: %S\n",
+              multidelaf_string);
+  return nullptr;
+}
+
+/**
+ * Retrieves the part_of_speech according to the specification:
+ * We look every line of config lines and keep the
part_of_speech of the first-one matching containing a part_of_speech. + * If no part_of_speech is corresponding, raises a fatal_error(). + * Return a new allocated unicode string describing the part of speech. + */ +unichar* Multi2Delaf::retrieve_part_of_speech( + const std::vector& delaf_tags, + const unichar* multidelaf_string) const { + for (const auto& current_line : _config_lines) { + if (current_line->get_config_command()->get_part_of_speech() == nullptr) { + continue; + } + for (const auto& tag : delaf_tags) { + if (is_entry_compatible_with_pattern(tag, current_line->get_pattern())) { + if (u_strcmp(current_line->get_config_command()->get_part_of_speech(), + ConfigCommand::DOT_COPY) == 0) { + if (nb_delaf_tag_that_match_pattern( + delaf_tags, current_line->get_pattern()) != 1) { + fatal_error( + "Command ..copy can be interpreted for several delaf tag: %S\n", + multidelaf_string); + } + return u_strdup(tag->semantic_codes[0]); + } + if (current_line->get_nb_required_tag() == 1 || + current_line->get_nb_required_tag() == ConfigLine::NOT_SPECIFIED) { + return u_strdup( + current_line->get_config_command()->get_part_of_speech()); + } + if (current_line->get_nb_required_tag() == + nb_delaf_tag_that_match_pattern(delaf_tags, + current_line->get_pattern())) { + return u_strdup( + current_line->get_config_command()->get_part_of_speech()); + } + } + } + if (current_line->get_nb_required_tag() == 0) { + return u_strdup(current_line->get_config_command()->get_part_of_speech()); + } + } + fatal_error( + "No grammatical cathegory is provided for this multidelaf string: %S\n", + multidelaf_string); + return nullptr; +} + +/** + * Retrieves semantic codes according to the specification: + * The semantic codes of the multidelaf string are obtained by taking the union + * of the semantic codes of each tag assigned by the configuration file. + * If no semantic_codes is corresponding, return an empty string. + * Return a new allocated unicode string describing semantic codes. 
+ */ +unichar* Multi2Delaf::retrieve_semantic_codes( + const std::vector& delaf_tags) const { + struct list_ustring* codes = nullptr; + struct list_ustring* ptr_command = nullptr; + + for (const auto& tag : delaf_tags) { + for (const auto& line : _config_lines) { + if (line->get_config_command()->get_semantic_codes() == nullptr) { + continue; + } + if (is_entry_compatible_with_pattern(tag, line->get_pattern())) { + ptr_command = line->get_config_command()->get_semantic_codes(); + while (ptr_command != nullptr) { + if (line->get_nb_required_tag() == ConfigLine::NOT_SPECIFIED || + line->get_nb_required_tag() == + nb_delaf_tag_that_match_pattern(delaf_tags, + line->get_pattern())) { + if (u_strcmp(ptr_command->string, ConfigCommand::PLUS_COPY) == 0) { + for (int i = 1; i < tag->n_semantic_codes; + i++) { // begin at 1 to skip the grammatical catergory + if (codes == nullptr) { + codes = new_list_ustring(tag->semantic_codes[i]); + } else if (!is_in_list(tag->semantic_codes[i], codes)) { + insert_at_end_of_list(tag->semantic_codes[i], codes); + } + } + } else { + if (line->get_nb_required_tag() != 0) { + if (codes == nullptr) { + codes = new_list_ustring(ptr_command->string); + } else if (!is_in_list(ptr_command->string, codes)) { + insert_at_end_of_list(ptr_command->string, codes); + } + } + } + } + ptr_command = ptr_command->next; + } + } else { + if (0 == line->get_nb_required_tag() && + 0 == nb_delaf_tag_that_match_pattern(delaf_tags, + line->get_pattern())) { + struct list_ustring* ptr_command = + line->get_config_command()->get_semantic_codes(); + while (ptr_command != nullptr) { + if (codes == nullptr) { + codes = new_list_ustring(ptr_command->string); + } else { + if (!is_in_list(ptr_command->string, codes)) { + insert_at_end_of_list(ptr_command->string, codes); + } + } + ptr_command = ptr_command->next; + } + } + } + } + } + unichar* res = build_output_codes(codes, '+'); + if (codes) { + free_list_ustring(codes); + } + return res; +} + +/** + * Retrieves 
inflectional codes according to the specification: + * The inflectional codes of the multidelaf string are obtained by taking the union + * of the inflectional codes of each tag assigned by the configuration file. + * If no inflectional_codes is corresponding, return an empty string. + * Return a new allocated unicode string describing inflectional codes. + */ +unichar* Multi2Delaf::retrieve_inflectional_codes( + const std::vector& delaf_tags) const { + struct list_ustring* codes = nullptr; + struct list_ustring* ptr_command = nullptr; + + for (const auto& tag : delaf_tags) { + for (const auto& line : _config_lines) { + if (line->get_config_command()->get_inflectional_codes() == nullptr) { + continue; + } + if (is_entry_compatible_with_pattern(tag, line->get_pattern())) { + ptr_command = line->get_config_command()->get_inflectional_codes(); + while (ptr_command != nullptr) { + if (line->get_nb_required_tag() == ConfigLine::NOT_SPECIFIED || + line->get_nb_required_tag() == + nb_delaf_tag_that_match_pattern(delaf_tags, + line->get_pattern())) { + if (u_strcmp(ptr_command->string, ConfigCommand::COLUMN_COPY) == + 0) { + for (int i = 0; i < tag->n_inflectional_codes; i++) { + if (codes == nullptr) { + codes = new_list_ustring(tag->inflectional_codes[i]); + } else if (!is_in_list(tag->inflectional_codes[i], codes)) { + insert_at_end_of_list(tag->inflectional_codes[i], codes); + } + } + } else { + if (line->get_nb_required_tag() != 0) { + if (codes == nullptr) { + codes = new_list_ustring(ptr_command->string); + } else if (!is_in_list(ptr_command->string, codes)) { + insert_at_end_of_list(ptr_command->string, codes); + } + } + } + } + ptr_command = ptr_command->next; + } + } else { + if (0 == line->get_nb_required_tag() && + 0 == nb_delaf_tag_that_match_pattern(delaf_tags, + line->get_pattern())) { + struct list_ustring* ptr_command = + line->get_config_command()->get_inflectional_codes(); + while (ptr_command != nullptr) { + if (codes == nullptr) { + codes = 
new_list_ustring(ptr_command->string); + } else { + if (!is_in_list(ptr_command->string, codes)) { + insert_at_end_of_list(ptr_command->string, codes); + } + } + ptr_command = ptr_command->next; + } + } + } + } + } + unichar* res = build_output_codes(codes, ':'); + if (codes) { + free_list_ustring(codes); + } + return res; +} + +/** + * Return a new allocated unicode string. + */ +unichar* Multi2Delaf::build_output_codes(const struct list_ustring* codes, + char prefix) { + unichar buffer[INPUTSIZEBUFFER] = {0}; + if (codes == nullptr) { + return u_strdup(""); + } + while (codes != nullptr) { + if (u_strlen(buffer) + u_strlen(codes->string) + 2 >= INPUTSIZEBUFFER - 1) { + fatal_error( + "internal err(Multi2Delaf::build_output_codes): buffer is not " + "big enough\n"); + } + u_sprintf(buffer, "%S%c%S", buffer, prefix, codes->string); + codes = codes->next; + } + return u_strdup(buffer); +} } // namespace unitex diff --git a/src/Multi2Delaf.h b/src/Multi2Delaf.h index 82def9af..08151583 100644 --- a/src/Multi2Delaf.h +++ b/src/Multi2Delaf.h @@ -29,6 +29,7 @@ #include #include +#include "DELA.h" #include "Pattern.h" #include "Unicode.h" @@ -60,6 +61,8 @@ class ConfigCommand { ~ConfigCommand(); unichar* get_lemma() const; unichar* get_part_of_speech() const; + struct list_ustring* get_semantic_codes() const; + struct list_ustring* get_inflectional_codes() const; private: static unichar* tokenize_lemma(unichar** ptr, const char* config_filename); @@ -100,6 +103,9 @@ class ConfigLine { ConfigLine& operator=(ConfigLine&& other) = delete; ~ConfigLine(); static int advance_to_next_no_blank_char(unichar** str); + struct pattern* get_pattern() const; + int get_nb_required_tag() const; + std::shared_ptr get_config_command() const; private: static int recognize_pattern_token(unichar** ptr, unichar* res); @@ -124,11 +130,29 @@ class Multi2Delaf { Multi2Delaf& operator=(const Multi2Delaf& other) = delete; Multi2Delaf& operator=(Multi2Delaf&& other) = delete; void 
parse_config_file(); + void translate_multidelaf_to_delaf(const unichar* inflected_input, + unichar* buffer) const; private: static int read_line_config_file(U_FILE* config_file, unichar* buffer, int size_buffer); void load_config_file(U_FILE* config_file); + static struct dela_entry* tokenize_delaf_tag(unichar** next); + static int nb_delaf_tag_that_match_pattern( + const std::vector& delaf_tags, + const struct pattern* pattern); + static unichar* escape_inflected_input(const unichar* input); + unichar* retrieve_lemma(const std::vector& delaf_tags, + const unichar* multidelaf_string) const; + unichar* retrieve_part_of_speech( + const std::vector& delaf_tags, + const unichar* multidelaf_string) const; + unichar* retrieve_semantic_codes( + const std::vector& delaf_tags) const; + unichar* retrieve_inflectional_codes( + const std::vector& delaf_tags) const; + static unichar* build_output_codes(const struct list_ustring* list, + char separator); const VersatileEncodingConfig _vec = VEC_DEFAULT; std::vector> _config_lines; const char* _config_filename; From f4ed6e29c1ba62643bfd4704b100298cd0f51510 Mon Sep 17 00:00:00 2001 From: Sofian El Guetibi Date: Wed, 20 Jul 2022 15:25:13 +0200 Subject: [PATCH 04/24] Resolve some mistakes --- src/Multi2Delaf.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/Multi2Delaf.cpp b/src/Multi2Delaf.cpp index 1ddcf4a6..64ac7789 100644 --- a/src/Multi2Delaf.cpp +++ b/src/Multi2Delaf.cpp @@ -127,12 +127,12 @@ unichar* ConfigCommand::tokenize_lemma(unichar** ptr, // try to read ,,copy if (line[0] == ',' && line[1] == ',') { if (!u_starts_with(line + 1, COMMA_COPY)) { - fatal_error("Double ',' in file: %s, line: `%S`\n", config_filename, + fatal_error("Double ',' in file: %s, line: '%S'\n", config_filename, line); } *ptr = line + 1 + strlen(COMMA_COPY); if (**ptr != '\0' && **ptr != '.' 
&& **ptr != '+' && **ptr != ':') { - fatal_error("Double ',' in file: %s, line: `%S`\n", config_filename, + fatal_error("Double ',' in file: %s, line: '%S'\n", config_filename, line); } return u_strdup(COMMA_COPY); @@ -327,16 +327,16 @@ std::unique_ptr ConfigLine::tokenize_config_line( return nullptr; } // build config command - auto config_commang = ConfigCommand::tokenize_config_command( + auto config_command = ConfigCommand::tokenize_config_command( nextNoEmptyUnichar, config_filename); - if (u_strcmp(config_commang->get_lemma(), ConfigCommand::COMMA_COPY) == 0 && + if (u_strcmp(config_command->get_lemma(), ConfigCommand::COMMA_COPY) == 0 && nb_required_tag != NOT_SPECIFIED && nb_required_tag != 1) { fatal_error( "Command ,,copy is incompatible with an integer enclosed in curly " "braces, except for {1} in file: %s, line: '%S'\n", config_filename, line); } - if (u_strcmp(config_commang->get_part_of_speech(), ConfigCommand::DOT_COPY) == + if (u_strcmp(config_command->get_part_of_speech(), ConfigCommand::DOT_COPY) == 0 && nb_required_tag != NOT_SPECIFIED && nb_required_tag != 1) { fatal_error( @@ -345,7 +345,7 @@ std::unique_ptr ConfigLine::tokenize_config_line( config_filename, line); } return std::make_unique(pattern, nb_required_tag, - std::move(config_commang)); + std::move(config_command)); } ConfigLine::ConfigLine(struct pattern* pattern, int nb_required_tag, @@ -587,7 +587,7 @@ struct dela_entry* Multi2Delaf::tokenize_delaf_tag(unichar** ptr) { return nullptr; // end of the line, no more dela_entry } if (next_no_blank_char[0] != '{') { - fatal_error("Delaf tag must be enclosed in curly braces, line: '%S'\n '", + fatal_error("Delaf tag must be enclosed in curly braces, line: '%S'\n", line); } int i = 1; @@ -595,13 +595,13 @@ struct dela_entry* Multi2Delaf::tokenize_delaf_tag(unichar** ptr) { i++; } if (next_no_blank_char[i] == '\0') { - fatal_error("Delaf tag must be enclosed in curly braces, line: '%S'\n '", + fatal_error("Delaf tag must be enclosed in 
curly braces, line: '%S'\n", line); } - unichar* token_dela_entry = u_strndup(line + 1, i - 1); + unichar* token_dela_entry = u_strndup(next_no_blank_char + 1, i - 1); struct dela_entry* tag = tokenize_DELAF_line(token_dela_entry); free(token_dela_entry); - *ptr = line + i + 1; // + 1 to skip closing brace '}' + *ptr = next_no_blank_char + i + 1; // + 1 to skip closing brace '}' return tag; } From 16cbf04c758dad69d4991ef92f3109d2a78a6371 Mon Sep 17 00:00:00 2001 From: Sofian El Guetibi Date: Fri, 22 Jul 2022 16:00:45 +0200 Subject: [PATCH 05/24] Update of the algorithm that finds inflectional codes --- src/Multi2Delaf.cpp | 131 +++++++++++++++++++++++++++++++++----------- src/Multi2Delaf.h | 7 +++ 2 files changed, 107 insertions(+), 31 deletions(-) diff --git a/src/Multi2Delaf.cpp b/src/Multi2Delaf.cpp index 64ac7789..6e382d8a 100644 --- a/src/Multi2Delaf.cpp +++ b/src/Multi2Delaf.cpp @@ -808,6 +808,80 @@ unichar* Multi2Delaf::retrieve_semantic_codes( return res; } +/** + * Returns a new string containing the first one and the second one without duplicates. + */ +unichar* Multi2Delaf::complete_first_with_second(const unichar* first, + const unichar* second) { + unichar to_add[INPUTSIZEBUFFER] = {0}; + unichar res[INPUTSIZEBUFFER] = {0}; + int j = 0; + for (size_t i = 0; i < u_strlen(second); i++) { + if (u_strchr(first, second[i]) == nullptr) { + to_add[j] = second[i]; + j++; + } + } + to_add[j] = '\0'; + u_sprintf(res, "%S%S", first, to_add); + return u_strdup(res); +} + +/** + * Returns a new list where ::copy has been replaced by the inflectional codes of the tag. 
+ */ +struct list_ustring* Multi2Delaf::clone_and_replace_copy_command( + const struct list_ustring* inflectional_command, + const struct dela_entry* tag) { + struct list_ustring* res = nullptr; + + while (inflectional_command != nullptr) { + if (u_strcmp(inflectional_command->string, ConfigCommand::COLUMN_COPY) != + 0) { + if (!is_in_list(inflectional_command->string, res)) { + res = insert_at_end_of_list(inflectional_command->string, res); + } + } else { + for (int i = 0; i < tag->n_inflectional_codes; i++) { + if (!is_in_list(tag->inflectional_codes[i], res)) { + res = insert_at_end_of_list(tag->inflectional_codes[i], res); + } + } + } + inflectional_command = inflectional_command->next; + } + return res; +} + +/** + * Create a new allocated list containing the Cartesian product of the two lists in parameter + * and substituate ::copy command by codes in the delaf tag. + * Suppose that l2 is not the empty list. + */ +struct list_ustring* Multi2Delaf::product(struct list_ustring* l1, + struct list_ustring* l2) { + unichar* tmp_code = nullptr; + struct list_ustring* res = nullptr; + struct list_ustring* ptr_l1 = l1; + struct list_ustring* ptr_l2 = l2; + if (l1 == nullptr) { + return clone(l2); + } + while (ptr_l1 != nullptr) { + ptr_l2 = l2; + while (ptr_l2 != nullptr) { + tmp_code = complete_first_with_second(ptr_l1->string, ptr_l2->string); + if (!is_in_list(tmp_code, res)) { + res = insert_at_end_of_list(tmp_code, res); + } + free(tmp_code); + ptr_l2 = ptr_l2->next; + } + ptr_l1 = ptr_l1->next; + } + return res; +} + /** * Retrieves inflectional codes according to the specification: * The inflectional codes of the multidelaf string are obtained by taking the union @@ -818,56 +892,51 @@ unichar* Multi2Delaf::retrieve_semantic_codes( unichar* Multi2Delaf::retrieve_inflectional_codes( const std::vector& delaf_tags) const { struct list_ustring* codes = nullptr; + struct list_ustring* tmp_codes = nullptr; struct list_ustring* ptr_command = nullptr; for (const auto& 
tag : delaf_tags) { for (const auto& line : _config_lines) { + tmp_codes = codes; if (line->get_config_command()->get_inflectional_codes() == nullptr) { continue; } if (is_entry_compatible_with_pattern(tag, line->get_pattern())) { ptr_command = line->get_config_command()->get_inflectional_codes(); - while (ptr_command != nullptr) { - if (line->get_nb_required_tag() == ConfigLine::NOT_SPECIFIED || - line->get_nb_required_tag() == - nb_delaf_tag_that_match_pattern(delaf_tags, - line->get_pattern())) { - if (u_strcmp(ptr_command->string, ConfigCommand::COLUMN_COPY) == - 0) { - for (int i = 0; i < tag->n_inflectional_codes; i++) { - if (codes == nullptr) { - codes = new_list_ustring(tag->inflectional_codes[i]); - } else if (!is_in_list(tag->inflectional_codes[i], codes)) { - insert_at_end_of_list(tag->inflectional_codes[i], codes); - } - } + + if (line->get_nb_required_tag() == ConfigLine::NOT_SPECIFIED || + line->get_nb_required_tag() == + nb_delaf_tag_that_match_pattern(delaf_tags, + line->get_pattern())) { + if (is_in_list(ConfigCommand::COLUMN_COPY, ptr_command)) { + struct list_ustring* tmp_lst = + clone_and_replace_copy_command(ptr_command, tag); + if (tmp_lst == nullptr) { + codes = product(nullptr, tmp_codes); } else { - if (line->get_nb_required_tag() != 0) { - if (codes == nullptr) { - codes = new_list_ustring(ptr_command->string); - } else if (!is_in_list(ptr_command->string, codes)) { - insert_at_end_of_list(ptr_command->string, codes); - } - } + codes = product(tmp_codes, tmp_lst); + free_list_ustring(tmp_lst); + } + if (tmp_codes) { + free_list_ustring(tmp_codes); + } + } else { + codes = product(tmp_codes, ptr_command); + if (tmp_codes) { + free_list_ustring(tmp_codes); } } - ptr_command = ptr_command->next; } + } else { if (0 == line->get_nb_required_tag() && 0 == nb_delaf_tag_that_match_pattern(delaf_tags, line->get_pattern())) { struct list_ustring* ptr_command = line->get_config_command()->get_inflectional_codes(); - while (ptr_command != nullptr) { 
- if (codes == nullptr) { - codes = new_list_ustring(ptr_command->string); - } else { - if (!is_in_list(ptr_command->string, codes)) { - insert_at_end_of_list(ptr_command->string, codes); - } - } - ptr_command = ptr_command->next; + codes = product(tmp_codes, ptr_command); + if (tmp_codes) { + free_list_ustring(tmp_codes); } } } diff --git a/src/Multi2Delaf.h b/src/Multi2Delaf.h index 08151583..4a993ea3 100644 --- a/src/Multi2Delaf.h +++ b/src/Multi2Delaf.h @@ -149,6 +149,13 @@ class Multi2Delaf { const unichar* multidelaf_string) const; unichar* retrieve_semantic_codes( const std::vector& delaf_tags) const; + static unichar* complete_first_with_second(const unichar* first, + const unichar* second); + static struct list_ustring* clone_and_replace_copy_command( + const struct list_ustring* inflectional_command, + const struct dela_entry* tag); + static struct list_ustring* product(struct list_ustring* l1, + struct list_ustring* l2); unichar* retrieve_inflectional_codes( const std::vector& delaf_tags) const; static unichar* build_output_codes(const struct list_ustring* list, From 05b2b2fea008b4ffd2fc0580745cca5f95fc4530 Mon Sep 17 00:00:00 2001 From: Sofian El Guetibi Date: Sun, 24 Jul 2022 11:27:38 +0200 Subject: [PATCH 06/24] Improve list_ustring usages --- src/Multi2Delaf.cpp | 44 ++++++++------------------------------------ 1 file changed, 8 insertions(+), 36 deletions(-) diff --git a/src/Multi2Delaf.cpp b/src/Multi2Delaf.cpp index 6e382d8a..88092c21 100644 --- a/src/Multi2Delaf.cpp +++ b/src/Multi2Delaf.cpp @@ -218,11 +218,7 @@ struct list_ustring* ConfigCommand::tokenize_semantic_codes( if (*next_code == '+') { unichar* new_code = tokenize_one_semantic_code(&next_code, config_filename); - if (codes == nullptr) { - codes = new_list_ustring(new_code); - } else if (!is_in_list(new_code, codes)) { - insert_at_end_of_list(new_code, codes); - } + codes = sorted_insert(new_code, codes); free(new_code); } } @@ -271,11 +267,7 @@ struct list_ustring* 
ConfigCommand::tokenize_inflectional_codes( if (*next_code == ':') { unichar* new_code = tokenize_one_inflectional_code(&next_code, config_filename); - if (codes == nullptr) { - codes = new_list_ustring(new_code); - } else if (!is_in_list(new_code, codes)) { - insert_at_end_of_list(new_code, codes); - } + codes = sorted_insert(new_code, codes); free(new_code); } } @@ -763,19 +755,11 @@ unichar* Multi2Delaf::retrieve_semantic_codes( if (u_strcmp(ptr_command->string, ConfigCommand::PLUS_COPY) == 0) { for (int i = 1; i < tag->n_semantic_codes; i++) { // begin at 1 to skip the grammatical catergory - if (codes == nullptr) { - codes = new_list_ustring(tag->semantic_codes[i]); - } else if (!is_in_list(tag->semantic_codes[i], codes)) { - insert_at_end_of_list(tag->semantic_codes[i], codes); - } + codes = sorted_insert(tag->semantic_codes[i], codes); } } else { if (line->get_nb_required_tag() != 0) { - if (codes == nullptr) { - codes = new_list_ustring(ptr_command->string); - } else if (!is_in_list(ptr_command->string, codes)) { - insert_at_end_of_list(ptr_command->string, codes); - } + codes = sorted_insert(ptr_command->string, codes); } } } @@ -788,13 +772,7 @@ unichar* Multi2Delaf::retrieve_semantic_codes( struct list_ustring* ptr_command = line->get_config_command()->get_semantic_codes(); while (ptr_command != nullptr) { - if (codes == nullptr) { - codes = new_list_ustring(ptr_command->string); - } else { - if (!is_in_list(ptr_command->string, codes)) { - insert_at_end_of_list(ptr_command->string, codes); - } - } + codes = sorted_insert(ptr_command->string, codes); ptr_command = ptr_command->next; } } @@ -838,14 +816,10 @@ struct list_ustring* Multi2Delaf::clone_and_replace_copy_command( while (inflectional_command != nullptr) { if (u_strcmp(inflectional_command->string, ConfigCommand::COLUMN_COPY) != 0) { - if (!is_in_list(inflectional_command->string, res)) { - res = insert_at_end_of_list(inflectional_command->string, res); - } + res = 
sorted_insert(inflectional_command->string, res); } else { for (int i = 0; i < tag->n_inflectional_codes; i++) { - if (!is_in_list(tag->inflectional_codes[i], res)) { - res = insert_at_end_of_list(tag->inflectional_codes[i], res); - } + res = sorted_insert(tag->inflectional_codes[i], res); } } inflectional_command = inflectional_command->next; @@ -871,9 +845,7 @@ struct list_ustring* Multi2Delaf::product(struct list_ustring* l1, ptr_l2 = l2; while (ptr_l2 != nullptr) { tmp_code = complete_first_with_second(ptr_l1->string, ptr_l2->string); - if (!is_in_list(tmp_code, res)) { - res = insert_at_end_of_list(tmp_code, res); - } + res = sorted_insert(tmp_code, res); free(tmp_code); ptr_l2 = ptr_l2->next; } From c1a36dc9048f46a36439dd294cd23861db0f7f65 Mon Sep 17 00:00:00 2001 From: Sofian El Guetibi Date: Sun, 31 Jul 2022 14:12:34 +0200 Subject: [PATCH 07/24] Add Multi2Delaf function calls in Fst2List --- src/Fst2List.cpp | 62 ++++++++++++++++++++++++++++++++++++++++++---- src/build/Makefile | 4 +-- 2 files changed, 59 insertions(+), 7 deletions(-) diff --git a/src/Fst2List.cpp b/src/Fst2List.cpp index c8715fa1..98ca4a46 100644 --- a/src/Fst2List.cpp +++ b/src/Fst2List.cpp @@ -39,6 +39,7 @@ #include "MorphologicalLocate.h" #include "Korean.h" #include "Dico.h" +#include "Multi2Delaf.h" #ifndef HAS_UNITEX_NAMESPACE #define HAS_UNITEX_NAMESPACE 1 @@ -84,6 +85,9 @@ const char* usage_Fst2List = "-D , morphological dictionary file to load, must have the extension \".bin\"\r\n" "-E /--elg_extensions_path=: uses ELGs extensions directory X instead of App/elg\n" "-V, --only_verify_arguments: only verify arguments syntax and exit\r\n" + "-M, indicate that the argument is a morphological dictionary-graph (-M requires -P)\r\n" + "-C : use to compile morphological dictionary-graph into DELAF dictionary\r\n" + "(-C requires -M, otherwise produces a fatal_error)\r\n" "-h, --help: display this help and exit"; static void usage() { @@ -295,6 +299,10 @@ class CFstApp { bool isMdg; // 
true if the graph is a morphological dictionary-graph struct hash_table* path_to_stop; /* a hash table to know all the Fst2Tag whose path exploration must be interrupted */ struct hash_table* dela_entries; /* a hash table to get the dela_entries of created boxes when lexical masks are processed */ + bool compileToDelaf = false; + Multi2Delaf *multi2Delaf = nullptr; + bool isMorphological = false; + bool makeDic = false; void fileNameSet(char *ifn, char *ofn) { char tmp[512]; @@ -400,6 +408,9 @@ class CFstApp { free(numOfIgnore); } deleteCallIdMap(); + if (multi2Delaf != nullptr) { + delete multi2Delaf; + } } ; void resetCounters() { @@ -960,9 +971,14 @@ class CFstApp { Hanguls_to_Jamos(INPUTBUFFER, jamos, korean, 1); convert_jamo_to_hangul(jamos, INPUTBUFFER, korean); } - u_fputs(INPUTBUFFER, foutput); + if (!isMorphological || !compileToDelaf) { + u_fputs(INPUTBUFFER, foutput); + } if ((automateMode == TRANMODE) && outBufferCnt) { OUTPUTBUFFER[outBufferCnt] = 0; + if (compileToDelaf) { + multi2Delaf->translate_multidelaf_to_delaf(INPUTBUFFER, OUTPUTBUFFER); + } u_fprintf(foutput, "%S%S", saveSep, OUTPUTBUFFER); } if (display_control == FST2LIST_DEBUG) { @@ -2518,6 +2534,11 @@ int CFstApp::outWordsOfGraph(int depth) { if (Tag->output != NULL) { outputBufferPtr = (u_strcmp(Tag->output, u_epsilon_string)) ? 
Tag->output : u_null_string; if(!u_strcmp(Tag->output, "/")) { // if the output is '/', it's a MDG, this output is not put in the outputfile + if (makeDic && !isMorphological) { + fatal_error( + "Current graph is a morphological dictionary graph (maybe in " + "some paths)\n"); + } isMdg = true; outputBufferPtr = u_null_string; } @@ -2525,6 +2546,11 @@ int CFstApp::outWordsOfGraph(int depth) { outputBufferPtr = u_null_string; } else{ + if (makeDic && !isMdg && isMorphological) { + fatal_error( + "Current graph is not a morphological dictionary graph " + "(maybe in some paths)\n"); + } value = get_value(dela_entries, Tag, HT_DONT_INSERT, &return_value); if(return_value == HT_KEY_ALREADY_THERE && Tag->output[0] == (unichar)'$' && Tag->output[u_strlen(Tag->output) - 1] == (unichar)'$') @@ -2874,7 +2900,7 @@ int CFstApp::outWordsOfGraph(int depth) { // // -const char* optstring_Fst2List=":o:Sp:a:t:l:i:mdf:vVKPhs:q:r:c:g:D:Q:E:"; +const char* optstring_Fst2List=":o:Sp:a:t:l:i:mdf:vVKPhs:q:r:c:g:D:Q:E:C:M"; const struct option_TS lopts_Fst2List[]= { {"output",required_argument_TS,NULL,'o'}, {"ignore_outputs",required_argument_TS,NULL,'a'}, @@ -2900,12 +2926,15 @@ const struct option_TS lopts_Fst2List[]= { {"help",no_argument_TS,NULL,'h'}, {"binary dics",required_argument_TS,NULL,'D'}, {"elg_extensions_path",required_argument_TS,NULL,'E'}, + {"compile_into_delaf", required_argument_TS, NULL, 'C'}, + {"is_morphological_dictionary_graph", no_argument_TS, NULL, 'M'}, {NULL,no_argument_TS,NULL,0} }; int main_Fst2List(int argc, char* const argv[]) { char* ofilename = NULL; char morpho_dic[1025] = ""; + char *config_file_name = NULL; unichar changeStrTo[16][MAX_CHANGE_SYMBOL_SIZE]; int changeStrToIdx; @@ -2924,7 +2953,6 @@ int main_Fst2List(int argc, char* const argv[]) { bool only_verify_arguments = false; UnitexGetOpt options; VersatileEncodingConfig vec = VEC_DEFAULT; - bool makeDic = false; char elg_extensions_path[FILENAME_MAX]=""; @@ -2967,7 +2995,7 @@ int 
main_Fst2List(int argc, char* const argv[]) { strcpy(elg_extensions_path,options.vars()->optarg); break; case 'P': - makeDic = true; + aa.makeDic = true; break; case 'S': ofilename = (char *)malloc((strlen(MAGIC_OUT_STDOUT) + 1) * sizeof(char)); @@ -3217,6 +3245,21 @@ int main_Fst2List(int argc, char* const argv[]) { &(vec.bom_output), arg); break; } + case 'C': { + config_file_name = (char *)malloc( + (strlen((char *)&options.vars()->optarg[0]) + 1) * sizeof(char)); + if (config_file_name == NULL) { + fatal_alloc_error("main_Fst2List"); + } + strcpy(config_file_name, (char *)&options.vars()->optarg[0]); + aa.compileToDelaf = true; + aa.multi2Delaf = new Multi2Delaf(config_file_name); + break; + } + case 'M': { + aa.isMorphological = true; + break; + } case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) : error("Missing argument for option --%s\n",lopts_Fst2List[index].name); return USAGE_ERROR_CODE; @@ -3237,6 +3280,11 @@ int main_Fst2List(int argc, char* const argv[]) { free(ofilename); return SUCCESS_RETURN_CODE; } + if (aa.compileToDelaf && !aa.isMorphological) { + free(ofilename); + free(config_file_name); + fatal_error("Internal error: Bad use of -C option\r\n"); + } strcpy(fst2_filename,argv[options.vars()->optind]); aa.fileNameSet(argv[options.vars()->optind], ofilename); @@ -3312,9 +3360,12 @@ int main_Fst2List(int argc, char* const argv[]) { aa.p->elg->setup_local_environment(); // -------------------------------------------------------------------------- - if(makeDic) { + if(aa.makeDic) { aa.setGrammarMode(fst2_filename); } + if (aa.compileToDelaf) { + aa.multi2Delaf->parse_config_file(); + } aa.getWordsFromGraph(changeStrToIdx, changeStrTo, fst2_filename); // -------------------------------------------------------------------------- @@ -3323,6 +3374,7 @@ int main_Fst2List(int argc, char* const argv[]) { // -------------------------------------------------------------------------- free(ofilename); + 
free(config_file_name); free_stack_unichar(aa.p->literal_output); free_stack_unichar(aa.p->stack_elg); diff --git a/src/build/Makefile b/src/build/Makefile index b6eec6d4..b140a6a9 100644 --- a/src/build/Makefile +++ b/src/build/Makefile @@ -1152,7 +1152,7 @@ FST2LIST_OBJS = Main_Fst2List.o Fst2List.o IOBuffer.o Copyright.o Af_stdio.o Act StringParsing.o Pattern.o List_ustring.o List_int.o BitMasks.o \ Transitions.o DELA.o Symbol.o Symbol_op.o LanguageDefinition.o \ Ustring.o Tagset.o AbstractDelaLoad.o PackInf.o CompressedDic.o LoadInf.o \ - VirtualFiles.o Persistence.o UnitexRevisionInfo.o UnitexGetOpt.o \ + VirtualFiles.o Persistence.o UnitexRevisionInfo.o UnitexGetOpt.o Multi2Delaf.o \ $(ADDITIONAL_OBJECT) $(UNITEX_BASE_OBJECT) $(UNITEX_ELGLIB_OBJECT) $(VIRTOPTIMIZATION_OBJECT) $(SYSLIBMAPPED) $(SYSLIBSYNCTOOL) $(SYSLIBDIRIO) FST2TXT = Fst2Txt @@ -1583,7 +1583,7 @@ UNITEXTOOL_LOGGER_NO_MAIN_OBJS = FilePack.o FilePackCrc32.o FilePackIo.o UniLogg GrfSvn_lib.o DebugMode.o GrfBeauty.o GrfTest.o GrfTest_lib.o SpellCheck.o \ SpellChecking.o Keyboard.o VirtualFiles.o Persistence.o PersistenceInterface.o PersistResource.o \ TfstTag.o PRLG.o KeyWords.o KeyWords_lib.o \ - RegExFacade.o SelectOutput.o UnitexLibIO.o $(TRE_LINK_OBJS) \ + RegExFacade.o SelectOutput.o UnitexLibIO.o Multi2Delaf.o $(TRE_LINK_OBJS) \ $(YAML_LINK_OBJS) $(SYSLIBDIRIO) $(ADDITIONAL_OBJECT) $(UNITEX_BASE_OBJECT) $(UNITEX_ELGLIB_OBJECT) $(VIRTOPTIMIZATION_OBJECT) $(SYSLIBMAPPED) $(SYSLIBSYNCTOOL) $(SYSLIBLOGGER) UNITEXTOOL_LOGGER = UnitexToolLogger From 086f4f5ea194e622827ca0540f058637c6165483 Mon Sep 17 00:00:00 2001 From: Sofian El Guetibi Date: Sun, 31 Jul 2022 16:12:10 +0200 Subject: [PATCH 08/24] Change '= delete' keyword for UNITEX_EQ_DELETE --- src/Multi2Delaf.h | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/Multi2Delaf.h b/src/Multi2Delaf.h index 4a993ea3..f4bbc7a0 100644 --- a/src/Multi2Delaf.h +++ b/src/Multi2Delaf.h @@ -29,6 +29,7 @@ 
#include #include +#include "base/compiler/keyword/eq_delete.h" #include "DELA.h" #include "Pattern.h" #include "Unicode.h" @@ -54,10 +55,10 @@ class ConfigCommand { struct list_ustring* semantic_codes, struct list_ustring* inflectional_codes); // Not copyable or movable - ConfigCommand(const ConfigCommand&) = delete; - ConfigCommand(ConfigCommand&&) = delete; - ConfigCommand& operator=(const ConfigCommand& other) = delete; - ConfigCommand& operator=(ConfigCommand&& other) = delete; + ConfigCommand(const ConfigCommand&) UNITEX_EQ_DELETE; + ConfigCommand(ConfigCommand&&) UNITEX_EQ_DELETE; + ConfigCommand& operator=(const ConfigCommand& other) UNITEX_EQ_DELETE; + ConfigCommand& operator=(ConfigCommand&& other) UNITEX_EQ_DELETE; ~ConfigCommand(); unichar* get_lemma() const; unichar* get_part_of_speech() const; @@ -97,10 +98,10 @@ class ConfigLine { ConfigLine(struct pattern* pattern, int nb_required_tag, std::shared_ptr _config_command); // Not copyable or movable - ConfigLine(const ConfigLine&) = delete; - ConfigLine(ConfigLine&&) = delete; - ConfigLine& operator=(const ConfigLine& other) = delete; - ConfigLine& operator=(ConfigLine&& other) = delete; + ConfigLine(const ConfigLine&) UNITEX_EQ_DELETE; + ConfigLine(ConfigLine&&) UNITEX_EQ_DELETE; + ConfigLine& operator=(const ConfigLine& other) UNITEX_EQ_DELETE; + ConfigLine& operator=(ConfigLine&& other) UNITEX_EQ_DELETE; ~ConfigLine(); static int advance_to_next_no_blank_char(unichar** str); struct pattern* get_pattern() const; @@ -125,10 +126,10 @@ class Multi2Delaf { public: Multi2Delaf(const char* config_filename); // Not copyable or movable - Multi2Delaf(const Multi2Delaf&) = delete; - Multi2Delaf(Multi2Delaf&&) = delete; - Multi2Delaf& operator=(const Multi2Delaf& other) = delete; - Multi2Delaf& operator=(Multi2Delaf&& other) = delete; + Multi2Delaf(const Multi2Delaf&) UNITEX_EQ_DELETE; + Multi2Delaf(Multi2Delaf&&) UNITEX_EQ_DELETE; + Multi2Delaf& operator=(const Multi2Delaf& other) UNITEX_EQ_DELETE; + 
Multi2Delaf& operator=(Multi2Delaf&& other) UNITEX_EQ_DELETE; void parse_config_file(); void translate_multidelaf_to_delaf(const unichar* inflected_input, unichar* buffer) const; From 877e2a973cc3e562b11c3201f77a2e945757588d Mon Sep 17 00:00:00 2001 From: Sofian El Guetibi Date: Sun, 31 Jul 2022 18:42:43 +0200 Subject: [PATCH 09/24] Change ConfigCommand class and ConfigLine class into structures --- src/Multi2Delaf.cpp | 483 +++++++++++++++++++++----------------------- src/Multi2Delaf.h | 92 ++------- 2 files changed, 254 insertions(+), 321 deletions(-) diff --git a/src/Multi2Delaf.cpp b/src/Multi2Delaf.cpp index 88092c21..82ed6d4f 100644 --- a/src/Multi2Delaf.cpp +++ b/src/Multi2Delaf.cpp @@ -21,6 +21,8 @@ #include "Multi2Delaf.h" +#include + #include #include #include @@ -41,79 +43,21 @@ namespace unitex { - -/*================================================================= - * ConfigCommand class method - *================================================================= */ - /** - * Tokenize a config command. - * Assumes that str ends by '\0'. - * Assumes that empty space are removed from the begin, - * using ConfigLine::advances_to_next_no_blank_char. - * Raise a fatal error in case of malformed command. + * Set *str to the next no blank character. + * Assumes that *str ended with '\0'. + * Returns 1 if *str ends with blank char, otherwise returns 0. 
*/ -std::unique_ptr ConfigCommand::tokenize_config_command( - unichar* str, const char* config_filename) { - unichar* lemma = nullptr; - unichar* part_of_speech = nullptr; - struct list_ustring* semantic_codes = nullptr; - struct list_ustring* inflectional_codes = nullptr; - unichar* ptr = str; - if (str[0] == ',') { - lemma = tokenize_lemma(&ptr, config_filename); - } - if (ptr[0] == '.') { - part_of_speech = tokenize_part_of_speech(&ptr, config_filename); - } - if (ptr[0] == '+') { - semantic_codes = tokenize_semantic_codes(&ptr, config_filename); - } - if (ptr[0] == ':') { - inflectional_codes = tokenize_inflectional_codes(ptr, config_filename); - } - return std::make_unique(lemma, part_of_speech, semantic_codes, - inflectional_codes); -} - -ConfigCommand::ConfigCommand(unichar* lemma, unichar* part_of_speech, - struct list_ustring* semantic_codes, - struct list_ustring* inflectional_codes) - : _lemma{lemma}, - _part_of_speech{part_of_speech}, - _semantic_codes{semantic_codes}, - _inflectional_codes{inflectional_codes} { -} - -ConfigCommand::~ConfigCommand() { - if (_lemma) { - free(_lemma); - } - if (_part_of_speech) { - free(_part_of_speech); - } - if (_semantic_codes) { - free_list_ustring(_semantic_codes); +int advance_to_next_no_blank_char(unichar** str) { + int i = 0; + while ((*str)[i] != '\0' && ((*str)[i] == ' ' || (*str)[i] == '\t')) { + i++; } - if (_inflectional_codes) { - free_list_ustring(_inflectional_codes); + if ((*str)[i] == '\0') { + return 1; } -} - -unichar* ConfigCommand::get_lemma() const { - return _lemma; -} - -unichar* ConfigCommand::get_part_of_speech() const { - return _part_of_speech; -} - -struct list_ustring* ConfigCommand::get_semantic_codes() const { - return _semantic_codes; -} - -struct list_ustring* ConfigCommand::get_inflectional_codes() const { - return _inflectional_codes; + *str = *str + i; + return 0; } /** @@ -121,21 +65,20 @@ struct list_ustring* ConfigCommand::get_inflectional_codes() const { * Set *ptr to the next 
unread character. * Assumes that (*ptr)[0] == ','. */ -unichar* ConfigCommand::tokenize_lemma(unichar** ptr, - const char* config_filename) { +unichar* tokenize_lemma(unichar** ptr, const char* config_filename) { unichar* line = *ptr; // try to read ,,copy if (line[0] == ',' && line[1] == ',') { - if (!u_starts_with(line + 1, COMMA_COPY)) { + if (!u_starts_with(line + 1, Multi2Delaf::COMMA_COPY)) { fatal_error("Double ',' in file: %s, line: '%S'\n", config_filename, line); } - *ptr = line + 1 + strlen(COMMA_COPY); + *ptr = line + 1 + strlen(Multi2Delaf::COMMA_COPY); if (**ptr != '\0' && **ptr != '.' && **ptr != '+' && **ptr != ':') { fatal_error("Double ',' in file: %s, line: '%S'\n", config_filename, line); } - return u_strdup(COMMA_COPY); + return u_strdup(Multi2Delaf::COMMA_COPY); } int i = 1; while (line[i] != '\0' && line[i] != '.' && line[i] != '+' && @@ -151,21 +94,20 @@ unichar* ConfigCommand::tokenize_lemma(unichar** ptr, * Set *ptr to the next unread character. * Assumes that (*ptr)[0] == '.'. */ -unichar* ConfigCommand::tokenize_part_of_speech(unichar** ptr, - const char* config_filename) { +unichar* tokenize_part_of_speech(unichar** ptr, const char* config_filename) { unichar* line = *ptr; // try to read ..copy if (line[0] == '.' && line[1] == '.') { - if (!u_starts_with(line + 1, DOT_COPY)) { + if (!u_starts_with(line + 1, Multi2Delaf::DOT_COPY)) { fatal_error("Double '.' in file: %s, line: '%S'\n", config_filename, line); } - *ptr = line + 1 + strlen(DOT_COPY); + *ptr = line + 1 + strlen(Multi2Delaf::DOT_COPY); if (**ptr != '\0' && **ptr != '+' && **ptr != ':') { fatal_error("Double '.' in file: %s, line: '%S'\n", config_filename, line); } - return u_strdup(DOT_COPY); + return u_strdup(Multi2Delaf::DOT_COPY); } int i = 1; while (line[i] != '\0' && line[i] != '+' && line[i] != ':') { @@ -180,21 +122,21 @@ unichar* ConfigCommand::tokenize_part_of_speech(unichar** ptr, * Set *ptr to the next unread character. * Assumes that (*ptr)[0] == '+'. 
*/ -unichar* ConfigCommand::tokenize_one_semantic_code( - unichar** ptr, const char* config_filename) { +unichar* tokenize_one_semantic_code(unichar** ptr, + const char* config_filename) { unichar* line = *ptr; // try to read ++copy if (line[0] == '+' && line[1] == '+') { - if (!u_starts_with(line + 1, PLUS_COPY)) { + if (!u_starts_with(line + 1, Multi2Delaf::PLUS_COPY)) { fatal_error("Double '+' in file: %s, line: '%S'\n", config_filename, line); } - *ptr = line + 1 + strlen(PLUS_COPY); + *ptr = line + 1 + strlen(Multi2Delaf::PLUS_COPY); if (**ptr != '\0' && **ptr != '+' && **ptr != ':') { fatal_error("Double '+' in file: %s, line: '%S'\n", config_filename, line); } - return u_strdup(PLUS_COPY); + return u_strdup(Multi2Delaf::PLUS_COPY); } int i = 1; while (line[i] != '\0' && line[i] != '+' && line[i] != ':') { @@ -209,8 +151,8 @@ unichar* ConfigCommand::tokenize_one_semantic_code( * Set *ptr to the next unread character. * Assumes that (*ptr)[0] == '+'. */ -struct list_ustring* ConfigCommand::tokenize_semantic_codes( - unichar** ptr, const char* config_filename) { +struct list_ustring* tokenize_semantic_codes(unichar** ptr, + const char* config_filename) { unichar* line = *ptr; unichar* next_code = line; struct list_ustring* codes = nullptr; @@ -231,21 +173,21 @@ struct list_ustring* ConfigCommand::tokenize_semantic_codes( * Set *ptr to the next unread character. * Assumes that (*ptr)[0] == ':'. 
*/ -unichar* ConfigCommand::tokenize_one_inflectional_code( - unichar** ptr, const char* config_filename) { +unichar* tokenize_one_inflectional_code(unichar** ptr, + const char* config_filename) { unichar* line = *ptr; // try to read ::copy if (line[0] == ':' && line[1] == ':') { - if (!u_starts_with(line + 1, COLUMN_COPY)) { + if (!u_starts_with(line + 1, Multi2Delaf::COLUMN_COPY)) { fatal_error("Double ':' in file: %s, line: '%S'\n", config_filename, line); } - *ptr = line + 1 + strlen(COLUMN_COPY); + *ptr = line + 1 + strlen(Multi2Delaf::COLUMN_COPY); if (**ptr != '\0' && **ptr != ':') { fatal_error("Double ':' in file: %s, line: '%S'\n", config_filename, line); } - return u_strdup(COLUMN_COPY); + return u_strdup(Multi2Delaf::COLUMN_COPY); } int i = 1; while (line[i] != '\0' && line[i] != ':') { @@ -259,8 +201,8 @@ unichar* ConfigCommand::tokenize_one_inflectional_code( * Return a new allocated list of inflectional codes. * Assumes that str[0] == ':'. */ -struct list_ustring* ConfigCommand::tokenize_inflectional_codes( - unichar* str, const char* config_filename) { +struct list_ustring* tokenize_inflectional_codes(unichar* str, + const char* config_filename) { struct list_ustring* codes = nullptr; unichar* next_code = str; while (*next_code != '\0') { @@ -274,119 +216,81 @@ struct list_ustring* ConfigCommand::tokenize_inflectional_codes( return codes; } -/*================================================================= - * ConfigLine class method - *================================================================= */ +struct ConfigCommand* new_config_command( + unichar* lemma, unichar* part_of_speech, + struct list_ustring* semantic_codes, + struct list_ustring* inflectional_codes) { + struct ConfigCommand* command = NULL; + if (NULL == + (command = (struct ConfigCommand*)malloc(sizeof(struct ConfigCommand)))) { + fatal_alloc_error("new_config_command"); + } + command->lemma = lemma; + command->part_of_speech = part_of_speech; + command->semantic_codes = 
semantic_codes; + command->inflectional_codes = inflectional_codes; + return command; +} /** - * Tokenize a config line. - * Returns a std::unique_ptr if there is a well-formed line. - * otherwise returns nullptr. - * Raises a fatal error in case of malformed line. + * Tokenize a config command. + * Assumes that str ends by '\0'. + * Assumes that empty space are removed from the begin, + * using advance_to_next_no_blank_char. + * Raise a fatal error in case of malformed command. */ -std::unique_ptr ConfigLine::tokenize_config_line( - unichar* line, const char* config_filename) { - unichar* nextNoEmptyUnichar = line; - unichar patternToken[INPUTSIZEBUFFER] = {0}; - if (advance_to_next_no_blank_char(&nextNoEmptyUnichar)) { - return nullptr; // skip empty line +struct ConfigCommand* tokenize_config_command(unichar* str, + const char* config_filename) { + unichar* lemma = nullptr; + unichar* part_of_speech = nullptr; + struct list_ustring* semantic_codes = nullptr; + struct list_ustring* inflectional_codes = nullptr; + unichar* ptr = str; + if (str[0] == ',') { + lemma = tokenize_lemma(&ptr, config_filename); } - if (nextNoEmptyUnichar[0] == '#') { - return nullptr; // skip comment line + if (ptr[0] == '.') { + part_of_speech = tokenize_part_of_speech(&ptr, config_filename); } - if (recognize_pattern_token(&nextNoEmptyUnichar, patternToken)) { - fatal_error( - "Lexical mask must be enclosed in < >, like in file: %s, " - "line: '%S'\n", - config_filename, line); + if (ptr[0] == '+') { + semantic_codes = tokenize_semantic_codes(&ptr, config_filename); } - // build pattern - struct pattern* pattern = build_pattern(patternToken, nullptr, 0, nullptr); - if (advance_to_next_no_blank_char(&nextNoEmptyUnichar)) { - // if there is no command, we skip the current line like comment line - return nullptr; + if (ptr[0] == ':') { + inflectional_codes = tokenize_inflectional_codes(ptr, config_filename); } - // build nb_required_tag - int nb_required_tag = NOT_SPECIFIED; - if 
(nextNoEmptyUnichar[0] == '{') { - if (tokenize_nb_required_tag(&nextNoEmptyUnichar, &nb_required_tag)) { - fatal_error("Braces must contain number in file: %s, line: '%S'\n", - config_filename, line); - } + return new_config_command(lemma, part_of_speech, semantic_codes, + inflectional_codes); +} + +void free_config_command(struct ConfigCommand* command) { + if (command->lemma) { + free(command->lemma); } - if (advance_to_next_no_blank_char(&nextNoEmptyUnichar)) { - // if there is no command, we skip the current line like comment line - return nullptr; + if (command->part_of_speech) { + free(command->part_of_speech); } - // build config command - auto config_command = ConfigCommand::tokenize_config_command( - nextNoEmptyUnichar, config_filename); - if (u_strcmp(config_command->get_lemma(), ConfigCommand::COMMA_COPY) == 0 && - nb_required_tag != NOT_SPECIFIED && nb_required_tag != 1) { - fatal_error( - "Command ,,copy is incompatible with an integer enclosed in curly " - "braces, except for {1} in file: %s, line: '%S'\n", - config_filename, line); + if (command->semantic_codes) { + free_list_ustring(command->semantic_codes); } - if (u_strcmp(config_command->get_part_of_speech(), ConfigCommand::DOT_COPY) == - 0 && - nb_required_tag != NOT_SPECIFIED && nb_required_tag != 1) { - fatal_error( - "Command ..copy is incompatible with an integer enclosed in curly " - "braces, except for {1}in file:%s, line: '%S'\n", - config_filename, line); + if (command->inflectional_codes) { + free_list_ustring(command->inflectional_codes); } - return std::make_unique(pattern, nb_required_tag, - std::move(config_command)); + free(command); } -ConfigLine::ConfigLine(struct pattern* pattern, int nb_required_tag, - std::shared_ptr config_command) - : _pattern{pattern}, - _nb_required_tag{nb_required_tag}, - _config_command{std::move(config_command)} { -} -ConfigLine::~ConfigLine() { - free_pattern(_pattern); -} - -struct pattern* ConfigLine::get_pattern() const { - return _pattern; -} - 
-int ConfigLine::get_nb_required_tag() const { - return _nb_required_tag; -} - -std::shared_ptr ConfigLine::get_config_command() const { - return _config_command; -} -/** - * Set *str to the next no blank character. - * Assumes that *str ended with '\0'. - * Returns 1 if *str ends with blank char, otherwise returns 0. - */ -int ConfigLine::advance_to_next_no_blank_char(unichar** str) { - int i = 0; - while ((*str)[i] != '\0' && ((*str)[i] == ' ' || (*str)[i] == '\t')) { - i++; - } - if ((*str)[i] == '\0') { - return 1; - } - *str = *str + i; - return 0; -} +/*================================================================= + * ConfigLine class method + *================================================================= */ /** * Return 1 if error occurs, otherwise 0. * Assumes that pattern is suround by < >. * Assumes that empty space are removed from the begin, - * using ConfigLine::advances_to_next_no_blank_char. + * using advance_to_next_no_blank_char. */ -int ConfigLine::recognize_pattern_token(unichar** ptr, unichar* res) { +int recognize_pattern_token(unichar** ptr, unichar* res) { unichar* line = *ptr; int index_line = 0; int index_res = 0; @@ -408,12 +312,13 @@ int ConfigLine::recognize_pattern_token(unichar** ptr, unichar* res) { return 1; } + /** * Return 1 if error occurs, otherwise 0. * Assumes that empty space are removed from the begin, - * using ConfigLine::advances_to_next_no_blank_char. + * using advance_to_next_no_blank_char. 
*/ -int ConfigLine::tokenize_nb_required_tag(unichar** ptr, int* res) { +int tokenize_nb_required_tag(unichar** ptr, int* res) { unichar* line = *ptr; int index = 0; int index_buffer = 0; @@ -444,6 +349,85 @@ int ConfigLine::tokenize_nb_required_tag(unichar** ptr, int* res) { return 1; } +struct ConfigLine* new_config_line(struct pattern* pattern, int nb_required_tag, + struct ConfigCommand* config_command) { + struct ConfigLine* line = NULL; + if (NULL == (line = (struct ConfigLine*)malloc(sizeof(struct ConfigLine)))) { + fatal_alloc_error("new_config_line"); + } + line->pattern = pattern; + line->nb_required_tag = nb_required_tag; + line->config_command = config_command; + return line; +} + +/** + * Tokenize a config line. + * Returns a std::unique_ptr if there is a well-formed line. + * otherwise returns nullptr. + * Raises a fatal error in case of malformed line. + */ +struct ConfigLine* tokenize_config_line(unichar* line, + const char* config_filename) { + unichar* nextNoEmptyUnichar = line; + unichar patternToken[INPUTSIZEBUFFER] = {0}; + if (advance_to_next_no_blank_char(&nextNoEmptyUnichar)) { + return nullptr; // skip empty line + } + if (nextNoEmptyUnichar[0] == '#') { + return nullptr; // skip comment line + } + if (recognize_pattern_token(&nextNoEmptyUnichar, patternToken)) { + fatal_error( + "Lexical mask must be enclosed in < >, like in file: %s, " + "line: '%S'\n", + config_filename, line); + } + // build pattern + struct pattern* pattern = build_pattern(patternToken, nullptr, 0, nullptr); + if (advance_to_next_no_blank_char(&nextNoEmptyUnichar)) { + // if there is no command, we skip the current line like comment line + return nullptr; + } + // build nb_required_tag + int nb_required_tag = Multi2Delaf::NOT_SPECIFIED; + if (nextNoEmptyUnichar[0] == '{') { + if (tokenize_nb_required_tag(&nextNoEmptyUnichar, &nb_required_tag)) { + fatal_error("Braces must contain number in file: %s, line: '%S'\n", + config_filename, line); + } + } + if 
(advance_to_next_no_blank_char(&nextNoEmptyUnichar)) { + // if there is no command, we skip the current line like comment line + return nullptr; + } + // build config command + struct ConfigCommand* config_command = + tokenize_config_command(nextNoEmptyUnichar, config_filename); + if (u_strcmp(config_command->lemma, Multi2Delaf::COMMA_COPY) == 0 && + nb_required_tag != Multi2Delaf::NOT_SPECIFIED && nb_required_tag != 1) { + fatal_error( + "Command ,,copy is incompatible with an integer enclosed in curly " + "braces, except for {1} in file: %s, line: '%S'\n", + config_filename, line); + } + if (u_strcmp(config_command->part_of_speech, Multi2Delaf::DOT_COPY) == 0 && + nb_required_tag != Multi2Delaf::NOT_SPECIFIED && nb_required_tag != 1) { + fatal_error( + "Command ..copy is incompatible with an integer enclosed in curly " + "braces, except for {1}in file:%s, line: '%S'\n", + config_filename, line); + } + return new_config_line(pattern, nb_required_tag, config_command); +} + +void free_config_line(struct ConfigLine* line) { + free_pattern(line->pattern); + free_config_command(line->config_command); + free(line); /* released last: the members of line are read by the two calls above */ +} + + /*================================================================= * Multi2Delaf class method @@ -453,6 +437,12 @@ Multi2Delaf::Multi2Delaf(const char* config_filename) : _config_filename{config_filename} { } +Multi2Delaf::~Multi2Delaf() { + for (auto& lines : _config_lines) { + free_config_line(lines); + } +} + /** * Read the configuration file that specifies how to transcode the * multidelaf string into a delaf tag. 
@@ -552,15 +542,15 @@ void Multi2Delaf::load_config_file(U_FILE* config_file) { int eof = 0; while (EOF != (eof = read_line_config_file(config_file, line, INPUTSIZEBUFFER))) { - auto config_line = ConfigLine::tokenize_config_line( - line, filename_without_path(_config_filename)); + auto config_line = + tokenize_config_line(line, filename_without_path(_config_filename)); if (config_line != nullptr) { _config_lines.emplace_back(std::move(config_line)); } } // the last line is potentially a config line - auto config_line = ConfigLine::tokenize_config_line( - line, filename_without_path(_config_filename)); + auto config_line = + tokenize_config_line(line, filename_without_path(_config_filename)); if (config_line != nullptr) { _config_lines.emplace_back(std::move(config_line)); } @@ -575,7 +565,7 @@ void Multi2Delaf::load_config_file(U_FILE* config_file) { struct dela_entry* Multi2Delaf::tokenize_delaf_tag(unichar** ptr) { unichar* line = *ptr; unichar* next_no_blank_char = line; - if (ConfigLine::advance_to_next_no_blank_char(&next_no_blank_char)) { + if (advance_to_next_no_blank_char(&next_no_blank_char)) { return nullptr; // end of the line, no more dela_entry } if (next_no_blank_char[0] != '{') { @@ -644,15 +634,15 @@ unichar* Multi2Delaf::retrieve_lemma( const std::vector& delaf_tags, const unichar* multidelaf_string) const { for (const auto& current_line : _config_lines) { - if (current_line->get_config_command()->get_lemma() == nullptr) { + if (current_line->config_command->lemma == nullptr) { continue; } for (const auto& tag : delaf_tags) { - if (is_entry_compatible_with_pattern(tag, current_line->get_pattern())) { - if (u_strcmp(current_line->get_config_command()->get_lemma(), - ConfigCommand::COMMA_COPY) == 0) { - if (nb_delaf_tag_that_match_pattern( - delaf_tags, current_line->get_pattern()) != 1) { + if (is_entry_compatible_with_pattern(tag, current_line->pattern)) { + if (u_strcmp(current_line->config_command->lemma, + Multi2Delaf::COMMA_COPY) == 0) { + if 
(nb_delaf_tag_that_match_pattern(delaf_tags, + current_line->pattern) != 1) { fatal_error( "Command ,,copy can be interpreted for several delaf line: " "%S\n", @@ -660,19 +650,19 @@ unichar* Multi2Delaf::retrieve_lemma( } return u_strdup(tag->lemma); } - if (current_line->get_nb_required_tag() == 1 || - current_line->get_nb_required_tag() == ConfigLine::NOT_SPECIFIED) { - return u_strdup(current_line->get_config_command()->get_lemma()); + if (current_line->nb_required_tag == 1 || + current_line->nb_required_tag == Multi2Delaf::NOT_SPECIFIED) { + return u_strdup(current_line->config_command->lemma); } - if (current_line->get_nb_required_tag() == + if (current_line->nb_required_tag == nb_delaf_tag_that_match_pattern(delaf_tags, - current_line->get_pattern())) { - return u_strdup(current_line->get_config_command()->get_lemma()); + current_line->pattern)) { + return u_strdup(current_line->config_command->lemma); } } } - if (current_line->get_nb_required_tag() == 0) { - return u_strdup(current_line->get_config_command()->get_lemma()); + if (current_line->nb_required_tag == 0) { + return u_strdup(current_line->config_command->lemma); } } fatal_error("No lemma is provided for this multidelaf string: %S\n", @@ -690,36 +680,34 @@ unichar* Multi2Delaf::retrieve_part_of_speech( const std::vector& delaf_tags, const unichar* multidelaf_string) const { for (const auto& current_line : _config_lines) { - if (current_line->get_config_command()->get_part_of_speech() == nullptr) { + if (current_line->config_command->part_of_speech == nullptr) { continue; } for (const auto& tag : delaf_tags) { - if (is_entry_compatible_with_pattern(tag, current_line->get_pattern())) { - if (u_strcmp(current_line->get_config_command()->get_part_of_speech(), - ConfigCommand::DOT_COPY) == 0) { - if (nb_delaf_tag_that_match_pattern( - delaf_tags, current_line->get_pattern()) != 1) { + if (is_entry_compatible_with_pattern(tag, current_line->pattern)) { + if 
(u_strcmp(current_line->config_command->part_of_speech, + Multi2Delaf::DOT_COPY) == 0) { + if (nb_delaf_tag_that_match_pattern(delaf_tags, + current_line->pattern) != 1) { fatal_error( "Command ..copy can be interpreted for several delaf tag: %S\n", multidelaf_string); } return u_strdup(tag->semantic_codes[0]); } - if (current_line->get_nb_required_tag() == 1 || - current_line->get_nb_required_tag() == ConfigLine::NOT_SPECIFIED) { - return u_strdup( - current_line->get_config_command()->get_part_of_speech()); + if (current_line->nb_required_tag == 1 || + current_line->nb_required_tag == Multi2Delaf::NOT_SPECIFIED) { + return u_strdup(current_line->config_command->part_of_speech); } - if (current_line->get_nb_required_tag() == + if (current_line->nb_required_tag == nb_delaf_tag_that_match_pattern(delaf_tags, - current_line->get_pattern())) { - return u_strdup( - current_line->get_config_command()->get_part_of_speech()); + current_line->pattern)) { + return u_strdup(current_line->config_command->part_of_speech); } } } - if (current_line->get_nb_required_tag() == 0) { - return u_strdup(current_line->get_config_command()->get_part_of_speech()); + if (current_line->nb_required_tag == 0) { + return u_strdup(current_line->config_command->part_of_speech); } } fatal_error( @@ -742,23 +730,22 @@ unichar* Multi2Delaf::retrieve_semantic_codes( for (const auto& tag : delaf_tags) { for (const auto& line : _config_lines) { - if (line->get_config_command()->get_semantic_codes() == nullptr) { + if (line->config_command->semantic_codes == nullptr) { continue; } - if (is_entry_compatible_with_pattern(tag, line->get_pattern())) { - ptr_command = line->get_config_command()->get_semantic_codes(); + if (is_entry_compatible_with_pattern(tag, line->pattern)) { + ptr_command = line->config_command->semantic_codes; while (ptr_command != nullptr) { - if (line->get_nb_required_tag() == ConfigLine::NOT_SPECIFIED || - line->get_nb_required_tag() == - nb_delaf_tag_that_match_pattern(delaf_tags, - 
line->get_pattern())) { - if (u_strcmp(ptr_command->string, ConfigCommand::PLUS_COPY) == 0) { + if (line->nb_required_tag == Multi2Delaf::NOT_SPECIFIED || + line->nb_required_tag == + nb_delaf_tag_that_match_pattern(delaf_tags, line->pattern)) { + if (u_strcmp(ptr_command->string, Multi2Delaf::PLUS_COPY) == 0) { for (int i = 1; i < tag->n_semantic_codes; i++) { // begin at 1 to skip the grammatical catergory codes = sorted_insert(tag->semantic_codes[i], codes); } } else { - if (line->get_nb_required_tag() != 0) { + if (line->nb_required_tag != 0) { codes = sorted_insert(ptr_command->string, codes); } } @@ -766,11 +753,10 @@ unichar* Multi2Delaf::retrieve_semantic_codes( ptr_command = ptr_command->next; } } else { - if (0 == line->get_nb_required_tag() && - 0 == nb_delaf_tag_that_match_pattern(delaf_tags, - line->get_pattern())) { + if (0 == line->nb_required_tag && + 0 == nb_delaf_tag_that_match_pattern(delaf_tags, line->pattern)) { struct list_ustring* ptr_command = - line->get_config_command()->get_semantic_codes(); + line->config_command->semantic_codes; while (ptr_command != nullptr) { codes = sorted_insert(ptr_command->string, codes); ptr_command = ptr_command->next; @@ -814,8 +800,7 @@ struct list_ustring* Multi2Delaf::clone_and_replace_copy_command( struct list_ustring* res = nullptr; while (inflectional_command != nullptr) { - if (u_strcmp(inflectional_command->string, ConfigCommand::COLUMN_COPY) != - 0) { + if (u_strcmp(inflectional_command->string, Multi2Delaf::COLUMN_COPY) != 0) { res = sorted_insert(inflectional_command->string, res); } else { for (int i = 0; i < tag->n_inflectional_codes; i++) { @@ -870,17 +855,16 @@ unichar* Multi2Delaf::retrieve_inflectional_codes( for (const auto& tag : delaf_tags) { for (const auto& line : _config_lines) { tmp_codes = codes; - if (line->get_config_command()->get_inflectional_codes() == nullptr) { + if (line->config_command->inflectional_codes == nullptr) { continue; } - if (is_entry_compatible_with_pattern(tag, 
line->get_pattern())) { - ptr_command = line->get_config_command()->get_inflectional_codes(); - - if (line->get_nb_required_tag() == ConfigLine::NOT_SPECIFIED || - line->get_nb_required_tag() == - nb_delaf_tag_that_match_pattern(delaf_tags, - line->get_pattern())) { - if (is_in_list(ConfigCommand::COLUMN_COPY, ptr_command)) { + if (is_entry_compatible_with_pattern(tag, line->pattern)) { + ptr_command = line->config_command->inflectional_codes; + + if (line->nb_required_tag == Multi2Delaf::NOT_SPECIFIED || + line->nb_required_tag == + nb_delaf_tag_that_match_pattern(delaf_tags, line->pattern)) { + if (is_in_list(Multi2Delaf::COLUMN_COPY, ptr_command)) { struct list_ustring* tmp_lst = clone_and_replace_copy_command(ptr_command, tag); if (tmp_lst == nullptr) { @@ -901,11 +885,10 @@ unichar* Multi2Delaf::retrieve_inflectional_codes( } } else { - if (0 == line->get_nb_required_tag() && - 0 == nb_delaf_tag_that_match_pattern(delaf_tags, - line->get_pattern())) { + if (0 == line->nb_required_tag && + 0 == nb_delaf_tag_that_match_pattern(delaf_tags, line->pattern)) { struct list_ustring* ptr_command = - line->get_config_command()->get_inflectional_codes(); + line->config_command->inflectional_codes; codes = product(tmp_codes, ptr_command); if (tmp_codes) { free_list_ustring(tmp_codes); diff --git a/src/Multi2Delaf.h b/src/Multi2Delaf.h index f4bbc7a0..b4004a6a 100644 --- a/src/Multi2Delaf.h +++ b/src/Multi2Delaf.h @@ -29,10 +29,10 @@ #include #include -#include "base/compiler/keyword/eq_delete.h" #include "DELA.h" #include "Pattern.h" #include "Unicode.h" +#include "base/compiler/keyword/eq_delete.h" namespace unitex { @@ -41,46 +41,11 @@ namespace unitex { * A multidelaf string is an output of Explore graph paths made of a sequence of * Delaf tags. 
*/ -class ConfigCommand { - public: - static constexpr const char* const COMMA_COPY = ",copy"; - static constexpr const char* const DOT_COPY = ".copy"; - static constexpr const char* const PLUS_COPY = "+copy"; - static constexpr const char* const COLUMN_COPY = ":copy"; - - static std::unique_ptr tokenize_config_command( - unichar* str, const char* config_filename); - - ConfigCommand(unichar* lemma, unichar* part_of_speech, - struct list_ustring* semantic_codes, - struct list_ustring* inflectional_codes); - // Not copyable or movable - ConfigCommand(const ConfigCommand&) UNITEX_EQ_DELETE; - ConfigCommand(ConfigCommand&&) UNITEX_EQ_DELETE; - ConfigCommand& operator=(const ConfigCommand& other) UNITEX_EQ_DELETE; - ConfigCommand& operator=(ConfigCommand&& other) UNITEX_EQ_DELETE; - ~ConfigCommand(); - unichar* get_lemma() const; - unichar* get_part_of_speech() const; - struct list_ustring* get_semantic_codes() const; - struct list_ustring* get_inflectional_codes() const; - - private: - static unichar* tokenize_lemma(unichar** ptr, const char* config_filename); - static unichar* tokenize_part_of_speech(unichar** ptr, - const char* config_filename); - static unichar* tokenize_one_semantic_code(unichar** ptr, - const char* config_filename); - static struct list_ustring* tokenize_semantic_codes( - unichar** ptr, const char* config_filename); - static unichar* tokenize_one_inflectional_code(unichar** ptr, - const char* config_filename); - static struct list_ustring* tokenize_inflectional_codes( - unichar* str, const char* config_filename); - unichar* _lemma; - unichar* _part_of_speech; - struct list_ustring* _semantic_codes; - struct list_ustring* _inflectional_codes; +struct ConfigCommand { + unichar* lemma; + unichar* part_of_speech; + struct list_ustring* semantic_codes; + struct list_ustring* inflectional_codes; }; /** @@ -88,35 +53,13 @@ class ConfigCommand { * A multidelaf string is an output of Explore graph paths made of a sequence of * Delaf tags. 
*/ -class ConfigLine { - public: - static constexpr int NOT_SPECIFIED = -1; // nb_required_tag default value - - static std::unique_ptr tokenize_config_line( - unichar* line, const char* config_filename); - - ConfigLine(struct pattern* pattern, int nb_required_tag, - std::shared_ptr _config_command); - // Not copyable or movable - ConfigLine(const ConfigLine&) UNITEX_EQ_DELETE; - ConfigLine(ConfigLine&&) UNITEX_EQ_DELETE; - ConfigLine& operator=(const ConfigLine& other) UNITEX_EQ_DELETE; - ConfigLine& operator=(ConfigLine&& other) UNITEX_EQ_DELETE; - ~ConfigLine(); - static int advance_to_next_no_blank_char(unichar** str); - struct pattern* get_pattern() const; - int get_nb_required_tag() const; - std::shared_ptr get_config_command() const; - - private: - static int recognize_pattern_token(unichar** ptr, unichar* res); - static int tokenize_nb_required_tag(unichar** ptr, int* res); - struct pattern* _pattern; // lexical mask +struct ConfigLine { + struct pattern* pattern; // lexical mask // if _nb_required_tag equals to NOT_SPECIFIED, the number of delaf tags that must match the pattern is not specified // else if _nb_required_tag equals to 0, the pattern must not match any delaf tag // otherwise the numer of delaf tag that must match the pattern - const int _nb_required_tag; - const std::shared_ptr _config_command; + int nb_required_tag; + struct ConfigCommand* config_command; }; /** @@ -124,15 +67,22 @@ class ConfigLine { */ class Multi2Delaf { public: + static constexpr const char* const COMMA_COPY = ",copy"; + static constexpr const char* const DOT_COPY = ".copy"; + static constexpr const char* const PLUS_COPY = "+copy"; + static constexpr const char* const COLUMN_COPY = ":copy"; + static constexpr int NOT_SPECIFIED = -1; // nb_required_tag default value + Multi2Delaf(const char* config_filename); // Not copyable or movable - Multi2Delaf(const Multi2Delaf&) UNITEX_EQ_DELETE; - Multi2Delaf(Multi2Delaf&&) UNITEX_EQ_DELETE; + Multi2Delaf(const Multi2Delaf&) 
UNITEX_EQ_DELETE; + Multi2Delaf(Multi2Delaf&&) UNITEX_EQ_DELETE; Multi2Delaf& operator=(const Multi2Delaf& other) UNITEX_EQ_DELETE; - Multi2Delaf& operator=(Multi2Delaf&& other) UNITEX_EQ_DELETE; + Multi2Delaf& operator=(Multi2Delaf&& other) UNITEX_EQ_DELETE; void parse_config_file(); void translate_multidelaf_to_delaf(const unichar* inflected_input, unichar* buffer) const; + ~Multi2Delaf(); private: static int read_line_config_file(U_FILE* config_file, unichar* buffer, @@ -162,7 +112,7 @@ class Multi2Delaf { static unichar* build_output_codes(const struct list_ustring* list, char separator); const VersatileEncodingConfig _vec = VEC_DEFAULT; - std::vector> _config_lines; + std::vector _config_lines; const char* _config_filename; }; From 0ab7f27bb9ebb0f166f9ad945ccb034845343797 Mon Sep 17 00:00:00 2001 From: Sofian El Guetibi Date: Sun, 31 Jul 2022 19:01:53 +0200 Subject: [PATCH 10/24] Delete the use of standard libraries (memory, utility, algorithm) --- src/Multi2Delaf.cpp | 25 +++++++++++++------------ src/Multi2Delaf.h | 1 - 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/Multi2Delaf.cpp b/src/Multi2Delaf.cpp index 82ed6d4f..e454f954 100644 --- a/src/Multi2Delaf.cpp +++ b/src/Multi2Delaf.cpp @@ -23,9 +23,6 @@ #include -#include -#include -#include #include #include "DELA.h" @@ -363,7 +360,7 @@ struct ConfigLine* new_config_line(struct pattern* pattern, int nb_required_tag, /** * Tokenize a config line. - * Returns a std::unique_ptr if there is a well-formed line. + * Returns a struct ConfigLine* if there is a well-formed line. * otherwise returns nullptr. * Raises a fatal error in case of malformed line. 
*/ @@ -542,17 +539,17 @@ void Multi2Delaf::load_config_file(U_FILE* config_file) { int eof = 0; while (EOF != (eof = read_line_config_file(config_file, line, INPUTSIZEBUFFER))) { - auto config_line = + struct ConfigLine* config_line = tokenize_config_line(line, filename_without_path(_config_filename)); if (config_line != nullptr) { - _config_lines.emplace_back(std::move(config_line)); + _config_lines.emplace_back(config_line); } } // the last line is potentially a config line auto config_line = tokenize_config_line(line, filename_without_path(_config_filename)); if (config_line != nullptr) { - _config_lines.emplace_back(std::move(config_line)); + _config_lines.emplace_back(config_line); } } @@ -593,10 +590,13 @@ struct dela_entry* Multi2Delaf::tokenize_delaf_tag(unichar** ptr) { int Multi2Delaf::nb_delaf_tag_that_match_pattern( const std::vector& delaf_tags, const struct pattern* pattern) { - return std::count_if(delaf_tags.begin(), delaf_tags.end(), - [&pattern](const auto& tag) { - return is_entry_compatible_with_pattern(tag, pattern); - }); + int res = 0; + for (auto& tag : delaf_tags) { + if (is_entry_compatible_with_pattern(tag, pattern)) { + res++; + } + } + return res; } /** @@ -620,7 +620,8 @@ unichar* Multi2Delaf::escape_inflected_input(const unichar* input) { i_buffer++; i_input++; } - buffer[std::min(i_buffer, INPUTSIZEBUFFER - 1)] = '\0'; + buffer[i_buffer < INPUTSIZEBUFFER - 1 ? 
i_buffer : INPUTSIZEBUFFER - 1] = + '\0'; return u_strdup(buffer); } diff --git a/src/Multi2Delaf.h b/src/Multi2Delaf.h index b4004a6a..300e288f 100644 --- a/src/Multi2Delaf.h +++ b/src/Multi2Delaf.h @@ -26,7 +26,6 @@ #define HAS_UNITEX_NAMESPACE 1 #endif -#include #include #include "DELA.h" From 05517db5e7850c159ef52fe3c82b9cd5b1149fe7 Mon Sep 17 00:00:00 2001 From: Sofian El Guetibi Date: Sun, 31 Jul 2022 20:12:20 +0200 Subject: [PATCH 11/24] Change vector into struct list_pointer* --- src/Multi2Delaf.cpp | 48 ++++++++++++++++++++++++++++++++------------- src/Multi2Delaf.h | 4 +++- 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/src/Multi2Delaf.cpp b/src/Multi2Delaf.cpp index e454f954..89495b8f 100644 --- a/src/Multi2Delaf.cpp +++ b/src/Multi2Delaf.cpp @@ -418,7 +418,8 @@ struct ConfigLine* tokenize_config_line(unichar* line, return new_config_line(pattern, nb_required_tag, config_command); } -void free_config_line(struct ConfigLine* line) { +void free_config_line(void* void_line) { + struct ConfigLine* line = (struct ConfigLine*)void_line; free_pattern(line->pattern); free(line); free_config_command(line->config_command); @@ -435,9 +436,7 @@ Multi2Delaf::Multi2Delaf(const char* config_filename) } Multi2Delaf::~Multi2Delaf() { - for (auto& lines : _config_lines) { - free_config_line(lines); - } + free_list_pointer(_config_lines, free_config_line); } /** @@ -542,14 +541,14 @@ void Multi2Delaf::load_config_file(U_FILE* config_file) { struct ConfigLine* config_line = tokenize_config_line(line, filename_without_path(_config_filename)); if (config_line != nullptr) { - _config_lines.emplace_back(config_line); + _config_lines = new_list_pointer(config_line, _config_lines); } } // the last line is potentially a config line - auto config_line = + struct ConfigLine* config_line = tokenize_config_line(line, filename_without_path(_config_filename)); if (config_line != nullptr) { - _config_lines.emplace_back(config_line); + _config_lines = 
new_list_pointer(config_line, _config_lines); } } @@ -634,8 +633,13 @@ unichar* Multi2Delaf::escape_inflected_input(const unichar* input) { unichar* Multi2Delaf::retrieve_lemma( const std::vector& delaf_tags, const unichar* multidelaf_string) const { - for (const auto& current_line : _config_lines) { + struct list_pointer* current_line_ptr = _config_lines; + + while (current_line_ptr != NULL) { + struct ConfigLine* current_line = + (struct ConfigLine*)current_line_ptr->pointer; if (current_line->config_command->lemma == nullptr) { + current_line_ptr = current_line_ptr->next; continue; } for (const auto& tag : delaf_tags) { @@ -665,6 +669,7 @@ unichar* Multi2Delaf::retrieve_lemma( if (current_line->nb_required_tag == 0) { return u_strdup(current_line->config_command->lemma); } + current_line_ptr = current_line_ptr->next; } fatal_error("No lemma is provided for this multidelaf string: %S\n", multidelaf_string); @@ -680,8 +685,12 @@ unichar* Multi2Delaf::retrieve_lemma( unichar* Multi2Delaf::retrieve_part_of_speech( const std::vector& delaf_tags, const unichar* multidelaf_string) const { - for (const auto& current_line : _config_lines) { + struct list_pointer* current_line_ptr = _config_lines; + while (current_line_ptr != NULL) { + struct ConfigLine* current_line = + (struct ConfigLine*)current_line_ptr->pointer; if (current_line->config_command->part_of_speech == nullptr) { + current_line_ptr = current_line_ptr->next; continue; } for (const auto& tag : delaf_tags) { @@ -710,6 +719,7 @@ unichar* Multi2Delaf::retrieve_part_of_speech( if (current_line->nb_required_tag == 0) { return u_strdup(current_line->config_command->part_of_speech); } + current_line_ptr = current_line_ptr->next; } fatal_error( "No grammatical cathegory is provided for this multidelaf string: %S\n", @@ -726,12 +736,16 @@ unichar* Multi2Delaf::retrieve_part_of_speech( */ unichar* Multi2Delaf::retrieve_semantic_codes( const std::vector& delaf_tags) const { - struct list_ustring* codes = nullptr; - 
struct list_ustring* ptr_command = nullptr; + struct list_ustring* codes = nullptr; + struct list_ustring* ptr_command = nullptr; + struct list_pointer* config_lines_ptr = _config_lines; for (const auto& tag : delaf_tags) { - for (const auto& line : _config_lines) { + config_lines_ptr = _config_lines; + while (config_lines_ptr != NULL) { + struct ConfigLine* line = (struct ConfigLine*)config_lines_ptr->pointer; if (line->config_command->semantic_codes == nullptr) { + config_lines_ptr = config_lines_ptr->next; continue; } if (is_entry_compatible_with_pattern(tag, line->pattern)) { @@ -764,6 +778,7 @@ unichar* Multi2Delaf::retrieve_semantic_codes( } } } + config_lines_ptr = config_lines_ptr->next; } } unichar* res = build_output_codes(codes, '+'); @@ -852,11 +867,15 @@ unichar* Multi2Delaf::retrieve_inflectional_codes( struct list_ustring* codes = nullptr; struct list_ustring* tmp_codes = nullptr; struct list_ustring* ptr_command = nullptr; + struct list_pointer* line_ptr = _config_lines; for (const auto& tag : delaf_tags) { - for (const auto& line : _config_lines) { - tmp_codes = codes; + line_ptr = _config_lines; + while (line_ptr != NULL) { + struct ConfigLine* line = (struct ConfigLine*)line_ptr->pointer; + tmp_codes = codes; if (line->config_command->inflectional_codes == nullptr) { + line_ptr = line_ptr->next; continue; } if (is_entry_compatible_with_pattern(tag, line->pattern)) { @@ -896,6 +915,7 @@ unichar* Multi2Delaf::retrieve_inflectional_codes( } } } + line_ptr = line_ptr->next; } } unichar* res = build_output_codes(codes, ':'); diff --git a/src/Multi2Delaf.h b/src/Multi2Delaf.h index 300e288f..2c2c7a50 100644 --- a/src/Multi2Delaf.h +++ b/src/Multi2Delaf.h @@ -29,6 +29,7 @@ #include #include "DELA.h" +#include "List_pointer.h" #include "Pattern.h" #include "Unicode.h" #include "base/compiler/keyword/eq_delete.h" @@ -111,7 +112,8 @@ class Multi2Delaf { static unichar* build_output_codes(const struct list_ustring* list, char separator); const 
VersatileEncodingConfig _vec = VEC_DEFAULT; - std::vector _config_lines; + //std::vector _config_lines; + struct list_pointer* _config_lines = nullptr; const char* _config_filename; }; From ea213605619388c362b4b9f2c39c19d289025c0b Mon Sep 17 00:00:00 2001 From: Sofian El Guetibi Date: Sun, 31 Jul 2022 20:33:11 +0200 Subject: [PATCH 12/24] Replace static methods in Multi2Delaf class by functions --- src/Multi2Delaf.cpp | 396 ++++++++++++++++++++++---------------------- src/Multi2Delaf.h | 18 +- 2 files changed, 197 insertions(+), 217 deletions(-) diff --git a/src/Multi2Delaf.cpp b/src/Multi2Delaf.cpp index 89495b8f..4e3fdda5 100644 --- a/src/Multi2Delaf.cpp +++ b/src/Multi2Delaf.cpp @@ -23,8 +23,6 @@ #include -#include - #include "DELA.h" #include "File.h" #include "Pattern.h" @@ -275,8 +273,6 @@ void free_config_command(struct ConfigCommand* command) { free(command); } - - /*================================================================= * ConfigLine class method *================================================================= */ @@ -425,7 +421,202 @@ void free_config_line(void* void_line) { free_config_command(line->config_command); } +/** + * Return the first delaf tag at the address *ptr. + * Return nullptr to indicate that there is no more tag. + * Set *ptr to the next unread character. + * Raises a fatal error if the delaf tag is not enclosed in curly braces. 
+ */ +struct dela_entry* tokenize_delaf_tag(unichar** ptr) { + unichar* line = *ptr; + unichar* next_no_blank_char = line; + if (advance_to_next_no_blank_char(&next_no_blank_char)) { + return nullptr; // end of the line, no more dela_entry + } + if (next_no_blank_char[0] != '{') { + fatal_error("Delaf tag must be enclosed in curly braces, line: '%S'\n", + line); + } + int i = 1; + while (next_no_blank_char[i] != '\0' && next_no_blank_char[i] != '}') { + i++; + } + if (next_no_blank_char[i] == '\0') { + fatal_error("Delaf tag must be enclosed in curly braces, line: '%S'\n", + line); + } + unichar* token_dela_entry = u_strndup(next_no_blank_char + 1, i - 1); + struct dela_entry* tag = tokenize_DELAF_line(token_dela_entry); + free(token_dela_entry); + *ptr = next_no_blank_char + i + 1; // + 1 to skip closing brace '}' + return tag; +} + +/** + * Read a line from the configuration file, and save the result in buffer. + * Returns EOF if end of file occurs, otherwise the number of unichar readed. + */ +int read_line_config_file(U_FILE* config_file, unichar* buffer, + int size_buffer) { + int c = 0; + int i = 0; + while (i < size_buffer - 1 && EOF != (c = u_fgetc(config_file))) { + if (c == '\r') { + if ('\n' == (c = u_fgetc(config_file))) { + buffer[i] = '\0'; + return i; + } + unichar rest_of_line[INPUTSIZEBUFFER] = {0}; + int j = 0; + while (j < INPUTSIZEBUFFER - 1 && EOF != (c = u_fgetc(config_file))) { + if (c != '\n') { + rest_of_line[j] = c; + j++; + } else { + rest_of_line[j] = '\0'; + u_fprintf(U_STDERR, "\\r not followed by a \\n, '%S' is ignored\n", + rest_of_line); + return i + j; + } + } + } + if (c == '\n') { + buffer[i] = '\0'; + return i; + } + buffer[i] = c; + i++; + } + buffer[i] = '\0'; + return EOF; +} + +/** + * Return the number of delaf tags that match the pattern. 
+ */ +int nb_delaf_tag_that_match_pattern( + const std::vector& delaf_tags, + const struct pattern* pattern) { + int res = 0; + for (auto& tag : delaf_tags) { + if (is_entry_compatible_with_pattern(tag, pattern)) { + res++; + } + } + return res; +} + +/** + * Return a new allocated unicode string. + */ +unichar* escape_inflected_input(const unichar* input) { + unichar buffer[INPUTSIZEBUFFER] = {0}; + int i_input = 0; + int i_buffer = 0; + while (i_input < INPUTSIZEBUFFER - 1 && i_buffer < INPUTSIZEBUFFER - 1) { + if (input[i_input] == ' ' && input[i_input + 1] == '\0') { + buffer[i_buffer] = '\0'; + return u_strdup(buffer); + } + if (input[i_input] == '=' || input[i_input] == '.' || + input[i_input] == ',') { + buffer[i_buffer] = '\\'; + i_buffer++; + } + buffer[i_buffer] = input[i_input]; + i_buffer++; + i_input++; + } + buffer[i_buffer < INPUTSIZEBUFFER - 1 ? i_buffer : INPUTSIZEBUFFER - 1] = + '\0'; + return u_strdup(buffer); +} + +/** + * Returns a new string containing the first one and the second one without duplicates. + */ +unichar* complete_first_with_second(const unichar* first, + const unichar* second) { + unichar to_add[INPUTSIZEBUFFER] = {0}; + unichar res[INPUTSIZEBUFFER] = {0}; + int j = 0; + for (size_t i = 0; i < u_strlen(second); i++) { + if (u_strchr(first, second[i]) == nullptr) { + to_add[j] = second[i]; + j++; + } + } + to_add[j] = '\0'; + u_sprintf(res, "%S%S", first, to_add); + return u_strdup(res); +} +/** + * Returns a new list where ::copy has been replaced by the inflectional codes of the tag. 
+ */ +struct list_ustring* clone_and_replace_copy_command( + const struct list_ustring* inflectional_command, + const struct dela_entry* tag) { + struct list_ustring* res = nullptr; + + while (inflectional_command != nullptr) { + if (u_strcmp(inflectional_command->string, Multi2Delaf::COLUMN_COPY) != 0) { + res = sorted_insert(inflectional_command->string, res); + } else { + for (int i = 0; i < tag->n_inflectional_codes; i++) { + res = sorted_insert(tag->inflectional_codes[i], res); + } + } + inflectional_command = inflectional_command->next; + } + return res; +} + +/** + * Create a new allocated list containing the Cartesian product of the two lists in parameter + * and substituate ::copy command by codes in the delaf tag. + * Suppose that l2 is not the empty list. + */ +struct list_ustring* product(struct list_ustring* l1, struct list_ustring* l2) { + unichar* tmp_code = nullptr; + struct list_ustring* res = nullptr; + struct list_ustring* ptr_l1 = l1; + struct list_ustring* ptr_l2 = l2; + if (l1 == nullptr) { + return clone(l2); + } + while (ptr_l1 != nullptr) { + ptr_l2 = l2; + while (ptr_l2 != nullptr) { + tmp_code = complete_first_with_second(ptr_l1->string, ptr_l2->string); + res = sorted_insert(tmp_code, res); + free(tmp_code); + ptr_l2 = ptr_l2->next; + } + ptr_l1 = ptr_l1->next; + } + return res; +} + +/** + * Return a new allocated unicode string. 
+ */ +unichar* build_output_codes(const struct list_ustring* codes, char prefix) { + unichar buffer[INPUTSIZEBUFFER] = {0}; + if (codes == nullptr) { + return u_strdup(""); + } + while (codes != nullptr) { + if (u_strlen(buffer) + u_strlen(codes->string) + 2 >= INPUTSIZEBUFFER - 1) { + fatal_error( + "internal err(build_output_codes): buffer is not " + "big enough\n"); + } + u_sprintf(buffer, "%S%c%S", buffer, prefix, codes->string); + codes = codes->next; + } + return u_strdup(buffer); +} /*================================================================= * Multi2Delaf class method @@ -490,44 +681,7 @@ void Multi2Delaf::translate_multidelaf_to_delaf(const unichar* inflected_input, } } -/** - * Read a line from the configuration file, and save the result in buffer. - * Returns EOF if end of file occurs, otherwise the number of unichar readed. - */ -int Multi2Delaf::read_line_config_file(U_FILE* config_file, unichar* buffer, - int size_buffer) { - int c = 0; - int i = 0; - while (i < size_buffer - 1 && EOF != (c = u_fgetc(config_file))) { - if (c == '\r') { - if ('\n' == (c = u_fgetc(config_file))) { - buffer[i] = '\0'; - return i; - } - unichar rest_of_line[INPUTSIZEBUFFER] = {0}; - int j = 0; - while (j < INPUTSIZEBUFFER - 1 && EOF != (c = u_fgetc(config_file))) { - if (c != '\n') { - rest_of_line[j] = c; - j++; - } else { - rest_of_line[j] = '\0'; - u_fprintf(U_STDERR, "\\r not followed by a \\n, '%S' is ignored\n", - rest_of_line); - return i + j; - } - } - } - if (c == '\n') { - buffer[i] = '\0'; - return i; - } - buffer[i] = c; - i++; - } - buffer[i] = '\0'; - return EOF; -} + /** * Load the configuration file. @@ -552,78 +706,6 @@ void Multi2Delaf::load_config_file(U_FILE* config_file) { } } -/** - * Return the first delaf tag at the address *ptr. - * Return nullptr to indicate that there is no more tag. - * Set *ptr to the next unread character. - * Raises a fatal error if the delaf tag is not enclosed in curly braces. 
- */ -struct dela_entry* Multi2Delaf::tokenize_delaf_tag(unichar** ptr) { - unichar* line = *ptr; - unichar* next_no_blank_char = line; - if (advance_to_next_no_blank_char(&next_no_blank_char)) { - return nullptr; // end of the line, no more dela_entry - } - if (next_no_blank_char[0] != '{') { - fatal_error("Delaf tag must be enclosed in curly braces, line: '%S'\n", - line); - } - int i = 1; - while (next_no_blank_char[i] != '\0' && next_no_blank_char[i] != '}') { - i++; - } - if (next_no_blank_char[i] == '\0') { - fatal_error("Delaf tag must be enclosed in curly braces, line: '%S'\n", - line); - } - unichar* token_dela_entry = u_strndup(next_no_blank_char + 1, i - 1); - struct dela_entry* tag = tokenize_DELAF_line(token_dela_entry); - free(token_dela_entry); - *ptr = next_no_blank_char + i + 1; // + 1 to skip closing brace '}' - return tag; -} - -/** - * Return the number of delaf tags that match the pattern. - */ -int Multi2Delaf::nb_delaf_tag_that_match_pattern( - const std::vector& delaf_tags, - const struct pattern* pattern) { - int res = 0; - for (auto& tag : delaf_tags) { - if (is_entry_compatible_with_pattern(tag, pattern)) { - res++; - } - } - return res; -} - -/** - * Return a new allocated unicode string. - */ -unichar* Multi2Delaf::escape_inflected_input(const unichar* input) { - unichar buffer[INPUTSIZEBUFFER] = {0}; - int i_input = 0; - int i_buffer = 0; - while (i_input < INPUTSIZEBUFFER - 1 && i_buffer < INPUTSIZEBUFFER - 1) { - if (input[i_input] == ' ' && input[i_input + 1] == '\0') { - buffer[i_buffer] = '\0'; - return u_strdup(buffer); - } - if (input[i_input] == '=' || input[i_input] == '.' || - input[i_input] == ',') { - buffer[i_buffer] = '\\'; - i_buffer++; - } - buffer[i_buffer] = input[i_input]; - i_buffer++; - i_input++; - } - buffer[i_buffer < INPUTSIZEBUFFER - 1 ? 
i_buffer : INPUTSIZEBUFFER - 1] = - '\0'; - return u_strdup(buffer); -} - /** * Retrieve the lemma according to the specification: * We look every line of _config_lines and keep the lemma of the first-one matching containing a lemma. @@ -788,72 +870,7 @@ unichar* Multi2Delaf::retrieve_semantic_codes( return res; } -/** - * Returns a new string containing the first one and the second one without duplicates. - */ -unichar* Multi2Delaf::complete_first_with_second(const unichar* first, - const unichar* second) { - unichar to_add[INPUTSIZEBUFFER] = {0}; - unichar res[INPUTSIZEBUFFER] = {0}; - int j = 0; - for (size_t i = 0; i < u_strlen(second); i++) { - if (u_strchr(first, second[i]) == nullptr) { - to_add[j] = second[i]; - j++; - } - } - to_add[j] = '\0'; - u_sprintf(res, "%S%S", first, to_add); - return u_strdup(res); -} - -/** - * Returns a new list where ::copy has been replaced by the inflectional codes of the tag. - */ -struct list_ustring* Multi2Delaf::clone_and_replace_copy_command( - const struct list_ustring* inflectional_command, - const struct dela_entry* tag) { - struct list_ustring* res = nullptr; - - while (inflectional_command != nullptr) { - if (u_strcmp(inflectional_command->string, Multi2Delaf::COLUMN_COPY) != 0) { - res = sorted_insert(inflectional_command->string, res); - } else { - for (int i = 0; i < tag->n_inflectional_codes; i++) { - res = sorted_insert(tag->inflectional_codes[i], res); - } - } - inflectional_command = inflectional_command->next; - } - return res; -} -/** - * Create a new allocated list containing the Cartesian product of the two lists in parameter - * and substituate ::copy command by codes in the delaf tag. - * Suppose that l2 is not the empty list. 
- */ -struct list_ustring* Multi2Delaf::product(struct list_ustring* l1, - struct list_ustring* l2) { - unichar* tmp_code = nullptr; - struct list_ustring* res = nullptr; - struct list_ustring* ptr_l1 = l1; - struct list_ustring* ptr_l2 = l2; - if (l1 == nullptr) { - return clone(l2); - } - while (ptr_l1 != nullptr) { - ptr_l2 = l2; - while (ptr_l2 != nullptr) { - tmp_code = complete_first_with_second(ptr_l1->string, ptr_l2->string); - res = sorted_insert(tmp_code, res); - free(tmp_code); - ptr_l2 = ptr_l2->next; - } - ptr_l1 = ptr_l1->next; - } - return res; -} /** * Retrieves inflectional codes according to the specification: @@ -925,25 +942,4 @@ unichar* Multi2Delaf::retrieve_inflectional_codes( return res; } -/** - * Return a new allocated unicode string. - */ -unichar* Multi2Delaf::build_output_codes(const struct list_ustring* codes, - char prefix) { - unichar buffer[INPUTSIZEBUFFER] = {0}; - if (codes == nullptr) { - return u_strdup(""); - } - while (codes != nullptr) { - if (u_strlen(buffer) + u_strlen(codes->string) + 2 >= INPUTSIZEBUFFER - 1) { - fatal_error( - "internal err(Multi2Delaf::build_output_codes): buffer is not " - "big enough\n"); - } - u_sprintf(buffer, "%S%c%S", buffer, prefix, codes->string); - codes = codes->next; - } - return u_strdup(buffer); -} - } // namespace unitex diff --git a/src/Multi2Delaf.h b/src/Multi2Delaf.h index 2c2c7a50..bd2686de 100644 --- a/src/Multi2Delaf.h +++ b/src/Multi2Delaf.h @@ -85,14 +85,7 @@ class Multi2Delaf { ~Multi2Delaf(); private: - static int read_line_config_file(U_FILE* config_file, unichar* buffer, - int size_buffer); void load_config_file(U_FILE* config_file); - static struct dela_entry* tokenize_delaf_tag(unichar** next); - static int nb_delaf_tag_that_match_pattern( - const std::vector& delaf_tags, - const struct pattern* pattern); - static unichar* escape_inflected_input(const unichar* input); unichar* retrieve_lemma(const std::vector& delaf_tags, const unichar* multidelaf_string) const; unichar* 
retrieve_part_of_speech( @@ -100,19 +93,10 @@ class Multi2Delaf { const unichar* multidelaf_string) const; unichar* retrieve_semantic_codes( const std::vector& delaf_tags) const; - static unichar* complete_first_with_second(const unichar* first, - const unichar* second); - static struct list_ustring* clone_and_replace_copy_command( - const struct list_ustring* inflectional_command, - const struct dela_entry* tag); - static struct list_ustring* product(struct list_ustring* l1, - struct list_ustring* l2); unichar* retrieve_inflectional_codes( const std::vector& delaf_tags) const; - static unichar* build_output_codes(const struct list_ustring* list, - char separator); + const VersatileEncodingConfig _vec = VEC_DEFAULT; - //std::vector _config_lines; struct list_pointer* _config_lines = nullptr; const char* _config_filename; }; From 39ba8f97551b164ec437f81ed8d445b78348ff48 Mon Sep 17 00:00:00 2001 From: Sofian El Guetibi Date: Sun, 31 Jul 2022 21:39:30 +0200 Subject: [PATCH 13/24] Change vector into struct list_pointer* --- src/Multi2Delaf.cpp | 75 +++++++++++++++++++++++++++++---------------- src/Multi2Delaf.h | 14 +++------ 2 files changed, 54 insertions(+), 35 deletions(-) diff --git a/src/Multi2Delaf.cpp b/src/Multi2Delaf.cpp index 4e3fdda5..ce5c440c 100644 --- a/src/Multi2Delaf.cpp +++ b/src/Multi2Delaf.cpp @@ -494,14 +494,15 @@ int read_line_config_file(U_FILE* config_file, unichar* buffer, /** * Return the number of delaf tags that match the pattern. 
*/ -int nb_delaf_tag_that_match_pattern( - const std::vector& delaf_tags, - const struct pattern* pattern) { +int nb_delaf_tag_that_match_pattern(struct list_pointer* delaf_tags, + const struct pattern* pattern) { int res = 0; - for (auto& tag : delaf_tags) { + while (delaf_tags != NULL) { + struct dela_entry* tag = (struct dela_entry*)delaf_tags->pointer; if (is_entry_compatible_with_pattern(tag, pattern)) { res++; } + delaf_tags = delaf_tags->next; } return res; } @@ -651,11 +652,11 @@ void Multi2Delaf::parse_config_file() { */ void Multi2Delaf::translate_multidelaf_to_delaf(const unichar* inflected_input, unichar* buffer) const { - unichar* ptr = buffer; - auto delaf_tags = std::vector(); - struct dela_entry* new_tag = nullptr; + unichar* ptr = buffer; + struct list_pointer* delaf_tags = NULL; + struct dela_entry* new_tag = nullptr; while (nullptr != (new_tag = tokenize_delaf_tag(&ptr))) { - delaf_tags.push_back(new_tag); + delaf_tags = new_list_pointer(new_tag, delaf_tags); } unichar* inflected = escape_inflected_input(inflected_input); unichar* lemma = retrieve_lemma(delaf_tags, buffer); @@ -676,9 +677,12 @@ void Multi2Delaf::translate_multidelaf_to_delaf(const unichar* inflected_input, free(part_of_speech); free(semantic_codes); free(inflectional_codes); - for (const auto& tag : delaf_tags) { - free_dela_entry(tag); + struct list_pointer* delaf_tags_ptr = delaf_tags; + while (delaf_tags_ptr != NULL) { + free_dela_entry((struct dela_entry*)delaf_tags_ptr->pointer); + delaf_tags_ptr = delaf_tags_ptr->next; } + free_list_pointer(delaf_tags); } @@ -712,10 +716,11 @@ void Multi2Delaf::load_config_file(U_FILE* config_file) { * If no lemma is corresponding, raises a fatal_error(). * Return a new allocated unicode string describing the lemma. 
*/ -unichar* Multi2Delaf::retrieve_lemma( - const std::vector& delaf_tags, - const unichar* multidelaf_string) const { +unichar* Multi2Delaf::retrieve_lemma(struct list_pointer* delaf_tags, + const unichar* multidelaf_string) const { struct list_pointer* current_line_ptr = _config_lines; + struct list_pointer* delaf_tags_ptr = delaf_tags; + struct dela_entry* tag = NULL; while (current_line_ptr != NULL) { struct ConfigLine* current_line = @@ -724,7 +729,9 @@ unichar* Multi2Delaf::retrieve_lemma( current_line_ptr = current_line_ptr->next; continue; } - for (const auto& tag : delaf_tags) { + delaf_tags_ptr = delaf_tags; + while (delaf_tags_ptr != NULL) { + tag = (struct dela_entry*)delaf_tags_ptr->pointer; if (is_entry_compatible_with_pattern(tag, current_line->pattern)) { if (u_strcmp(current_line->config_command->lemma, Multi2Delaf::COMMA_COPY) == 0) { @@ -747,6 +754,7 @@ unichar* Multi2Delaf::retrieve_lemma( return u_strdup(current_line->config_command->lemma); } } + delaf_tags_ptr = delaf_tags_ptr->next; } if (current_line->nb_required_tag == 0) { return u_strdup(current_line->config_command->lemma); @@ -765,9 +773,10 @@ unichar* Multi2Delaf::retrieve_lemma( * Return a new allocated unicode string describing the part of speech. 
*/ unichar* Multi2Delaf::retrieve_part_of_speech( - const std::vector& delaf_tags, - const unichar* multidelaf_string) const { + struct list_pointer* delaf_tags, const unichar* multidelaf_string) const { struct list_pointer* current_line_ptr = _config_lines; + struct list_pointer* delaf_tag_ptr = delaf_tags; + struct dela_entry* tag = NULL; while (current_line_ptr != NULL) { struct ConfigLine* current_line = (struct ConfigLine*)current_line_ptr->pointer; @@ -775,7 +784,10 @@ unichar* Multi2Delaf::retrieve_part_of_speech( current_line_ptr = current_line_ptr->next; continue; } - for (const auto& tag : delaf_tags) { + delaf_tag_ptr = delaf_tags; + while (delaf_tag_ptr != NULL) { + // for (const auto& tag : delaf_tags) { + tag = (struct dela_entry*)delaf_tag_ptr->pointer; if (is_entry_compatible_with_pattern(tag, current_line->pattern)) { if (u_strcmp(current_line->config_command->part_of_speech, Multi2Delaf::DOT_COPY) == 0) { @@ -797,6 +809,7 @@ unichar* Multi2Delaf::retrieve_part_of_speech( return u_strdup(current_line->config_command->part_of_speech); } } + delaf_tag_ptr = delaf_tag_ptr->next; } if (current_line->nb_required_tag == 0) { return u_strdup(current_line->config_command->part_of_speech); @@ -817,12 +830,16 @@ unichar* Multi2Delaf::retrieve_part_of_speech( * Return a new allocated unicode string describing semantic codes. 
*/ unichar* Multi2Delaf::retrieve_semantic_codes( - const std::vector& delaf_tags) const { + struct list_pointer* delaf_tags) const { struct list_ustring* codes = nullptr; struct list_ustring* ptr_command = nullptr; struct list_pointer* config_lines_ptr = _config_lines; + struct list_pointer* delaf_tag_ptr = delaf_tags; + struct dela_entry* tag = NULL; - for (const auto& tag : delaf_tags) { + while (delaf_tag_ptr != NULL) { + // for (const auto& tag : delaf_tags) { + tag = (struct dela_entry*)delaf_tag_ptr->pointer; config_lines_ptr = _config_lines; while (config_lines_ptr != NULL) { struct ConfigLine* line = (struct ConfigLine*)config_lines_ptr->pointer; @@ -862,6 +879,7 @@ unichar* Multi2Delaf::retrieve_semantic_codes( } config_lines_ptr = config_lines_ptr->next; } + delaf_tag_ptr = delaf_tag_ptr->next; } unichar* res = build_output_codes(codes, '+'); if (codes) { @@ -880,13 +898,17 @@ unichar* Multi2Delaf::retrieve_semantic_codes( * Return a new allocated unicode string describing inflectional codes. 
*/ unichar* Multi2Delaf::retrieve_inflectional_codes( - const std::vector& delaf_tags) const { - struct list_ustring* codes = nullptr; - struct list_ustring* tmp_codes = nullptr; - struct list_ustring* ptr_command = nullptr; - struct list_pointer* line_ptr = _config_lines; - - for (const auto& tag : delaf_tags) { + struct list_pointer* delaf_tags) const { + struct list_ustring* codes = nullptr; + struct list_ustring* tmp_codes = nullptr; + struct list_ustring* ptr_command = nullptr; + struct list_pointer* line_ptr = _config_lines; + struct list_pointer* delaf_tag_ptr = delaf_tags; + struct dela_entry* tag = NULL; + + while (delaf_tag_ptr != NULL) { + // for (const auto& tag : delaf_tags) { + tag = (struct dela_entry*)delaf_tag_ptr->pointer; line_ptr = _config_lines; while (line_ptr != NULL) { struct ConfigLine* line = (struct ConfigLine*)line_ptr->pointer; @@ -934,6 +956,7 @@ unichar* Multi2Delaf::retrieve_inflectional_codes( } line_ptr = line_ptr->next; } + delaf_tag_ptr = delaf_tag_ptr->next; } unichar* res = build_output_codes(codes, ':'); if (codes) { diff --git a/src/Multi2Delaf.h b/src/Multi2Delaf.h index bd2686de..fa0e00dd 100644 --- a/src/Multi2Delaf.h +++ b/src/Multi2Delaf.h @@ -26,7 +26,6 @@ #define HAS_UNITEX_NAMESPACE 1 #endif -#include #include "DELA.h" #include "List_pointer.h" @@ -86,15 +85,12 @@ class Multi2Delaf { private: void load_config_file(U_FILE* config_file); - unichar* retrieve_lemma(const std::vector& delaf_tags, + unichar* retrieve_lemma(struct list_pointer* delaf_tags, const unichar* multidelaf_string) const; - unichar* retrieve_part_of_speech( - const std::vector& delaf_tags, - const unichar* multidelaf_string) const; - unichar* retrieve_semantic_codes( - const std::vector& delaf_tags) const; - unichar* retrieve_inflectional_codes( - const std::vector& delaf_tags) const; + unichar* retrieve_part_of_speech(struct list_pointer* delaf_tags, + const unichar* multidelaf_string) const; + unichar* retrieve_semantic_codes(struct list_pointer* 
delaf_tags) const; + unichar* retrieve_inflectional_codes(struct list_pointer* delaf_tags) const; const VersatileEncodingConfig _vec = VEC_DEFAULT; struct list_pointer* _config_lines = nullptr; From a5224d3194fcedbebfbfe49d5ba2b907f2859ecd Mon Sep 17 00:00:00 2001 From: Sofian El Guetibi Date: Mon, 1 Aug 2022 11:12:41 +0200 Subject: [PATCH 14/24] Replace nullptr by NULL --- src/Fst2List.cpp | 4 +- src/Multi2Delaf.cpp | 96 ++++++++++++++++++++++----------------------- src/Multi2Delaf.h | 2 +- 3 files changed, 49 insertions(+), 53 deletions(-) diff --git a/src/Fst2List.cpp b/src/Fst2List.cpp index 98ca4a46..84320aa8 100644 --- a/src/Fst2List.cpp +++ b/src/Fst2List.cpp @@ -300,7 +300,7 @@ class CFstApp { struct hash_table* path_to_stop; /* a hash table to know all the Fst2Tag whose path exploration must be interrupted */ struct hash_table* dela_entries; /* a hash table to get the dela_entries of created boxes when lexical masks are processed */ bool compileToDelaf = false; - Multi2Delaf *multi2Delaf = nullptr; + Multi2Delaf *multi2Delaf = NULL; bool isMorphological = false; bool makeDic = false; @@ -408,7 +408,7 @@ class CFstApp { free(numOfIgnore); } deleteCallIdMap(); - if (multi2Delaf != nullptr) { + if (multi2Delaf != NULL) { delete multi2Delaf; } } diff --git a/src/Multi2Delaf.cpp b/src/Multi2Delaf.cpp index ce5c440c..bf5274c2 100644 --- a/src/Multi2Delaf.cpp +++ b/src/Multi2Delaf.cpp @@ -150,7 +150,7 @@ struct list_ustring* tokenize_semantic_codes(unichar** ptr, const char* config_filename) { unichar* line = *ptr; unichar* next_code = line; - struct list_ustring* codes = nullptr; + struct list_ustring* codes = NULL; while (*next_code != '\0' && *next_code != ':') { if (*next_code == '+') { unichar* new_code = @@ -198,7 +198,7 @@ unichar* tokenize_one_inflectional_code(unichar** ptr, */ struct list_ustring* tokenize_inflectional_codes(unichar* str, const char* config_filename) { - struct list_ustring* codes = nullptr; + struct list_ustring* codes = NULL; unichar* 
next_code = str; while (*next_code != '\0') { if (*next_code == ':') { @@ -236,10 +236,10 @@ struct ConfigCommand* new_config_command( */ struct ConfigCommand* tokenize_config_command(unichar* str, const char* config_filename) { - unichar* lemma = nullptr; - unichar* part_of_speech = nullptr; - struct list_ustring* semantic_codes = nullptr; - struct list_ustring* inflectional_codes = nullptr; + unichar* lemma = NULL; + unichar* part_of_speech = NULL; + struct list_ustring* semantic_codes = NULL; + struct list_ustring* inflectional_codes = NULL; unichar* ptr = str; if (str[0] == ',') { lemma = tokenize_lemma(&ptr, config_filename); @@ -273,10 +273,6 @@ void free_config_command(struct ConfigCommand* command) { free(command); } -/*================================================================= - * ConfigLine class method - *================================================================= */ - /** * Return 1 if error occurs, otherwise 0. * Assumes that pattern is suround by < >. @@ -334,7 +330,7 @@ int tokenize_nb_required_tag(unichar** ptr, int* res) { return 1; } res[index_buffer] = '\0'; - *res = u_parse_int(buffer, nullptr); + *res = u_parse_int(buffer, NULL); *ptr = line + index + 1; return 0; } @@ -357,7 +353,7 @@ struct ConfigLine* new_config_line(struct pattern* pattern, int nb_required_tag, /** * Tokenize a config line. * Returns a struct ConfigLine* if there is a well-formed line. - * otherwise returns nullptr. + * otherwise returns NULL. * Raises a fatal error in case of malformed line. 
*/ struct ConfigLine* tokenize_config_line(unichar* line, @@ -365,10 +361,10 @@ struct ConfigLine* tokenize_config_line(unichar* line, unichar* nextNoEmptyUnichar = line; unichar patternToken[INPUTSIZEBUFFER] = {0}; if (advance_to_next_no_blank_char(&nextNoEmptyUnichar)) { - return nullptr; // skip empty line + return NULL; // skip empty line } if (nextNoEmptyUnichar[0] == '#') { - return nullptr; // skip comment line + return NULL; // skip comment line } if (recognize_pattern_token(&nextNoEmptyUnichar, patternToken)) { fatal_error( @@ -377,10 +373,10 @@ struct ConfigLine* tokenize_config_line(unichar* line, config_filename, line); } // build pattern - struct pattern* pattern = build_pattern(patternToken, nullptr, 0, nullptr); + struct pattern* pattern = build_pattern(patternToken, NULL, 0, NULL); if (advance_to_next_no_blank_char(&nextNoEmptyUnichar)) { // if there is no command, we skip the current line like comment line - return nullptr; + return NULL; } // build nb_required_tag int nb_required_tag = Multi2Delaf::NOT_SPECIFIED; @@ -392,7 +388,7 @@ struct ConfigLine* tokenize_config_line(unichar* line, } if (advance_to_next_no_blank_char(&nextNoEmptyUnichar)) { // if there is no command, we skip the current line like comment line - return nullptr; + return NULL; } // build config command struct ConfigCommand* config_command = @@ -408,7 +404,7 @@ struct ConfigLine* tokenize_config_line(unichar* line, nb_required_tag != Multi2Delaf::NOT_SPECIFIED && nb_required_tag != 1) { fatal_error( "Command ..copy is incompatible with an integer enclosed in curly " - "braces, except for {1}in file:%s, line: '%S'\n", + "braces, except for {1} in file:%s, line: '%S'\n", config_filename, line); } return new_config_line(pattern, nb_required_tag, config_command); @@ -423,7 +419,7 @@ void free_config_line(void* void_line) { /** * Return the first delaf tag at the address *ptr. - * Return nullptr to indicate that there is no more tag. 
+ * Return NULL to indicate that there is no more tag. * Set *ptr to the next unread character. * Raises a fatal error if the delaf tag is not enclosed in curly braces. */ @@ -431,7 +427,7 @@ struct dela_entry* tokenize_delaf_tag(unichar** ptr) { unichar* line = *ptr; unichar* next_no_blank_char = line; if (advance_to_next_no_blank_char(&next_no_blank_char)) { - return nullptr; // end of the line, no more dela_entry + return NULL; // end of the line, no more dela_entry } if (next_no_blank_char[0] != '{') { fatal_error("Delaf tag must be enclosed in curly braces, line: '%S'\n", @@ -542,7 +538,7 @@ unichar* complete_first_with_second(const unichar* first, unichar res[INPUTSIZEBUFFER] = {0}; int j = 0; for (size_t i = 0; i < u_strlen(second); i++) { - if (u_strchr(first, second[i]) == nullptr) { + if (u_strchr(first, second[i]) == NULL) { to_add[j] = second[i]; j++; } @@ -558,9 +554,9 @@ unichar* complete_first_with_second(const unichar* first, struct list_ustring* clone_and_replace_copy_command( const struct list_ustring* inflectional_command, const struct dela_entry* tag) { - struct list_ustring* res = nullptr; + struct list_ustring* res = NULL; - while (inflectional_command != nullptr) { + while (inflectional_command != NULL) { if (u_strcmp(inflectional_command->string, Multi2Delaf::COLUMN_COPY) != 0) { res = sorted_insert(inflectional_command->string, res); } else { @@ -579,16 +575,16 @@ struct list_ustring* clone_and_replace_copy_command( * Suppose that l2 is not the empty list. 
*/ struct list_ustring* product(struct list_ustring* l1, struct list_ustring* l2) { - unichar* tmp_code = nullptr; - struct list_ustring* res = nullptr; + unichar* tmp_code = NULL; + struct list_ustring* res = NULL; struct list_ustring* ptr_l1 = l1; struct list_ustring* ptr_l2 = l2; - if (l1 == nullptr) { + if (l1 == NULL) { return clone(l2); } - while (ptr_l1 != nullptr) { + while (ptr_l1 != NULL) { ptr_l2 = l2; - while (ptr_l2 != nullptr) { + while (ptr_l2 != NULL) { tmp_code = complete_first_with_second(ptr_l1->string, ptr_l2->string); res = sorted_insert(tmp_code, res); free(tmp_code); @@ -604,10 +600,10 @@ struct list_ustring* product(struct list_ustring* l1, struct list_ustring* l2) { */ unichar* build_output_codes(const struct list_ustring* codes, char prefix) { unichar buffer[INPUTSIZEBUFFER] = {0}; - if (codes == nullptr) { + if (codes == NULL) { return u_strdup(""); } - while (codes != nullptr) { + while (codes != NULL) { if (u_strlen(buffer) + u_strlen(codes->string) + 2 >= INPUTSIZEBUFFER - 1) { fatal_error( "internal err(build_output_codes): buffer is not " @@ -638,7 +634,7 @@ Multi2Delaf::~Multi2Delaf() { */ void Multi2Delaf::parse_config_file() { U_FILE* config_file = u_fopen(&_vec, _config_filename, U_READ); - if (config_file == nullptr) { + if (config_file == NULL) { fatal_error("Cannot open configuration file %s\n", _config_filename); } load_config_file(config_file); @@ -654,8 +650,8 @@ void Multi2Delaf::translate_multidelaf_to_delaf(const unichar* inflected_input, unichar* buffer) const { unichar* ptr = buffer; struct list_pointer* delaf_tags = NULL; - struct dela_entry* new_tag = nullptr; - while (nullptr != (new_tag = tokenize_delaf_tag(&ptr))) { + struct dela_entry* new_tag = NULL; + while (NULL != (new_tag = tokenize_delaf_tag(&ptr))) { delaf_tags = new_list_pointer(new_tag, delaf_tags); } unichar* inflected = escape_inflected_input(inflected_input); @@ -698,14 +694,14 @@ void Multi2Delaf::load_config_file(U_FILE* config_file) { (eof = 
read_line_config_file(config_file, line, INPUTSIZEBUFFER))) { struct ConfigLine* config_line = tokenize_config_line(line, filename_without_path(_config_filename)); - if (config_line != nullptr) { + if (config_line != NULL) { _config_lines = new_list_pointer(config_line, _config_lines); } } // the last line is potentially a config line struct ConfigLine* config_line = tokenize_config_line(line, filename_without_path(_config_filename)); - if (config_line != nullptr) { + if (config_line != NULL) { _config_lines = new_list_pointer(config_line, _config_lines); } } @@ -725,7 +721,7 @@ unichar* Multi2Delaf::retrieve_lemma(struct list_pointer* delaf_tags, while (current_line_ptr != NULL) { struct ConfigLine* current_line = (struct ConfigLine*)current_line_ptr->pointer; - if (current_line->config_command->lemma == nullptr) { + if (current_line->config_command->lemma == NULL) { current_line_ptr = current_line_ptr->next; continue; } @@ -763,7 +759,7 @@ unichar* Multi2Delaf::retrieve_lemma(struct list_pointer* delaf_tags, } fatal_error("No lemma is provided for this multidelaf string: %S\n", multidelaf_string); - return nullptr; + return NULL; } /** @@ -780,7 +776,7 @@ unichar* Multi2Delaf::retrieve_part_of_speech( while (current_line_ptr != NULL) { struct ConfigLine* current_line = (struct ConfigLine*)current_line_ptr->pointer; - if (current_line->config_command->part_of_speech == nullptr) { + if (current_line->config_command->part_of_speech == NULL) { current_line_ptr = current_line_ptr->next; continue; } @@ -819,7 +815,7 @@ unichar* Multi2Delaf::retrieve_part_of_speech( fatal_error( "No grammatical cathegory is provided for this multidelaf string: %S\n", multidelaf_string); - return nullptr; + return NULL; } /** @@ -831,8 +827,8 @@ unichar* Multi2Delaf::retrieve_part_of_speech( */ unichar* Multi2Delaf::retrieve_semantic_codes( struct list_pointer* delaf_tags) const { - struct list_ustring* codes = nullptr; - struct list_ustring* ptr_command = nullptr; + struct list_ustring* 
codes = NULL; + struct list_ustring* ptr_command = NULL; struct list_pointer* config_lines_ptr = _config_lines; struct list_pointer* delaf_tag_ptr = delaf_tags; struct dela_entry* tag = NULL; @@ -843,13 +839,13 @@ unichar* Multi2Delaf::retrieve_semantic_codes( config_lines_ptr = _config_lines; while (config_lines_ptr != NULL) { struct ConfigLine* line = (struct ConfigLine*)config_lines_ptr->pointer; - if (line->config_command->semantic_codes == nullptr) { + if (line->config_command->semantic_codes == NULL) { config_lines_ptr = config_lines_ptr->next; continue; } if (is_entry_compatible_with_pattern(tag, line->pattern)) { ptr_command = line->config_command->semantic_codes; - while (ptr_command != nullptr) { + while (ptr_command != NULL) { if (line->nb_required_tag == Multi2Delaf::NOT_SPECIFIED || line->nb_required_tag == nb_delaf_tag_that_match_pattern(delaf_tags, line->pattern)) { @@ -871,7 +867,7 @@ unichar* Multi2Delaf::retrieve_semantic_codes( 0 == nb_delaf_tag_that_match_pattern(delaf_tags, line->pattern)) { struct list_ustring* ptr_command = line->config_command->semantic_codes; - while (ptr_command != nullptr) { + while (ptr_command != NULL) { codes = sorted_insert(ptr_command->string, codes); ptr_command = ptr_command->next; } @@ -899,9 +895,9 @@ unichar* Multi2Delaf::retrieve_semantic_codes( */ unichar* Multi2Delaf::retrieve_inflectional_codes( struct list_pointer* delaf_tags) const { - struct list_ustring* codes = nullptr; - struct list_ustring* tmp_codes = nullptr; - struct list_ustring* ptr_command = nullptr; + struct list_ustring* codes = NULL; + struct list_ustring* tmp_codes = NULL; + struct list_ustring* ptr_command = NULL; struct list_pointer* line_ptr = _config_lines; struct list_pointer* delaf_tag_ptr = delaf_tags; struct dela_entry* tag = NULL; @@ -913,7 +909,7 @@ unichar* Multi2Delaf::retrieve_inflectional_codes( while (line_ptr != NULL) { struct ConfigLine* line = (struct ConfigLine*)line_ptr->pointer; tmp_codes = codes; - if 
(line->config_command->inflectional_codes == nullptr) { + if (line->config_command->inflectional_codes == NULL) { line_ptr = line_ptr->next; continue; } @@ -926,8 +922,8 @@ unichar* Multi2Delaf::retrieve_inflectional_codes( if (is_in_list(Multi2Delaf::COLUMN_COPY, ptr_command)) { struct list_ustring* tmp_lst = clone_and_replace_copy_command(ptr_command, tag); - if (tmp_lst == nullptr) { - codes = product(nullptr, tmp_codes); + if (tmp_lst == NULL) { + codes = product(NULL, tmp_codes); } else { codes = product(tmp_codes, tmp_lst); free_list_ustring(tmp_lst); diff --git a/src/Multi2Delaf.h b/src/Multi2Delaf.h index fa0e00dd..49849762 100644 --- a/src/Multi2Delaf.h +++ b/src/Multi2Delaf.h @@ -73,6 +73,7 @@ class Multi2Delaf { static constexpr int NOT_SPECIFIED = -1; // nb_required_tag default value Multi2Delaf(const char* config_filename); + ~Multi2Delaf(); // Not copyable or movable Multi2Delaf(const Multi2Delaf&) UNITEX_EQ_DELETE; Multi2Delaf(Multi2Delaf&&) UNITEX_EQ_DELETE; @@ -81,7 +82,6 @@ class Multi2Delaf { void parse_config_file(); void translate_multidelaf_to_delaf(const unichar* inflected_input, unichar* buffer) const; - ~Multi2Delaf(); private: void load_config_file(U_FILE* config_file); From 4f1f6cc7d0d7585e5491b3bdcf6ebdc6c13fa60e Mon Sep 17 00:00:00 2001 From: Sofian El Guetibi Date: Mon, 1 Aug 2022 11:20:57 +0200 Subject: [PATCH 15/24] Replace nullptr by NULL --- src/Multi2Delaf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Multi2Delaf.h b/src/Multi2Delaf.h index 49849762..299098ab 100644 --- a/src/Multi2Delaf.h +++ b/src/Multi2Delaf.h @@ -93,7 +93,7 @@ class Multi2Delaf { unichar* retrieve_inflectional_codes(struct list_pointer* delaf_tags) const; const VersatileEncodingConfig _vec = VEC_DEFAULT; - struct list_pointer* _config_lines = nullptr; + struct list_pointer* _config_lines = NULL; const char* _config_filename; }; From cead070b8cc0d1a8305b0830ef3fd24fe7c3f42c Mon Sep 17 00:00:00 2001 From: Sofian El Guetibi Date: Mon, 1 
Aug 2022 11:25:15 +0200 Subject: [PATCH 16/24] Remove constexpr keyword --- src/Multi2Delaf.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Multi2Delaf.h b/src/Multi2Delaf.h index 299098ab..fb5bd67d 100644 --- a/src/Multi2Delaf.h +++ b/src/Multi2Delaf.h @@ -66,11 +66,11 @@ struct ConfigLine { */ class Multi2Delaf { public: - static constexpr const char* const COMMA_COPY = ",copy"; - static constexpr const char* const DOT_COPY = ".copy"; - static constexpr const char* const PLUS_COPY = "+copy"; - static constexpr const char* const COLUMN_COPY = ":copy"; - static constexpr int NOT_SPECIFIED = -1; // nb_required_tag default value + static const char* const COMMA_COPY = ",copy"; + static const char* const DOT_COPY = ".copy"; + static const char* const PLUS_COPY = "+copy"; + static const char* const COLUMN_COPY = ":copy"; + static const int NOT_SPECIFIED = -1; // nb_required_tag default value Multi2Delaf(const char* config_filename); ~Multi2Delaf(); From 8ab727f40a35ebfcefc8a9590b2167a8f2cbd466 Mon Sep 17 00:00:00 2001 From: Sofian El Guetibi Date: Mon, 1 Aug 2022 11:37:54 +0200 Subject: [PATCH 17/24] Move copy command outside of the class --- src/Multi2Delaf.cpp | 46 ++++++++++++++++++++++++--------------------- src/Multi2Delaf.h | 6 +----- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/src/Multi2Delaf.cpp b/src/Multi2Delaf.cpp index bf5274c2..30790099 100644 --- a/src/Multi2Delaf.cpp +++ b/src/Multi2Delaf.cpp @@ -38,6 +38,11 @@ namespace unitex { +const char* COMMA_COPY = ",copy"; +const char* DOT_COPY = ".copy"; +const char* PLUS_COPY = "+copy"; +const char* COLUMN_COPY = ":copy"; + /** * Set *str to the next no blank character. * Assumes that *str ended with '\0'. 
@@ -64,16 +69,16 @@ unichar* tokenize_lemma(unichar** ptr, const char* config_filename) { unichar* line = *ptr; // try to read ,,copy if (line[0] == ',' && line[1] == ',') { - if (!u_starts_with(line + 1, Multi2Delaf::COMMA_COPY)) { + if (!u_starts_with(line + 1, COMMA_COPY)) { fatal_error("Double ',' in file: %s, line: '%S'\n", config_filename, line); } - *ptr = line + 1 + strlen(Multi2Delaf::COMMA_COPY); + *ptr = line + 1 + strlen(COMMA_COPY); if (**ptr != '\0' && **ptr != '.' && **ptr != '+' && **ptr != ':') { fatal_error("Double ',' in file: %s, line: '%S'\n", config_filename, line); } - return u_strdup(Multi2Delaf::COMMA_COPY); + return u_strdup(COMMA_COPY); } int i = 1; while (line[i] != '\0' && line[i] != '.' && line[i] != '+' && @@ -93,16 +98,16 @@ unichar* tokenize_part_of_speech(unichar** ptr, const char* config_filename) { unichar* line = *ptr; // try to read ..copy if (line[0] == '.' && line[1] == '.') { - if (!u_starts_with(line + 1, Multi2Delaf::DOT_COPY)) { + if (!u_starts_with(line + 1, DOT_COPY)) { fatal_error("Double '.' in file: %s, line: '%S'\n", config_filename, line); } - *ptr = line + 1 + strlen(Multi2Delaf::DOT_COPY); + *ptr = line + 1 + strlen(DOT_COPY); if (**ptr != '\0' && **ptr != '+' && **ptr != ':') { fatal_error("Double '.' 
in file: %s, line: '%S'\n", config_filename, line); } - return u_strdup(Multi2Delaf::DOT_COPY); + return u_strdup(DOT_COPY); } int i = 1; while (line[i] != '\0' && line[i] != '+' && line[i] != ':') { @@ -122,16 +127,16 @@ unichar* tokenize_one_semantic_code(unichar** ptr, unichar* line = *ptr; // try to read ++copy if (line[0] == '+' && line[1] == '+') { - if (!u_starts_with(line + 1, Multi2Delaf::PLUS_COPY)) { + if (!u_starts_with(line + 1, PLUS_COPY)) { fatal_error("Double '+' in file: %s, line: '%S'\n", config_filename, line); } - *ptr = line + 1 + strlen(Multi2Delaf::PLUS_COPY); + *ptr = line + 1 + strlen(PLUS_COPY); if (**ptr != '\0' && **ptr != '+' && **ptr != ':') { fatal_error("Double '+' in file: %s, line: '%S'\n", config_filename, line); } - return u_strdup(Multi2Delaf::PLUS_COPY); + return u_strdup(PLUS_COPY); } int i = 1; while (line[i] != '\0' && line[i] != '+' && line[i] != ':') { @@ -173,16 +178,16 @@ unichar* tokenize_one_inflectional_code(unichar** ptr, unichar* line = *ptr; // try to read ::copy if (line[0] == ':' && line[1] == ':') { - if (!u_starts_with(line + 1, Multi2Delaf::COLUMN_COPY)) { + if (!u_starts_with(line + 1, COLUMN_COPY)) { fatal_error("Double ':' in file: %s, line: '%S'\n", config_filename, line); } - *ptr = line + 1 + strlen(Multi2Delaf::COLUMN_COPY); + *ptr = line + 1 + strlen(COLUMN_COPY); if (**ptr != '\0' && **ptr != ':') { fatal_error("Double ':' in file: %s, line: '%S'\n", config_filename, line); } - return u_strdup(Multi2Delaf::COLUMN_COPY); + return u_strdup(COLUMN_COPY); } int i = 1; while (line[i] != '\0' && line[i] != ':') { @@ -393,14 +398,14 @@ struct ConfigLine* tokenize_config_line(unichar* line, // build config command struct ConfigCommand* config_command = tokenize_config_command(nextNoEmptyUnichar, config_filename); - if (u_strcmp(config_command->lemma, Multi2Delaf::COMMA_COPY) == 0 && + if (u_strcmp(config_command->lemma, COMMA_COPY) == 0 && nb_required_tag != Multi2Delaf::NOT_SPECIFIED && nb_required_tag != 1) 
{ fatal_error( "Command ,,copy is incompatible with an integer enclosed in curly " "braces, except for {1} in file: %s, line: '%S'\n", config_filename, line); } - if (u_strcmp(config_command->part_of_speech, Multi2Delaf::DOT_COPY) == 0 && + if (u_strcmp(config_command->part_of_speech, DOT_COPY) == 0 && nb_required_tag != Multi2Delaf::NOT_SPECIFIED && nb_required_tag != 1) { fatal_error( "Command ..copy is incompatible with an integer enclosed in curly " @@ -557,7 +562,7 @@ struct list_ustring* clone_and_replace_copy_command( struct list_ustring* res = NULL; while (inflectional_command != NULL) { - if (u_strcmp(inflectional_command->string, Multi2Delaf::COLUMN_COPY) != 0) { + if (u_strcmp(inflectional_command->string, COLUMN_COPY) != 0) { res = sorted_insert(inflectional_command->string, res); } else { for (int i = 0; i < tag->n_inflectional_codes; i++) { @@ -729,8 +734,7 @@ unichar* Multi2Delaf::retrieve_lemma(struct list_pointer* delaf_tags, while (delaf_tags_ptr != NULL) { tag = (struct dela_entry*)delaf_tags_ptr->pointer; if (is_entry_compatible_with_pattern(tag, current_line->pattern)) { - if (u_strcmp(current_line->config_command->lemma, - Multi2Delaf::COMMA_COPY) == 0) { + if (u_strcmp(current_line->config_command->lemma, COMMA_COPY) == 0) { if (nb_delaf_tag_that_match_pattern(delaf_tags, current_line->pattern) != 1) { fatal_error( @@ -785,8 +789,8 @@ unichar* Multi2Delaf::retrieve_part_of_speech( // for (const auto& tag : delaf_tags) { tag = (struct dela_entry*)delaf_tag_ptr->pointer; if (is_entry_compatible_with_pattern(tag, current_line->pattern)) { - if (u_strcmp(current_line->config_command->part_of_speech, - Multi2Delaf::DOT_COPY) == 0) { + if (u_strcmp(current_line->config_command->part_of_speech, DOT_COPY) == + 0) { if (nb_delaf_tag_that_match_pattern(delaf_tags, current_line->pattern) != 1) { fatal_error( @@ -849,7 +853,7 @@ unichar* Multi2Delaf::retrieve_semantic_codes( if (line->nb_required_tag == Multi2Delaf::NOT_SPECIFIED || line->nb_required_tag 
== nb_delaf_tag_that_match_pattern(delaf_tags, line->pattern)) { - if (u_strcmp(ptr_command->string, Multi2Delaf::PLUS_COPY) == 0) { + if (u_strcmp(ptr_command->string, PLUS_COPY) == 0) { for (int i = 1; i < tag->n_semantic_codes; i++) { // begin at 1 to skip the grammatical catergory codes = sorted_insert(tag->semantic_codes[i], codes); @@ -919,7 +923,7 @@ unichar* Multi2Delaf::retrieve_inflectional_codes( if (line->nb_required_tag == Multi2Delaf::NOT_SPECIFIED || line->nb_required_tag == nb_delaf_tag_that_match_pattern(delaf_tags, line->pattern)) { - if (is_in_list(Multi2Delaf::COLUMN_COPY, ptr_command)) { + if (is_in_list(COLUMN_COPY, ptr_command)) { struct list_ustring* tmp_lst = clone_and_replace_copy_command(ptr_command, tag); if (tmp_lst == NULL) { diff --git a/src/Multi2Delaf.h b/src/Multi2Delaf.h index fb5bd67d..bd060e6a 100644 --- a/src/Multi2Delaf.h +++ b/src/Multi2Delaf.h @@ -66,11 +66,7 @@ struct ConfigLine { */ class Multi2Delaf { public: - static const char* const COMMA_COPY = ",copy"; - static const char* const DOT_COPY = ".copy"; - static const char* const PLUS_COPY = "+copy"; - static const char* const COLUMN_COPY = ":copy"; - static const int NOT_SPECIFIED = -1; // nb_required_tag default value + static const int NOT_SPECIFIED = -1; // nb_required_tag default value Multi2Delaf(const char* config_filename); ~Multi2Delaf(); From 85f10f0e0408615841d8701c8ee86fed3d1ee326 Mon Sep 17 00:00:00 2001 From: Sofian El Guetibi Date: Thu, 8 Sep 2022 14:10:04 +0200 Subject: [PATCH 18/24] Replaces copy constructor and operator= by macro --- src/Multi2Delaf.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/Multi2Delaf.h b/src/Multi2Delaf.h index bd060e6a..44272491 100644 --- a/src/Multi2Delaf.h +++ b/src/Multi2Delaf.h @@ -71,10 +71,7 @@ class Multi2Delaf { Multi2Delaf(const char* config_filename); ~Multi2Delaf(); // Not copyable or movable - Multi2Delaf(const Multi2Delaf&) UNITEX_EQ_DELETE; - Multi2Delaf(Multi2Delaf&&) UNITEX_EQ_DELETE; 
- Multi2Delaf& operator=(const Multi2Delaf& other) UNITEX_EQ_DELETE; - Multi2Delaf& operator=(Multi2Delaf&& other) UNITEX_EQ_DELETE; + UNITEX_DISALLOW_COPY_AND_ASSIGN(Multi2Delaf); void parse_config_file(); void translate_multidelaf_to_delaf(const unichar* inflected_input, unichar* buffer) const; From 77b6a38d542bfbbec741bdedb0497b56dffeff7f Mon Sep 17 00:00:00 2001 From: Sofian El Guetibi Date: Thu, 8 Sep 2022 14:13:37 +0200 Subject: [PATCH 19/24] Place * in right place --- src/Fst2List.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Fst2List.cpp b/src/Fst2List.cpp index 84320aa8..d40b5a67 100644 --- a/src/Fst2List.cpp +++ b/src/Fst2List.cpp @@ -300,7 +300,7 @@ class CFstApp { struct hash_table* path_to_stop; /* a hash table to know all the Fst2Tag whose path exploration must be interrupted */ struct hash_table* dela_entries; /* a hash table to get the dela_entries of created boxes when lexical masks are processed */ bool compileToDelaf = false; - Multi2Delaf *multi2Delaf = NULL; + Multi2Delaf* multi2Delaf = NULL; bool isMorphological = false; bool makeDic = false; @@ -2934,7 +2934,7 @@ const struct option_TS lopts_Fst2List[]= { int main_Fst2List(int argc, char* const argv[]) { char* ofilename = NULL; char morpho_dic[1025] = ""; - char *config_file_name = NULL; + char* config_file_name = NULL; unichar changeStrTo[16][MAX_CHANGE_SYMBOL_SIZE]; int changeStrToIdx; From dddb8ac4e4edb1c63e703201dab0d9a1a12e3e43 Mon Sep 17 00:00:00 2001 From: Sofian El Guetibi Date: Thu, 8 Sep 2022 14:16:36 +0200 Subject: [PATCH 20/24] Add Multi2Delaf.cpp in Unitex4_vs2019.* --- src/build/Unitex4_vs2019.vcxproj | 1 + src/build/Unitex4_vs2019.vcxproj.filters | 3 +++ 2 files changed, 4 insertions(+) diff --git a/src/build/Unitex4_vs2019.vcxproj b/src/build/Unitex4_vs2019.vcxproj index d7a85315..e3de70c2 100644 --- a/src/build/Unitex4_vs2019.vcxproj +++ b/src/build/Unitex4_vs2019.vcxproj @@ -522,6 +522,7 @@ + diff --git 
a/src/build/Unitex4_vs2019.vcxproj.filters b/src/build/Unitex4_vs2019.vcxproj.filters index 8fa70b62..f4665251 100644 --- a/src/build/Unitex4_vs2019.vcxproj.filters +++ b/src/build/Unitex4_vs2019.vcxproj.filters @@ -553,6 +553,9 @@ Source Files + + Source Files + Source Files From 075a8943b71619ce9161c2e8435604660d548eeb Mon Sep 17 00:00:00 2001 From: Sofian El Guetibi Date: Thu, 8 Sep 2022 14:19:42 +0200 Subject: [PATCH 21/24] Corrects the order in which the configuration lines are added. --- src/Multi2Delaf.cpp | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/src/Multi2Delaf.cpp b/src/Multi2Delaf.cpp index 30790099..a4576da9 100644 --- a/src/Multi2Delaf.cpp +++ b/src/Multi2Delaf.cpp @@ -695,18 +695,36 @@ void Multi2Delaf::translate_multidelaf_to_delaf(const unichar* inflected_input, void Multi2Delaf::load_config_file(U_FILE* config_file) { unichar line[INPUTSIZEBUFFER] = {0}; int eof = 0; + struct list_pointer* lines_index = NULL; while (EOF != (eof = read_line_config_file(config_file, line, INPUTSIZEBUFFER))) { struct ConfigLine* config_line = tokenize_config_line(line, filename_without_path(_config_filename)); if (config_line != NULL) { - _config_lines = new_list_pointer(config_line, _config_lines); + if(_config_lines == NULL) { + _config_lines = new_list_pointer(config_line, NULL); + } else { + lines_index = _config_lines; + while(lines_index->next != NULL) { + lines_index = lines_index->next; + } + lines_index->next = new_list_pointer(config_line, NULL); + } } } // the last line is potentially a config line struct ConfigLine* config_line = tokenize_config_line(line, filename_without_path(_config_filename)); if (config_line != NULL) { + if (_config_lines == NULL) { + _config_lines = new_list_pointer(config_line, _config_lines); + }else { + lines_index = _config_lines; + while(lines_index->next != NULL) { + lines_index = lines_index->next; + } + lines_index->next = new_list_pointer(config_line, NULL); + } _config_lines = 
new_list_pointer(config_line, _config_lines); } } From 66ff283cc100f7bbaf0e2538d6bd770f0c45f11b Mon Sep 17 00:00:00 2001 From: Sofian El Guetibi Date: Thu, 8 Sep 2022 20:19:51 +0200 Subject: [PATCH 22/24] Fix memory leak --- src/Multi2Delaf.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/Multi2Delaf.cpp b/src/Multi2Delaf.cpp index a4576da9..facb137a 100644 --- a/src/Multi2Delaf.cpp +++ b/src/Multi2Delaf.cpp @@ -416,10 +416,13 @@ struct ConfigLine* tokenize_config_line(unichar* line, } void free_config_line(void* void_line) { + if(void_line == NULL) { + return; + } struct ConfigLine* line = (struct ConfigLine*)void_line; free_pattern(line->pattern); - free(line); free_config_command(line->config_command); + free(line); } /** @@ -717,7 +720,7 @@ void Multi2Delaf::load_config_file(U_FILE* config_file) { tokenize_config_line(line, filename_without_path(_config_filename)); if (config_line != NULL) { if (_config_lines == NULL) { - _config_lines = new_list_pointer(config_line, _config_lines); + _config_lines = new_list_pointer(config_line, NULL); }else { lines_index = _config_lines; while(lines_index->next != NULL) { @@ -725,7 +728,6 @@ void Multi2Delaf::load_config_file(U_FILE* config_file) { } lines_index->next = new_list_pointer(config_line, NULL); } - _config_lines = new_list_pointer(config_line, _config_lines); } } From e193484f18ef1673f65cf8edaf077e24bda5ee6c Mon Sep 17 00:00:00 2001 From: Sofian El Guetibi Date: Thu, 8 Sep 2022 20:25:22 +0200 Subject: [PATCH 23/24] Remove unnecessary comment --- src/Multi2Delaf.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/Multi2Delaf.cpp b/src/Multi2Delaf.cpp index facb137a..19ba92a5 100644 --- a/src/Multi2Delaf.cpp +++ b/src/Multi2Delaf.cpp @@ -806,7 +806,6 @@ unichar* Multi2Delaf::retrieve_part_of_speech( } delaf_tag_ptr = delaf_tags; while (delaf_tag_ptr != NULL) { - // for (const auto& tag : delaf_tags) { tag = (struct dela_entry*)delaf_tag_ptr->pointer; if 
(is_entry_compatible_with_pattern(tag, current_line->pattern)) { if (u_strcmp(current_line->config_command->part_of_speech, DOT_COPY) == @@ -858,7 +857,6 @@ unichar* Multi2Delaf::retrieve_semantic_codes( struct dela_entry* tag = NULL; while (delaf_tag_ptr != NULL) { - // for (const auto& tag : delaf_tags) { tag = (struct dela_entry*)delaf_tag_ptr->pointer; config_lines_ptr = _config_lines; while (config_lines_ptr != NULL) { @@ -927,7 +925,6 @@ unichar* Multi2Delaf::retrieve_inflectional_codes( struct dela_entry* tag = NULL; while (delaf_tag_ptr != NULL) { - // for (const auto& tag : delaf_tags) { tag = (struct dela_entry*)delaf_tag_ptr->pointer; line_ptr = _config_lines; while (line_ptr != NULL) { From 4f54caecc23ac2639af97ec09a97ca90128658c8 Mon Sep 17 00:00:00 2001 From: Sofian El Guetibi Date: Fri, 9 Sep 2022 16:44:36 +0200 Subject: [PATCH 24/24] Ignores characters between delaf tags --- src/Multi2Delaf.cpp | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/src/Multi2Delaf.cpp b/src/Multi2Delaf.cpp index 19ba92a5..7ee3b585 100644 --- a/src/Multi2Delaf.cpp +++ b/src/Multi2Delaf.cpp @@ -60,6 +60,23 @@ int advance_to_next_no_blank_char(unichar** str) { return 0; } +/** + * Set *str to the next delaf tag. + * Assumes that *str ended with '\0'. + * Returns 1 if *str does not contain any more delaf tag, otherwise returns 0. + */ +int advance_to_next_delaf_tag(unichar** str) { + int i = 0; + while ((*str)[i] != '\0' && (*str)[i] != '{') { + i++; + } + if ((*str)[i] == '\0') { + return 1; + } + *str = *str + i; + return 0; +} + /** * Return a new allocated unicode string describing the lemma. * Set *ptr to the next unread character. 
@@ -433,26 +450,22 @@ void free_config_line(void* void_line) { */ struct dela_entry* tokenize_delaf_tag(unichar** ptr) { unichar* line = *ptr; - unichar* next_no_blank_char = line; - if (advance_to_next_no_blank_char(&next_no_blank_char)) { + unichar* next_delaf_tag = line; + if (advance_to_next_delaf_tag(&next_delaf_tag)) { return NULL; // end of the line, no more dela_entry } - if (next_no_blank_char[0] != '{') { - fatal_error("Delaf tag must be enclosed in curly braces, line: '%S'\n", - line); - } - int i = 1; - while (next_no_blank_char[i] != '\0' && next_no_blank_char[i] != '}') { + int i = 1; // 1 to skip opening brace '{' + while (next_delaf_tag[i] != '\0' && next_delaf_tag[i] != '}') { i++; } - if (next_no_blank_char[i] == '\0') { - fatal_error("Delaf tag must be enclosed in curly braces, line: '%S'\n", + if (next_delaf_tag[i] == '\0') { + fatal_error("Unclosed curly brace: Delaf tag must be enclosed in curly braces, line: '%S'\n", line); } - unichar* token_dela_entry = u_strndup(next_no_blank_char + 1, i - 1); + unichar* token_dela_entry = u_strndup(next_delaf_tag + 1, i - 1); struct dela_entry* tag = tokenize_DELAF_line(token_dela_entry); free(token_dela_entry); - *ptr = next_no_blank_char + i + 1; // + 1 to skip closing brace '}' + *ptr = next_delaf_tag + i + 1; // + 1 to skip closing brace '}' return tag; }