From 9d9399186e321a547d6cc4cad34fc5ff4d3af5ac Mon Sep 17 00:00:00 2001 From: Songkan Tang Date: Fri, 24 Jan 2025 12:05:47 +0800 Subject: [PATCH 1/6] Improved patterns command with new algorithm Signed-off-by: Songkan Tang --- .../sql/common/patterns/BrainLogParser.java | 389 ++++++++++++++++++ .../org/opensearch/sql/analysis/Analyzer.java | 28 ++ .../sql/ast/AbstractNodeVisitor.java | 5 + .../org/opensearch/sql/ast/dsl/AstDSL.java | 18 + .../sql/ast/expression/ParseMethod.java | 3 +- .../sql/ast/expression/PatternMethod.java | 20 + .../org/opensearch/sql/ast/tree/Pattern.java | 53 +++ .../org/opensearch/sql/expression/DSL.java | 8 + .../function/BuiltinFunctionName.java | 3 + .../expression/parse/PatternsExpression.java | 2 +- .../expression/window/WindowFunctions.java | 26 ++ .../frame/BufferPatternRowsWindowFrame.java | 62 +++ .../window/frame/PeerRowsWindowFrame.java | 24 +- .../frame/StreamPatternRowWindowFrame.java | 26 ++ .../patterns/BufferPatternWindowFunction.java | 88 ++++ .../patterns/StreamPatternWindowFunction.java | 80 ++++ .../org/opensearch/sql/utils/ParseUtils.java | 11 +- .../opensearch/sql/analysis/AnalyzerTest.java | 52 --- .../parse/PatternsExpressionTest.java | 89 ---- ppl/src/main/antlr/OpenSearchPPLLexer.g4 | 5 + ppl/src/main/antlr/OpenSearchPPLParser.g4 | 17 +- .../opensearch/sql/ppl/parser/AstBuilder.java | 29 +- .../sql/ppl/parser/AstBuilderTest.java | 27 -- 23 files changed, 867 insertions(+), 198 deletions(-) create mode 100644 common/src/main/java/org/opensearch/sql/common/patterns/BrainLogParser.java create mode 100644 core/src/main/java/org/opensearch/sql/ast/expression/PatternMethod.java create mode 100644 core/src/main/java/org/opensearch/sql/ast/tree/Pattern.java create mode 100644 core/src/main/java/org/opensearch/sql/expression/window/frame/BufferPatternRowsWindowFrame.java create mode 100644 core/src/main/java/org/opensearch/sql/expression/window/frame/StreamPatternRowWindowFrame.java create mode 100644 core/src/main/java/org/opensearch/sql/expression/window/patterns/BufferPatternWindowFunction.java create mode 100644 core/src/main/java/org/opensearch/sql/expression/window/patterns/StreamPatternWindowFunction.java delete mode 100644 core/src/test/java/org/opensearch/sql/expression/parse/PatternsExpressionTest.java diff --git a/common/src/main/java/org/opensearch/sql/common/patterns/BrainLogParser.java b/common/src/main/java/org/opensearch/sql/common/patterns/BrainLogParser.java new file mode 100644 index 0000000000..dfa0d17779 --- /dev/null +++ b/common/src/main/java/org/opensearch/sql/common/patterns/BrainLogParser.java @@ -0,0 +1,389 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.common.patterns; + +import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.OptionalLong; +import java.util.Set; +import java.util.regex.Pattern; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** Log parser Brain algorithm implementation. See: https://ieeexplore.ieee.org/document/10109145 */ +public class BrainLogParser { + + private static final String VARIABLE_DENOTER = "<*>"; + private static final Map DEFAULT_FILTER_PATTERN_VARIABLE_MAP = + new LinkedHashMap<>(); + + static { + // IP + DEFAULT_FILTER_PATTERN_VARIABLE_MAP.put( + Pattern.compile("(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)"), "<*IP*>"); + // Simple ISO date and time + DEFAULT_FILTER_PATTERN_VARIABLE_MAP.put( + Pattern.compile( + "(\\d{4}-\\d{2}-\\d{2})[T" + + " ]?(\\d{2}:\\d{2}:\\d{2})(\\.\\d{3})?(Z|([+-]\\d{2}:?\\d{2}))?"), + "<*DATETIME*>"); + // Hex Decimal, letters followed by digits, float numbers + DEFAULT_FILTER_PATTERN_VARIABLE_MAP.put( + Pattern.compile("((0x|0X)[0-9a-fA-F]+)|[a-zA-Z]+\\d+|([+-]?(\\d+(\\.\\d*)?|\\.\\d+))"), + VARIABLE_DENOTER); + // generic number surrounded by non-alphanumeric + DEFAULT_FILTER_PATTERN_VARIABLE_MAP.put( + Pattern.compile("(?<=[^A-Za-z0-9])(-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$"), VARIABLE_DENOTER); + } + + private static final List DEFAULT_DELIMITERS = List.of(",", "+"); + // counting frequency will be grouped by composite of position and token string + private static final String POSITIONED_TOKEN_KEY_FORMAT = "%d-%s"; + // Token set will be grouped by composite of tokens length per log message, word combination + // candidate and token position. + private static final String GROUP_TOKEN_SET_KEY_FORMAT = "%d-%s-%d"; + // By default, algorithm treats more than 2 different tokens in the group per position as variable + // token + private static final int DEFAULT_VARIABLE_COUNT_THRESHOLD = 2; + /* + * By default, algorithm treats the longest word combinations as the group root, no matter what its frequency is. + * Otherwise, the longest word combination will be selected when frequency >= highest frequency of log * threshold percentage + */ + private static final float DEFAULT_FREQUENCY_THRESHOLD_PERCENTAGE = 0.0f; + + private final Map tokenFreqMap; + private final Map> groupTokenSetMap; + private final Map logIdGroupCandidateMap; + private final int variableCountThreshold; + private final float thresholdPercentage; + private final Map filterPatternVariableMap; + private final List delimiters; + + /** Creates new Brain log parser with default parameters */ + public BrainLogParser() { + this( + DEFAULT_VARIABLE_COUNT_THRESHOLD, + DEFAULT_FREQUENCY_THRESHOLD_PERCENTAGE, + DEFAULT_FILTER_PATTERN_VARIABLE_MAP, + DEFAULT_DELIMITERS); + } + + /** + * Creates new Brain log parser with overridden variableCountThreshold and thresholdPercentage + * + * @param variableCountThreshold the threshold to decide whether low frequency token is variable + * @param thresholdPercentage the threshold percentage to decide which frequency is representative + * frequency per log message + */ + public BrainLogParser(int variableCountThreshold, float thresholdPercentage) { + this( + variableCountThreshold, + thresholdPercentage, + DEFAULT_FILTER_PATTERN_VARIABLE_MAP, + DEFAULT_DELIMITERS); + } + + /** + * Creates new Brain log parser with overridden filter patterns and delimiters + * + * @param filterPatternVariableMap a map of regex patterns to variable denoter, with which the + * matched pattern will be replaced, recommend to use LinkedHashMap to make sure patterns in + * order + * @param delimiters a list of delimiters to be replaced with empty string after regex replacement + */ + public BrainLogParser(Map filterPatternVariableMap, List delimiters) { + this( + DEFAULT_VARIABLE_COUNT_THRESHOLD, + DEFAULT_FREQUENCY_THRESHOLD_PERCENTAGE, + filterPatternVariableMap, + delimiters); + } + + /** + * Creates new Brain log parser with overridden variableCountThreshold and thresholdPercentage and + * overridden filter patterns and delimiters + * + * @param variableCountThreshold the threshold to decide whether low frequency token is variable + * @param thresholdPercentage the threshold percentage to decide which frequency is representative + * frequency per log message + * @param filterPatternVariableMap a map of regex patterns to variable denoter, with which the + * matched pattern will be replaced, recommend to use LinkedHashMap to make sure patterns in + * order + * @param delimiters a list of delimiters to be replaced with empty string after regex replacement + */ + public BrainLogParser( + int variableCountThreshold, + float thresholdPercentage, + Map filterPatternVariableMap, + List delimiters) { + if (thresholdPercentage < 0.0f || thresholdPercentage > 1.0f) { + throw new IllegalArgumentException("Threshold percentage must be between 0.0 and 1.0"); + } + this.tokenFreqMap = new HashMap<>(); + this.groupTokenSetMap = new HashMap<>(); + this.logIdGroupCandidateMap = new HashMap<>(); + this.variableCountThreshold = variableCountThreshold; + this.thresholdPercentage = thresholdPercentage; + this.filterPatternVariableMap = filterPatternVariableMap; + this.delimiters = delimiters; + } + + /** + * Preprocess single line of log message with logId + * + * @param logMessage log message body per log + * @param logId logId of the log + * @return list of tokens by splitting preprocessed log message + */ + public List preprocess(String logMessage, String logId) { + if (logMessage == null || logId == null) { + throw new IllegalArgumentException("log message or logId must not be null"); + } + // match regex and replace it with variable denoter in order + for (Map.Entry patternVariablePair : filterPatternVariableMap.entrySet()) { + logMessage = + patternVariablePair + .getKey() + .matcher(logMessage) + .replaceAll(patternVariablePair.getValue()); + } + + for (String delimiter : delimiters) { + logMessage = logMessage.replace(delimiter, " "); + } + + // Append logId/docId to the end of the split tokens + logMessage = logMessage.trim() + " " + logId; + + return Arrays.asList(logMessage.split("\\s+")); + } + + /** + * Count token frequency per position/index in the token list + * + * @param tokens list of tokens from preprocessed log message + */ + public void processTokenHistogram(List tokens) { + // Ignore last element since it's designed to be appended logId + for (int i = 0; i < tokens.size() - 1; i++) { + String tokenKey = String.format(Locale.ROOT, POSITIONED_TOKEN_KEY_FORMAT, i, tokens.get(i)); + tokenFreqMap.compute(tokenKey, (k, v) -> v == null ? 1 : v + 1); + } + } + + /** + * Preprocess all lines of log messages with logId list. Empty logId list is allowed as the index + * within the list will be logId by default + * + * @param logMessages list of log messages + * @return list of token lists + */ + public List> preprocessAllLogs(List logMessages) { + List> preprocessedLogs = new ArrayList<>(); + + for (int i = 0; i < logMessages.size(); i++) { + String logId = String.valueOf(i); + List tokens = this.preprocess(logMessages.get(i), logId); + preprocessedLogs.add(tokens); + this.processTokenHistogram(tokens); + } + + this.calculateGroupTokenFreq(preprocessedLogs); + + return preprocessedLogs; + } + + /** + * The second process step to calculate initial groups of tokens based on previous token + * histogram. The group will be represented by the representative word combination of the log + * message. The word combination usually selects the longest word combination with the same + * frequency that should be above designed threshold. + * + *

Within initial group, new group level token set per position is counted for final log + * pattern calculation + * + * @param preprocessedLogs preprocessed list of log messages + */ + private void calculateGroupTokenFreq(List> preprocessedLogs) { + for (List tokens : preprocessedLogs) { + Map wordOccurrences = this.getWordOccurrences(tokens); + List sortedWordCombinations = + wordOccurrences.entrySet().stream() + .map(entry -> new WordCombination(entry.getKey(), entry.getValue())) + .sorted() + .toList(); + WordCombination candidate = this.findCandidate(sortedWordCombinations); + String groupCandidateStr = + String.format(Locale.ROOT, "%d,%d", candidate.wordFreq(), candidate.sameFreqCount()); + this.logIdGroupCandidateMap.put(tokens.getLast(), groupCandidateStr); + this.updateGroupTokenFreqMap(tokens, groupCandidateStr); + } + } + + /** + * Parse single line of log pattern after preprocess - processTokenHistogram - + * calculateGroupTokenFreq + * + * @param tokens list of tokens for a specific log message + * @return parsed log pattern that is a list of string + */ + public List parseLogPattern(List tokens) { + String logId = tokens.getLast(); + String groupCandidateStr = this.logIdGroupCandidateMap.get(logId); + String[] groupCandidate = groupCandidateStr.split(","); + Long repFreq = Long.parseLong(groupCandidate[0]); // representative frequency of the group + return IntStream.range(0, tokens.size() - 1) + .mapToObj(i -> new AbstractMap.SimpleEntry<>(i, tokens.get(i))) + .map( + entry -> { + int index = entry.getKey(); + String token = entry.getValue(); + String tokenKey = + String.format(Locale.ROOT, POSITIONED_TOKEN_KEY_FORMAT, index, token); + assert this.tokenFreqMap.get(tokenKey) != null + : String.format(Locale.ROOT, "Not found token: %s on position %d", token, index); + + boolean isHigherFrequency = this.tokenFreqMap.get(tokenKey) > repFreq; + boolean isLowerFrequency = this.tokenFreqMap.get(tokenKey) < repFreq; + String groupTokenKey = + String.format( + Locale.ROOT, + GROUP_TOKEN_SET_KEY_FORMAT, + tokens.size() - 1, + groupCandidateStr, + index); + assert this.groupTokenSetMap.get(groupTokenKey) != null + : String.format(Locale.ROOT, "Not found any token in group: %s", groupTokenKey); + + if (isHigherFrequency) { + // For higher frequency token that doesn't belong to word combination, it's likely + // to be constant token only if + // it's unique token on that position within the group + boolean isUniqueToken = this.groupTokenSetMap.get(groupTokenKey).size() == 1; + if (!isUniqueToken) { + return VARIABLE_DENOTER; + } + } else if (isLowerFrequency) { + // For lower frequency token that doesn't belong to word combination, it's likely to + // be constant token only if + // it doesn't exceed the preset variable count threshold. For example, some variable + // are limited number of enums, + // and sometimes they could be treated as constant tokens. + if (this.groupTokenSetMap.get(groupTokenKey).size() >= variableCountThreshold) { + return VARIABLE_DENOTER; + } + } + return token; + }) + .collect(Collectors.toList()); + } + + /** + * Parse all lines of log messages to generate the log pattern map. + * + * @param logMessages all lines of log messages + * @return log pattern map with log pattern string as key, grouped logIds as value + */ + public Map> parseAllLogPatterns(List logMessages) { + List> processedMessages = this.preprocessAllLogs(logMessages); + + Map> logPatternMap = new HashMap<>(); + for (List processedMessage : processedMessages) { + String logId = processedMessage.getLast(); + List logPattern = this.parseLogPattern(processedMessage); + String patternKey = String.join(" ", logPattern); + logPatternMap.computeIfAbsent(patternKey, k -> new ArrayList<>()).add(logId); + } + return logPatternMap; + } + + /** + * Get token histogram + * + * @return map of token per position key and its frequency + */ + public Map getTokenFreqMap() { + return this.tokenFreqMap; + } + + /** + * Get group per length per position to its token set map + * + * @return map of pattern group per length per position key and its token set + */ + public Map> getGroupTokenSetMap() { + return this.groupTokenSetMap; + } + + /** + * Get logId to its group candidate map + * + * @return map of logId and group candidate + */ + public Map getLogIdGroupCandidateMap() { + return this.logIdGroupCandidateMap; + } + + private Map getWordOccurrences(List tokens) { + Map occurrences = new HashMap<>(); + for (int i = 0; i < tokens.size() - 1; i++) { + String tokenKey = String.format(Locale.ROOT, POSITIONED_TOKEN_KEY_FORMAT, i, tokens.get(i)); + Long tokenFreq = tokenFreqMap.get(tokenKey); + occurrences.compute(tokenFreq, (k, v) -> v == null ? 1 : v + 1); + } + return occurrences; + } + + private WordCombination findCandidate(List sortedWordCombinations) { + if (sortedWordCombinations.isEmpty()) { + throw new IllegalArgumentException("Sorted word combinations must be non empty"); + } + OptionalLong maxFreqOptional = + sortedWordCombinations.stream().mapToLong(WordCombination::wordFreq).max(); + long maxFreq = maxFreqOptional.getAsLong(); + float threshold = maxFreq * this.thresholdPercentage; + for (WordCombination wordCombination : sortedWordCombinations) { + if (wordCombination.wordFreq() > threshold) { + return wordCombination; + } + } + return sortedWordCombinations.getFirst(); + } + + private void updateGroupTokenFreqMap(List tokens, String groupCandidateStr) { + int tokensLen = tokens.size() - 1; + for (int i = 0; i < tokensLen; i++) { + String groupTokenFreqKey = + String.format(Locale.ROOT, GROUP_TOKEN_SET_KEY_FORMAT, tokensLen, groupCandidateStr, i); + this.groupTokenSetMap + .computeIfAbsent(groupTokenFreqKey, k -> new HashSet<>()) + .add(tokens.get(i)); + } + } + + private record WordCombination(Long wordFreq, Integer sameFreqCount) + implements Comparable { + + @Override + public int compareTo(WordCombination other) { + // Compare by same frequency count in descending order + int wordFreqComparison = other.sameFreqCount.compareTo(this.sameFreqCount); + if (wordFreqComparison != 0) { + return wordFreqComparison; + } + + // If sameFreqCount are the same, compare by wordFreq in descending order + return other.wordFreq.compareTo(this.wordFreq); + } + } +} diff --git a/core/src/main/java/org/opensearch/sql/analysis/Analyzer.java b/core/src/main/java/org/opensearch/sql/analysis/Analyzer.java index d0051568c4..d3e6e666eb 100644 --- a/core/src/main/java/org/opensearch/sql/analysis/Analyzer.java +++ b/core/src/main/java/org/opensearch/sql/analysis/Analyzer.java @@ -36,6 +36,7 @@ import org.opensearch.sql.analysis.symbol.Namespace; import org.opensearch.sql.analysis.symbol.Symbol; import org.opensearch.sql.ast.AbstractNodeVisitor; +import org.opensearch.sql.ast.dsl.AstDSL; import org.opensearch.sql.ast.expression.Argument; import org.opensearch.sql.ast.expression.Field; import org.opensearch.sql.ast.expression.Let; @@ -58,6 +59,7 @@ import org.opensearch.sql.ast.tree.ML; import org.opensearch.sql.ast.tree.Paginate; import org.opensearch.sql.ast.tree.Parse; +import org.opensearch.sql.ast.tree.Pattern; import org.opensearch.sql.ast.tree.Project; import org.opensearch.sql.ast.tree.RareTopN; import org.opensearch.sql.ast.tree.Relation; @@ -107,6 +109,7 @@ import org.opensearch.sql.planner.logical.LogicalSort; import org.opensearch.sql.planner.logical.LogicalTrendline; import org.opensearch.sql.planner.logical.LogicalValues; +import org.opensearch.sql.planner.logical.LogicalWindow; import org.opensearch.sql.planner.physical.datasource.DataSourceTable; import org.opensearch.sql.storage.Table; import org.opensearch.sql.utils.ParseUtils; @@ -471,6 +474,31 @@ public LogicalPlan visitParse(Parse node, AnalysisContext context) { return child; } + @Override + public LogicalPlan visitPattern(Pattern node, AnalysisContext context) { + LogicalPlan child = node.getChild().get(0).accept(this, context); + WindowExpressionAnalyzer windowAnalyzer = + new WindowExpressionAnalyzer(expressionAnalyzer, child); + child = windowAnalyzer.analyze(node.getPatternWindowFunction(), context); + java.util.Map arguments = node.getArguments(); + Literal alias = arguments.getOrDefault("new_field", AstDSL.stringLiteral("patterns_field")); + + TypeEnvironment curEnv = context.peek(); + if (child instanceof LogicalWindow patternWindow) { + NamedExpression namedExpression = + new NamedExpression( + patternWindow.getWindowFunction().getNameOrAlias(), + new ReferenceExpression( + patternWindow.getWindowFunction().getNameOrAlias(), + patternWindow.getWindowFunction().getDelegated().type())); + curEnv.define( + new Symbol(Namespace.FIELD_NAME, namedExpression.getNameOrAlias()), + namedExpression.type()); + } + + return child; + } + /** Build {@link LogicalSort}. */ @Override public LogicalPlan visitSort(Sort node, AnalysisContext context) { diff --git a/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java b/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java index f27260dd5f..5b2e9f35b4 100644 --- a/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java @@ -53,6 +53,7 @@ import org.opensearch.sql.ast.tree.ML; import org.opensearch.sql.ast.tree.Paginate; import org.opensearch.sql.ast.tree.Parse; +import org.opensearch.sql.ast.tree.Pattern; import org.opensearch.sql.ast.tree.Project; import org.opensearch.sql.ast.tree.RareTopN; import org.opensearch.sql.ast.tree.Relation; @@ -215,6 +216,10 @@ public T visitParse(Parse node, C context) { return visitChildren(node, context); } + public T visitPattern(Pattern node, C context) { + return visitChildren(node, context); + } + public T visitLet(Let node, C context) { return visitChildren(node, context); } diff --git a/core/src/main/java/org/opensearch/sql/ast/dsl/AstDSL.java b/core/src/main/java/org/opensearch/sql/ast/dsl/AstDSL.java index d9956609ec..1c99697ad4 100644 --- a/core/src/main/java/org/opensearch/sql/ast/dsl/AstDSL.java +++ b/core/src/main/java/org/opensearch/sql/ast/dsl/AstDSL.java @@ -36,6 +36,7 @@ import org.opensearch.sql.ast.expression.Not; import org.opensearch.sql.ast.expression.Or; import org.opensearch.sql.ast.expression.ParseMethod; +import org.opensearch.sql.ast.expression.PatternMethod; import org.opensearch.sql.ast.expression.QualifiedName; import org.opensearch.sql.ast.expression.ScoreFunction; import org.opensearch.sql.ast.expression.Span; @@ -54,6 +55,7 @@ import org.opensearch.sql.ast.tree.Head; import org.opensearch.sql.ast.tree.Limit; import org.opensearch.sql.ast.tree.Parse; +import org.opensearch.sql.ast.tree.Pattern; import org.opensearch.sql.ast.tree.Project; import org.opensearch.sql.ast.tree.RareTopN; import org.opensearch.sql.ast.tree.RareTopN.CommandType; @@ -489,6 +491,22 @@ public static Parse parse( return new Parse(parseMethod, sourceField, pattern, arguments, input); } + public static Pattern pattern( + UnresolvedPlan input, + PatternMethod patternMethod, + UnresolvedExpression sourceField, + String alias, + java.util.Map arguments) { + return new Pattern( + new Alias( + "patterns_field", + new WindowFunction( + new Function(patternMethod.getName(), List.of(sourceField)), List.of(), List.of()), + alias), + arguments, + input); + } + public static FillNull fillNull(UnresolvedExpression replaceNullWithMe, Field... fields) { return new FillNull( FillNull.ContainNullableFieldFill.ofSameValue( diff --git a/core/src/main/java/org/opensearch/sql/ast/expression/ParseMethod.java b/core/src/main/java/org/opensearch/sql/ast/expression/ParseMethod.java index 7a2587c5f0..aad093554f 100644 --- a/core/src/main/java/org/opensearch/sql/ast/expression/ParseMethod.java +++ b/core/src/main/java/org/opensearch/sql/ast/expression/ParseMethod.java @@ -11,8 +11,7 @@ @RequiredArgsConstructor public enum ParseMethod { REGEX("regex"), - GROK("grok"), - PATTERNS("patterns"); + GROK("grok"); @Getter private final String name; } diff --git a/core/src/main/java/org/opensearch/sql/ast/expression/PatternMethod.java b/core/src/main/java/org/opensearch/sql/ast/expression/PatternMethod.java new file mode 100644 index 0000000000..8eb0080cdf --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/ast/expression/PatternMethod.java @@ -0,0 +1,20 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.sql.ast.expression; + +import lombok.Getter; +import lombok.RequiredArgsConstructor; + +@RequiredArgsConstructor +public enum PatternMethod { + SIMPLE("simple"), + BRAIN("brain"); + + @Getter final String name; +} diff --git a/core/src/main/java/org/opensearch/sql/ast/tree/Pattern.java b/core/src/main/java/org/opensearch/sql/ast/tree/Pattern.java new file mode 100644 index 0000000000..08bf670159 --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/ast/tree/Pattern.java @@ -0,0 +1,53 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.sql.ast.tree; + +import com.google.common.collect.ImmutableList; +import java.util.List; +import java.util.Map; +import lombok.AllArgsConstructor; +import lombok.EqualsAndHashCode; +import lombok.Getter; +import lombok.RequiredArgsConstructor; +import lombok.Setter; +import lombok.ToString; +import org.opensearch.sql.ast.AbstractNodeVisitor; +import org.opensearch.sql.ast.expression.Literal; +import org.opensearch.sql.ast.expression.UnresolvedExpression; + +@Getter +@Setter +@ToString +@EqualsAndHashCode(callSuper = false) +@RequiredArgsConstructor +@AllArgsConstructor +public class Pattern extends UnresolvedPlan { + + private final UnresolvedExpression patternWindowFunction; + + private final Map arguments; + + private UnresolvedPlan child; + + @Override + public Pattern attach(UnresolvedPlan child) { + this.child = child; + return this; + } + + @Override + public List getChild() { + return ImmutableList.of(this.child); + } + + @Override + public T accept(AbstractNodeVisitor nodeVisitor, C context) { + return nodeVisitor.visitPattern(this, context); + } +} diff --git a/core/src/main/java/org/opensearch/sql/expression/DSL.java b/core/src/main/java/org/opensearch/sql/expression/DSL.java index 44ecc2bc86..9807cf2c8d 100644 --- a/core/src/main/java/org/opensearch/sql/expression/DSL.java +++ b/core/src/main/java/org/opensearch/sql/expression/DSL.java @@ -969,6 +969,14 @@ public static FunctionExpression utc_timestamp( return compile(functionProperties, BuiltinFunctionName.UTC_TIMESTAMP, args); } + public static FunctionExpression brain() { + return compile(FunctionProperties.None, BuiltinFunctionName.BRAIN); + } + + public static FunctionExpression simple_pattern() { + return compile(FunctionProperties.None, BuiltinFunctionName.SIMPLE_PATTERN); + } + @SuppressWarnings("unchecked") private static T compile( FunctionProperties functionProperties, BuiltinFunctionName bfn, Expression... args) { diff --git a/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java b/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java index f8e9cf7c5f..11c6feb65e 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java @@ -216,6 +216,9 @@ public enum BuiltinFunctionName { RANK(FunctionName.of("rank")), DENSE_RANK(FunctionName.of("dense_rank")), + SIMPLE_PATTERN(FunctionName.of("simple_pattern")), + BRAIN(FunctionName.of("brain")), + INTERVAL(FunctionName.of("interval")), /** Data Type Convert Function. */ diff --git a/core/src/main/java/org/opensearch/sql/expression/parse/PatternsExpression.java b/core/src/main/java/org/opensearch/sql/expression/parse/PatternsExpression.java index 5b92779c35..038f86f2cc 100644 --- a/core/src/main/java/org/opensearch/sql/expression/parse/PatternsExpression.java +++ b/core/src/main/java/org/opensearch/sql/expression/parse/PatternsExpression.java @@ -50,7 +50,7 @@ public PatternsExpression(Expression sourceField, Expression pattern, Expression } @Override - ExprValue parseValue(ExprValue value) throws ExpressionEvaluationException { + public ExprValue parseValue(ExprValue value) throws ExpressionEvaluationException { String rawString = value.stringValue(); if (useCustomPattern) { return new ExprStringValue(pattern.matcher(rawString).replaceAll("")); diff --git a/core/src/main/java/org/opensearch/sql/expression/window/WindowFunctions.java b/core/src/main/java/org/opensearch/sql/expression/window/WindowFunctions.java index 3df59c52c0..6a44d28521 100644 --- a/core/src/main/java/org/opensearch/sql/expression/window/WindowFunctions.java +++ b/core/src/main/java/org/opensearch/sql/expression/window/WindowFunctions.java @@ -6,8 +6,10 @@ package org.opensearch.sql.expression.window; import static java.util.Collections.emptyList; +import static org.opensearch.sql.data.type.ExprCoreType.STRING; import com.google.common.collect.ImmutableMap; +import java.util.Collections; import java.util.function.Supplier; import lombok.experimental.UtilityClass; import org.opensearch.sql.expression.function.BuiltinFunctionName; @@ -16,6 +18,8 @@ import org.opensearch.sql.expression.function.FunctionBuilder; import org.opensearch.sql.expression.function.FunctionName; import org.opensearch.sql.expression.function.FunctionSignature; +import org.opensearch.sql.expression.window.patterns.BufferPatternWindowFunction; +import org.opensearch.sql.expression.window.patterns.StreamPatternWindowFunction; import org.opensearch.sql.expression.window.ranking.DenseRankFunction; import org.opensearch.sql.expression.window.ranking.RankFunction; import org.opensearch.sql.expression.window.ranking.RankingWindowFunction; @@ -34,6 +38,8 @@ public void register(BuiltinFunctionRepository repository) { repository.register(rowNumber()); repository.register(rank()); repository.register(denseRank()); + repository.register(brain()); + repository.register(simplePattern()); } private DefaultFunctionResolver rowNumber() { @@ -48,6 +54,26 @@ private DefaultFunctionResolver denseRank() { return rankingFunction(BuiltinFunctionName.DENSE_RANK.getName(), DenseRankFunction::new); } + private DefaultFunctionResolver brain() { + FunctionName functionName = BuiltinFunctionName.BRAIN.getName(); + FunctionSignature functionSignature = + new FunctionSignature(functionName, Collections.singletonList(STRING)); + FunctionBuilder functionBuilder = + (functionProperties, arguments) -> new BufferPatternWindowFunction(arguments); + return new DefaultFunctionResolver( + functionName, ImmutableMap.of(functionSignature, functionBuilder)); + } + + private DefaultFunctionResolver simplePattern() { + FunctionName functionName = BuiltinFunctionName.SIMPLE_PATTERN.getName(); + FunctionSignature functionSignature = + new FunctionSignature(functionName, Collections.singletonList(STRING)); + FunctionBuilder functionBuilder = + (functionProperties, arguments) -> new StreamPatternWindowFunction(arguments); + return new DefaultFunctionResolver( + functionName, ImmutableMap.of(functionSignature, functionBuilder)); + } + private DefaultFunctionResolver rankingFunction( FunctionName functionName, Supplier constructor) { FunctionSignature functionSignature = new FunctionSignature(functionName, emptyList()); diff --git a/core/src/main/java/org/opensearch/sql/expression/window/frame/BufferPatternRowsWindowFrame.java b/core/src/main/java/org/opensearch/sql/expression/window/frame/BufferPatternRowsWindowFrame.java new file mode 100644 index 0000000000..5212d08f63 --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/expression/window/frame/BufferPatternRowsWindowFrame.java @@ -0,0 +1,62 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.sql.expression.window.frame; + +import com.google.common.collect.PeekingIterator; +import java.util.ArrayList; +import java.util.List; +import lombok.EqualsAndHashCode; +import lombok.Getter; +import lombok.ToString; +import org.opensearch.sql.common.patterns.BrainLogParser; +import org.opensearch.sql.data.model.ExprValue; +import org.opensearch.sql.expression.Expression; +import org.opensearch.sql.expression.window.WindowDefinition; + +@EqualsAndHashCode(callSuper = true) +@ToString +public class BufferPatternRowsWindowFrame extends PeerRowsWindowFrame { + + private final Expression sourceField; + + @Getter private final BrainLogParser logParser; + + private final List> preprocessedMessages; + + public BufferPatternRowsWindowFrame( + WindowDefinition windowDefinition, BrainLogParser logParser, Expression sourceField) { + super(windowDefinition); + this.logParser = logParser; + this.sourceField = sourceField; + this.preprocessedMessages = new ArrayList<>(); + } + + @Override + public void load(PeekingIterator it) { + if (hasNext()) { + return; + } + + loadAllRows(it); + + List logMessages = + peers.stream() + .map( + exprValue -> { + ExprValue value = sourceField.valueOf(exprValue.bindingTuples()); + return value.stringValue(); + }) + .toList(); + this.preprocessedMessages.addAll(logParser.preprocessAllLogs(logMessages)); + } + + public List currentPreprocessedMessage() { + return this.preprocessedMessages.get(position); + } +} diff --git a/core/src/main/java/org/opensearch/sql/expression/window/frame/PeerRowsWindowFrame.java b/core/src/main/java/org/opensearch/sql/expression/window/frame/PeerRowsWindowFrame.java index a98826d333..b5dcfbc599 100644 --- a/core/src/main/java/org/opensearch/sql/expression/window/frame/PeerRowsWindowFrame.java +++ b/core/src/main/java/org/opensearch/sql/expression/window/frame/PeerRowsWindowFrame.java @@ -31,13 +31,13 @@ public class PeerRowsWindowFrame implements WindowFrame { * All peer rows (peer means rows in a partition that share same sort key based on sort list in * window definition. */ - private final List peers = new ArrayList<>(); + protected final List peers = new ArrayList<>(); /** Which row in the peer is currently being enriched by window function. */ - private int position; + protected int position; /** Does row at current position represents a new partition. */ - private boolean isNewPartition = true; + protected boolean isNewPartition = true; /** If any more pre-fetched rows not returned to window operator yet. */ @Override @@ -92,6 +92,15 @@ public void load(PeekingIterator it) { return; } + loadAllRows(it); + } + + @Override + public boolean isNewPartition() { + return isNewPartition; + } + + protected void loadAllRows(PeekingIterator it) { // Reset state: reset new partition before clearing peers isNewPartition = !isSamePartition(it.peek()); position = 0; @@ -109,12 +118,7 @@ public void load(PeekingIterator it) { } } - @Override - public boolean isNewPartition() { - return isNewPartition; - } - - private boolean isPeer(ExprValue next) { + protected boolean isPeer(ExprValue next) { List sortFields = windowDefinition.getSortList().stream().map(Pair::getRight).collect(Collectors.toList()); @@ -122,7 +126,7 @@ private boolean isPeer(ExprValue next) { return resolve(sortFields, last).equals(resolve(sortFields, next)); } - private boolean isSamePartition(ExprValue next) { + protected boolean isSamePartition(ExprValue next) { if (peers.isEmpty()) { return false; } diff --git a/core/src/main/java/org/opensearch/sql/expression/window/frame/StreamPatternRowWindowFrame.java b/core/src/main/java/org/opensearch/sql/expression/window/frame/StreamPatternRowWindowFrame.java new file mode 100644 index 0000000000..76e36a24a5 --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/expression/window/frame/StreamPatternRowWindowFrame.java @@ -0,0 +1,26 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.expression.window.frame; + +import lombok.EqualsAndHashCode; +import lombok.Getter; +import lombok.ToString; +import org.opensearch.sql.expression.parse.PatternsExpression; +import org.opensearch.sql.expression.window.WindowDefinition; + +@Getter +@EqualsAndHashCode(callSuper = true) +@ToString +public class StreamPatternRowWindowFrame extends CurrentRowWindowFrame { + + final PatternsExpression patternsExpression; + + public StreamPatternRowWindowFrame( + WindowDefinition windowDefinition, PatternsExpression patternsExpression) { + super(windowDefinition); + this.patternsExpression = patternsExpression; + } +} diff --git a/core/src/main/java/org/opensearch/sql/expression/window/patterns/BufferPatternWindowFunction.java b/core/src/main/java/org/opensearch/sql/expression/window/patterns/BufferPatternWindowFunction.java new file mode 100644 index 0000000000..2ed5b5346a --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/expression/window/patterns/BufferPatternWindowFunction.java @@ -0,0 +1,88 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.expression.window.patterns; + +import static org.opensearch.sql.utils.ExpressionUtils.format; + +import java.util.List; +import java.util.Locale; +import lombok.EqualsAndHashCode; +import org.opensearch.sql.common.patterns.BrainLogParser; +import org.opensearch.sql.data.model.ExprStringValue; +import org.opensearch.sql.data.model.ExprValue; +import org.opensearch.sql.data.type.ExprCoreType; +import org.opensearch.sql.data.type.ExprType; +import org.opensearch.sql.expression.Expression; +import org.opensearch.sql.expression.FunctionExpression; +import org.opensearch.sql.expression.NamedArgumentExpression; +import org.opensearch.sql.expression.env.Environment; +import org.opensearch.sql.expression.function.BuiltinFunctionName; +import org.opensearch.sql.expression.window.WindowDefinition; +import org.opensearch.sql.expression.window.WindowFunctionExpression; +import org.opensearch.sql.expression.window.frame.BufferPatternRowsWindowFrame; +import org.opensearch.sql.expression.window.frame.WindowFrame; + +@EqualsAndHashCode(callSuper = true) +public class BufferPatternWindowFunction extends FunctionExpression + implements WindowFunctionExpression { + + public BufferPatternWindowFunction(List arguments) { + super(BuiltinFunctionName.BRAIN.getName(), arguments); + } + + @Override + public WindowFrame createWindowFrame(WindowDefinition definition) { + int variableCountThreshold = + getArguments().stream() + .filter( + expression -> + expression instanceof NamedArgumentExpression + && ((NamedArgumentExpression) expression) + .getArgName() + .equalsIgnoreCase("variable_count_threshold")) + .map( + expression -> + ((NamedArgumentExpression) expression).getValue().valueOf().integerValue()) + .findFirst() + .orElse(5); + float thresholdPercentage = + getArguments().stream() + .filter( + expression -> + expression instanceof NamedArgumentExpression + && ((NamedArgumentExpression) expression) + .getArgName() + .equalsIgnoreCase("frequency_threshold_percentage")) + .map( + expression -> + ((NamedArgumentExpression) expression).getValue().valueOf().floatValue()) + .findFirst() + .orElse(0.3f); + return new BufferPatternRowsWindowFrame( + definition, + new BrainLogParser(variableCountThreshold, thresholdPercentage), + getArguments().get(0)); // actually only first argument is meaningful + } + + @Override + public ExprValue valueOf(Environment valueEnv) { + BufferPatternRowsWindowFrame frame = (BufferPatternRowsWindowFrame) valueEnv; + List preprocessedMessage = frame.currentPreprocessedMessage(); + frame.next(); + List logPattern = frame.getLogParser().parseLogPattern(preprocessedMessage); + return new ExprStringValue(String.join(" ", logPattern)); + } + + @Override + public ExprType type() { + return ExprCoreType.STRING; + } + + @Override + public String toString() { + return String.format(Locale.ROOT, "%s(%s)", getFunctionName(), format(getArguments())); + } +} diff --git a/core/src/main/java/org/opensearch/sql/expression/window/patterns/StreamPatternWindowFunction.java b/core/src/main/java/org/opensearch/sql/expression/window/patterns/StreamPatternWindowFunction.java new file mode 100644 index 0000000000..1e88775caa --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/expression/window/patterns/StreamPatternWindowFunction.java @@ -0,0 +1,80 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.sql.expression.window.patterns; + +import static org.opensearch.sql.utils.ExpressionUtils.format; + +import java.util.List; +import java.util.Locale; +import lombok.EqualsAndHashCode; +import org.opensearch.sql.data.model.ExprStringValue; +import org.opensearch.sql.data.model.ExprValue; +import org.opensearch.sql.data.type.ExprCoreType; +import org.opensearch.sql.data.type.ExprType; +import org.opensearch.sql.expression.Expression; +import org.opensearch.sql.expression.FunctionExpression; +import org.opensearch.sql.expression.LiteralExpression; +import org.opensearch.sql.expression.NamedArgumentExpression; +import org.opensearch.sql.expression.env.Environment; +import org.opensearch.sql.expression.function.BuiltinFunctionName; +import org.opensearch.sql.expression.parse.PatternsExpression; +import org.opensearch.sql.expression.window.WindowDefinition; +import org.opensearch.sql.expression.window.WindowFunctionExpression; +import org.opensearch.sql.expression.window.frame.StreamPatternRowWindowFrame; +import org.opensearch.sql.expression.window.frame.WindowFrame; + +@EqualsAndHashCode(callSuper = true) +public class StreamPatternWindowFunction extends FunctionExpression + implements WindowFunctionExpression { + + public StreamPatternWindowFunction(List arguments) { + super(BuiltinFunctionName.SIMPLE_PATTERN.getName(), arguments); + } + + @Override + public WindowFrame createWindowFrame(WindowDefinition definition) { + String pattern = + getArguments().stream() + .filter( + expression -> + expression instanceof NamedArgumentExpression + && ((NamedArgumentExpression) expression) + .getArgName() + .equalsIgnoreCase("pattern")) + .map( + expression -> + ((NamedArgumentExpression) expression).getValue().valueOf().stringValue()) + .findFirst() + .orElse(""); + return new StreamPatternRowWindowFrame( + definition, + new PatternsExpression( + getArguments().get(0), + new LiteralExpression(new ExprStringValue(pattern)), + new LiteralExpression(new ExprStringValue("")))); + } + + @Override + public ExprValue valueOf(Environment valueEnv) { + StreamPatternRowWindowFrame frame = (StreamPatternRowWindowFrame) valueEnv; + ExprValue sourceFieldValue = + frame.getPatternsExpression().getSourceField().valueOf(frame.current().bindingTuples()); + return frame.getPatternsExpression().parseValue(sourceFieldValue); + } + + @Override + public ExprType type() { + return ExprCoreType.STRING; + } + + @Override + public String toString() { + return String.format(Locale.ROOT, "%s(%s)", getFunctionName(), format(getArguments())); + } +} diff --git a/core/src/main/java/org/opensearch/sql/utils/ParseUtils.java b/core/src/main/java/org/opensearch/sql/utils/ParseUtils.java index e659cfdf50..55756b8cec 100644 --- a/core/src/main/java/org/opensearch/sql/utils/ParseUtils.java +++ b/core/src/main/java/org/opensearch/sql/utils/ParseUtils.java @@ -14,7 +14,6 @@ import org.opensearch.sql.expression.Expression; import org.opensearch.sql.expression.parse.GrokExpression; import org.opensearch.sql.expression.parse.ParseExpression; -import org.opensearch.sql.expression.parse.PatternsExpression; import org.opensearch.sql.expression.parse.RegexExpression; /** Utils for {@link ParseExpression}. */ @@ -24,8 +23,7 @@ public class ParseUtils { private static final Map FACTORY_MAP = ImmutableMap.of( ParseMethod.REGEX, RegexExpression::new, - ParseMethod.GROK, GrokExpression::new, - ParseMethod.PATTERNS, PatternsExpression::new); + ParseMethod.GROK, GrokExpression::new); /** * Construct corresponding ParseExpression by {@link ParseMethod}. @@ -52,13 +50,8 @@ public static List getNamedGroupCandidates( switch (parseMethod) { case REGEX: return RegexExpression.getNamedGroupCandidates(pattern); - case GROK: - return GrokExpression.getNamedGroupCandidates(pattern); default: - return PatternsExpression.getNamedGroupCandidates( - arguments.containsKey(NEW_FIELD_KEY) - ? (String) arguments.get(NEW_FIELD_KEY).getValue() - : null); + return GrokExpression.getNamedGroupCandidates(pattern); } } diff --git a/core/src/test/java/org/opensearch/sql/analysis/AnalyzerTest.java b/core/src/test/java/org/opensearch/sql/analysis/AnalyzerTest.java index 3f4752aa2e..aa3499bd44 100644 --- a/core/src/test/java/org/opensearch/sql/analysis/AnalyzerTest.java +++ b/core/src/test/java/org/opensearch/sql/analysis/AnalyzerTest.java @@ -1376,58 +1376,6 @@ public void parse_relation_with_regex_expression() { AstDSL.alias("string_value", qualifiedName("string_value")))); } - @Test - public void parse_relation_with_patterns_expression() { - Map arguments = - ImmutableMap.builder() - .put("new_field", AstDSL.stringLiteral("custom_field")) - .put("pattern", AstDSL.stringLiteral("custom_pattern")) - .build(); - - assertAnalyzeEqual( - LogicalPlanDSL.project( - LogicalPlanDSL.relation("schema", table), - ImmutableList.of(DSL.named("string_value", DSL.ref("string_value", STRING))), - ImmutableList.of( - DSL.named( - "custom_field", - DSL.patterns( - DSL.ref("string_value", STRING), - DSL.literal("custom_pattern"), - DSL.literal("custom_field"))))), - AstDSL.project( - AstDSL.parse( - AstDSL.relation("schema"), - ParseMethod.PATTERNS, - AstDSL.field("string_value"), - AstDSL.stringLiteral("custom_pattern"), - arguments), - AstDSL.alias("string_value", qualifiedName("string_value")))); - } - - @Test - public void parse_relation_with_patterns_expression_no_args() { - assertAnalyzeEqual( - LogicalPlanDSL.project( - LogicalPlanDSL.relation("schema", table), - ImmutableList.of(DSL.named("string_value", DSL.ref("string_value", STRING))), - ImmutableList.of( - DSL.named( - "patterns_field", - DSL.patterns( - DSL.ref("string_value", STRING), - DSL.literal(""), - DSL.literal("patterns_field"))))), - AstDSL.project( - AstDSL.parse( - AstDSL.relation("schema"), - ParseMethod.PATTERNS, - AstDSL.field("string_value"), - AstDSL.stringLiteral(""), - ImmutableMap.of()), - AstDSL.alias("string_value", qualifiedName("string_value")))); - } - @Test public void kmeanns_relation() { Map argumentMap = diff --git a/core/src/test/java/org/opensearch/sql/expression/parse/PatternsExpressionTest.java b/core/src/test/java/org/opensearch/sql/expression/parse/PatternsExpressionTest.java deleted file mode 100644 index 7237f0673b..0000000000 --- a/core/src/test/java/org/opensearch/sql/expression/parse/PatternsExpressionTest.java +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright OpenSearch Contributors - * SPDX-License-Identifier: Apache-2.0 - */ - -package org.opensearch.sql.expression.parse; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertThrows; -import static org.mockito.Mockito.when; -import static org.opensearch.sql.config.TestConfig.STRING_TYPE_MISSING_VALUE_FIELD; -import static org.opensearch.sql.config.TestConfig.STRING_TYPE_NULL_VALUE_FIELD; -import static org.opensearch.sql.data.model.ExprValueUtils.LITERAL_NULL; -import static org.opensearch.sql.data.model.ExprValueUtils.stringValue; -import static org.opensearch.sql.data.type.ExprCoreType.BOOLEAN; -import static org.opensearch.sql.data.type.ExprCoreType.STRING; - -import org.junit.jupiter.api.DisplayNameGeneration; -import org.junit.jupiter.api.DisplayNameGenerator; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.extension.ExtendWith; -import org.mockito.Mock; -import org.mockito.junit.jupiter.MockitoExtension; -import org.opensearch.sql.data.model.ExprValue; -import org.opensearch.sql.exception.SemanticCheckException; -import org.opensearch.sql.expression.DSL; -import org.opensearch.sql.expression.Expression; -import org.opensearch.sql.expression.ExpressionTestBase; -import org.opensearch.sql.expression.env.Environment; - -@ExtendWith(MockitoExtension.class) -@DisplayNameGeneration(DisplayNameGenerator.ReplaceUnderscores.class) -class PatternsExpressionTest extends ExpressionTestBase { - - @Mock Environment env; - - @Test - public void resolve_value() { - when(DSL.ref("log_value", STRING).valueOf(env)) - .thenReturn( - stringValue( - "145.128.75.121 - - [29/Aug/2022:13:26:44 -0700] \"GET /deliverables HTTP/2.0\" 501" - + " 2721")); - assertEquals( - stringValue("... - - [//::: -] \" / /.\" "), - DSL.patterns(DSL.ref("log_value", STRING), DSL.literal(""), DSL.literal("punct_field")) - .valueOf(env)); - assertEquals( - stringValue("... - - [/Aug/::: -] \"GET /deliverables HTTP/.\" "), - DSL.patterns(DSL.ref("log_value", STRING), DSL.literal("[0-9]"), DSL.literal("regex_field")) - .valueOf(env)); - } - - @Test - public void resolve_null_and_missing_values() { - assertEquals( - LITERAL_NULL, - DSL.patterns( - DSL.ref(STRING_TYPE_NULL_VALUE_FIELD, STRING), - DSL.literal("pattern"), - DSL.literal("patterns_field")) - .valueOf(valueEnv())); - assertEquals( - LITERAL_NULL, - DSL.patterns( - DSL.ref(STRING_TYPE_MISSING_VALUE_FIELD, STRING), - DSL.literal("pattern"), - DSL.literal("patterns_field")) - .valueOf(valueEnv())); - } - - @Test - public void resolve_type() { - assertEquals( - STRING, - DSL.patterns(DSL.ref("string_value", STRING), DSL.literal("pattern"), DSL.literal("group")) - .type()); - } - - @Test - public void throws_semantic_exception_if_value_type_is_not_string() { - assertThrows( - SemanticCheckException.class, - () -> - DSL.patterns( - DSL.ref("boolean_value", BOOLEAN), DSL.literal("pattern"), DSL.literal("group")) - .valueOf(valueEnv())); - } -} diff --git a/ppl/src/main/antlr/OpenSearchPPLLexer.g4 b/ppl/src/main/antlr/OpenSearchPPLLexer.g4 index 8011532db5..bb640db6f7 100644 --- a/ppl/src/main/antlr/OpenSearchPPLLexer.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLLexer.g4 @@ -37,6 +37,11 @@ AD: 'AD'; ML: 'ML'; FILLNULL: 'FILLNULL'; TRENDLINE: 'TRENDLINE'; +PATTERN_METHOD: 'PATTERN_METHOD'; +SIMPLE_PATTERN: 'SIMPLE_PATTERN'; +BRAIN: 'BRAIN'; +VARIABLE_COUNT_THRESHOLD: 'VARIABLE_COUNT_THRESHOLD'; +FREQUENCY_THRESHOLD_PERCENTAGE: 'FREQUENCY_THRESHOLD_PERCENTAGE'; // COMMAND ASSIST KEYWORDS AS: 'AS'; diff --git a/ppl/src/main/antlr/OpenSearchPPLParser.g4 b/ppl/src/main/antlr/OpenSearchPPLParser.g4 index 27f7e4014b..ff2f602791 100644 --- a/ppl/src/main/antlr/OpenSearchPPLParser.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLParser.g4 @@ -45,12 +45,12 @@ commands | rareCommand | grokCommand | parseCommand - | patternsCommand | kmeansCommand | adCommand | mlCommand | fillnullCommand | trendlineCommand + | patternsCommand ; searchCommand @@ -115,18 +115,25 @@ parseCommand : PARSE (source_field = expression) (pattern = stringLiteral) ; +patternsMethod + : PUNCT + | REGEX + ; + patternsCommand - : PATTERNS (patternsParameter)* (source_field = expression) + : PATTERNS (patternsParameter)* (source_field = expression) (pattern_method = patternMethod)* ; patternsParameter : (NEW_FIELD EQUAL new_field = stringLiteral) | (PATTERN EQUAL pattern = stringLiteral) + | (VARIABLE_COUNT_THRESHOLD EQUAL variable_count_threshold = integerLiteral) + | (FREQUENCY_THRESHOLD_PERCENTAGE EQUAL frequency_threshold_percentage = decimalLiteral) ; -patternsMethod - : PUNCT - | REGEX +patternMethod + : SIMPLE_PATTERN + | BRAIN ; fillnullCommand diff --git a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java index c3c31ee2e1..980031f9be 100644 --- a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java +++ b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java @@ -31,6 +31,7 @@ import java.util.ArrayList; import java.util.Collections; import java.util.List; +import java.util.Locale; import java.util.Optional; import java.util.stream.Collectors; import lombok.RequiredArgsConstructor; @@ -39,7 +40,9 @@ import org.antlr.v4.runtime.tree.ParseTree; import org.opensearch.sql.ast.dsl.AstDSL; import org.opensearch.sql.ast.expression.Alias; +import org.opensearch.sql.ast.expression.Argument; import org.opensearch.sql.ast.expression.Field; +import org.opensearch.sql.ast.expression.Function; import org.opensearch.sql.ast.expression.Let; import org.opensearch.sql.ast.expression.Literal; import org.opensearch.sql.ast.expression.Map; @@ -47,6 +50,7 @@ import org.opensearch.sql.ast.expression.QualifiedName; import org.opensearch.sql.ast.expression.UnresolvedArgument; import org.opensearch.sql.ast.expression.UnresolvedExpression; +import org.opensearch.sql.ast.expression.WindowFunction; import org.opensearch.sql.ast.tree.AD; import org.opensearch.sql.ast.tree.Aggregation; import org.opensearch.sql.ast.tree.Dedupe; @@ -57,6 +61,7 @@ import org.opensearch.sql.ast.tree.Kmeans; import org.opensearch.sql.ast.tree.ML; import org.opensearch.sql.ast.tree.Parse; +import org.opensearch.sql.ast.tree.Pattern; import org.opensearch.sql.ast.tree.Project; import org.opensearch.sql.ast.tree.RareTopN; import org.opensearch.sql.ast.tree.RareTopN.CommandType; @@ -67,6 +72,7 @@ import org.opensearch.sql.ast.tree.Trendline; import org.opensearch.sql.ast.tree.UnresolvedPlan; import org.opensearch.sql.common.utils.StringUtils; +import org.opensearch.sql.expression.function.BuiltinFunctionName; import org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser; import org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.AdCommandContext; import org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser.ByClauseContext; @@ -283,6 +289,8 @@ public UnresolvedPlan visitParseCommand(OpenSearchPPLParser.ParseCommandContext @Override public UnresolvedPlan visitPatternsCommand(OpenSearchPPLParser.PatternsCommandContext ctx) { UnresolvedExpression sourceField = internalVisitExpression(ctx.source_field); + List unresolvedArguments = new ArrayList<>(); + unresolvedArguments.add(sourceField); ImmutableMap.Builder builder = ImmutableMap.builder(); ctx.patternsParameter() .forEach( @@ -290,11 +298,26 @@ public UnresolvedPlan visitPatternsCommand(OpenSearchPPLParser.PatternsCommandCo builder.put( x.children.get(0).toString(), (Literal) internalVisitExpression(x.children.get(2))); + unresolvedArguments.add( + new Argument( + x.children.get(0).toString(), + (Literal) internalVisitExpression(x.children.get(2)))); }); java.util.Map arguments = builder.build(); - Literal pattern = arguments.getOrDefault("pattern", AstDSL.stringLiteral("")); - - return new Parse(ParseMethod.PATTERNS, sourceField, pattern, arguments); + Literal alias = arguments.getOrDefault("new_field", AstDSL.stringLiteral("patterns_field")); + return new Pattern( + new Alias( + "patterns_field", + new WindowFunction( + new Function( + ctx.pattern_method != null + ? ctx.pattern_method.getText().toLowerCase(Locale.ROOT) + : BuiltinFunctionName.BRAIN.name(), // By default, use new algorithm + unresolvedArguments), + List.of(), // ignore partition by list for now as we haven't seen such requirement + List.of()), // ignore sort by list for now as we haven't seen such requirement + (String) alias.getValue()), + arguments); } /** Top command. */ diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java index c6f4ed2044..683da11657 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java @@ -608,33 +608,6 @@ public void testParseCommand() { ImmutableMap.of())); } - @Test - public void testPatternsCommand() { - assertEqual( - "source=t | patterns new_field=\"custom_field\" " + "pattern=\"custom_pattern\" raw", - parse( - relation("t"), - ParseMethod.PATTERNS, - field("raw"), - stringLiteral("custom_pattern"), - ImmutableMap.builder() - .put("new_field", stringLiteral("custom_field")) - .put("pattern", stringLiteral("custom_pattern")) - .build())); - } - - @Test - public void testPatternsCommandWithoutArguments() { - assertEqual( - "source=t | patterns raw", - parse( - relation("t"), - ParseMethod.PATTERNS, - field("raw"), - stringLiteral(""), - ImmutableMap.of())); - } - @Test public void testKmeansCommand() { assertEqual( From 6874a5f643da4d7006bed3bf130dc1ddf4aefbd4 Mon Sep 17 00:00:00 2001 From: Songkan Tang Date: Sun, 26 Jan 2025 16:29:31 +0800 Subject: [PATCH 2/6] Minor change log parser default configurations Signed-off-by: Songkan Tang --- .../opensearch/sql/common/patterns/BrainLogParser.java | 8 ++++---- .../org/opensearch/sql/ast/expression/PatternMethod.java | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/common/src/main/java/org/opensearch/sql/common/patterns/BrainLogParser.java b/common/src/main/java/org/opensearch/sql/common/patterns/BrainLogParser.java index dfa0d17779..6553041dd1 100644 --- a/common/src/main/java/org/opensearch/sql/common/patterns/BrainLogParser.java +++ b/common/src/main/java/org/opensearch/sql/common/patterns/BrainLogParser.java @@ -39,11 +39,11 @@ public class BrainLogParser { "<*DATETIME*>"); // Hex Decimal, letters followed by digits, float numbers DEFAULT_FILTER_PATTERN_VARIABLE_MAP.put( - Pattern.compile("((0x|0X)[0-9a-fA-F]+)|[a-zA-Z]+\\d+|([+-]?(\\d+(\\.\\d*)?|\\.\\d+))"), + Pattern.compile("((0x|0X)[0-9a-fA-F]+)|[a-zA-Z]+\\d+|([+-]?(?!\\d{3}$)\\d{4,}(\\.\\d*)?|\\.\\d+)"), VARIABLE_DENOTER); // generic number surrounded by non-alphanumeric DEFAULT_FILTER_PATTERN_VARIABLE_MAP.put( - Pattern.compile("(?<=[^A-Za-z0-9])(-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$"), VARIABLE_DENOTER); + Pattern.compile("(?<=[^A-Za-z0-9 ])(-?\\+?\\d+)(?=[^A-Za-z0-9])"), VARIABLE_DENOTER); } private static final List DEFAULT_DELIMITERS = List.of(",", "+"); @@ -54,12 +54,12 @@ public class BrainLogParser { private static final String GROUP_TOKEN_SET_KEY_FORMAT = "%d-%s-%d"; // By default, algorithm treats more than 2 different tokens in the group per position as variable // token - private static final int DEFAULT_VARIABLE_COUNT_THRESHOLD = 2; + private static final int DEFAULT_VARIABLE_COUNT_THRESHOLD = 5; /* * By default, algorithm treats the longest word combinations as the group root, no matter what its frequency is. * Otherwise, the longest word combination will be selected when frequency >= highest frequency of log * threshold percentage */ - private static final float DEFAULT_FREQUENCY_THRESHOLD_PERCENTAGE = 0.0f; + private static final float DEFAULT_FREQUENCY_THRESHOLD_PERCENTAGE = 0.3f; private final Map tokenFreqMap; private final Map> groupTokenSetMap; diff --git a/core/src/main/java/org/opensearch/sql/ast/expression/PatternMethod.java b/core/src/main/java/org/opensearch/sql/ast/expression/PatternMethod.java index 8eb0080cdf..19029ad989 100644 --- a/core/src/main/java/org/opensearch/sql/ast/expression/PatternMethod.java +++ b/core/src/main/java/org/opensearch/sql/ast/expression/PatternMethod.java @@ -13,7 +13,7 @@ @RequiredArgsConstructor public enum PatternMethod { - SIMPLE("simple"), + SIMPLE("simple_pattern"), BRAIN("brain"); @Getter final String name; From c62b25bbfabec77b8f63661908f4396a317792bb Mon Sep 17 00:00:00 2001 From: Songkan Tang Date: Tue, 28 Jan 2025 16:12:48 +0800 Subject: [PATCH 3/6] Refactor a bit and add partial unit tests Signed-off-by: Songkan Tang --- .../sql/common/patterns/BrainLogParser.java | 53 +++- .../common/patterns/BrainLogParserTest.java | 273 ++++++++++++++++++ .../org/opensearch/sql/analysis/Analyzer.java | 21 +- .../sql/analysis/ExpressionAnalyzer.java | 6 + .../sql/ast/AbstractNodeVisitor.java | 10 +- .../org/opensearch/sql/ast/dsl/AstDSL.java | 20 +- .../sql/ast/expression/PatternMethod.java | 7 +- .../ast/tree/{Pattern.java => Window.java} | 17 +- .../expression/window/WindowFunctions.java | 30 +- .../frame/BufferPatternRowsWindowFrame.java | 3 +- .../window/frame/PeerRowsWindowFrame.java | 16 +- .../frame/StreamPatternRowWindowFrame.java | 26 -- .../patterns/BufferPatternWindowFunction.java | 28 +- .../patterns/StreamPatternWindowFunction.java | 39 +-- .../opensearch/sql/utils/FunctionUtils.java | 28 ++ .../opensearch/sql/ppl/parser/AstBuilder.java | 35 ++- .../sql/ppl/parser/AstBuilderTest.java | 38 +++ 17 files changed, 490 insertions(+), 160 deletions(-) create mode 100644 common/src/test/java/org/opensearch/sql/common/patterns/BrainLogParserTest.java rename core/src/main/java/org/opensearch/sql/ast/tree/{Pattern.java => Window.java} (63%) delete mode 100644 core/src/main/java/org/opensearch/sql/expression/window/frame/StreamPatternRowWindowFrame.java create mode 100644 core/src/main/java/org/opensearch/sql/utils/FunctionUtils.java diff --git a/common/src/main/java/org/opensearch/sql/common/patterns/BrainLogParser.java b/common/src/main/java/org/opensearch/sql/common/patterns/BrainLogParser.java index 6553041dd1..a380977535 100644 --- a/common/src/main/java/org/opensearch/sql/common/patterns/BrainLogParser.java +++ b/common/src/main/java/org/opensearch/sql/common/patterns/BrainLogParser.java @@ -14,6 +14,7 @@ import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.Objects; import java.util.OptionalLong; import java.util.Set; import java.util.regex.Pattern; @@ -39,7 +40,8 @@ public class BrainLogParser { "<*DATETIME*>"); // Hex Decimal, letters followed by digits, float numbers DEFAULT_FILTER_PATTERN_VARIABLE_MAP.put( - Pattern.compile("((0x|0X)[0-9a-fA-F]+)|[a-zA-Z]+\\d+|([+-]?(?!\\d{3}$)\\d{4,}(\\.\\d*)?|\\.\\d+)"), + Pattern.compile( + "((0x|0X)[0-9a-fA-F]+)|[a-zA-Z]+\\d+|([+-]?(?!\\d{3}$)\\d{4,}(\\.\\d*)?|\\.\\d+)"), VARIABLE_DENOTER); // generic number surrounded by non-alphanumeric DEFAULT_FILTER_PATTERN_VARIABLE_MAP.put( @@ -214,18 +216,18 @@ public List> preprocessAllLogs(List logMessages) { * * @param preprocessedLogs preprocessed list of log messages */ - private void calculateGroupTokenFreq(List> preprocessedLogs) { + void calculateGroupTokenFreq(List> preprocessedLogs) { for (List tokens : preprocessedLogs) { Map wordOccurrences = this.getWordOccurrences(tokens); List sortedWordCombinations = wordOccurrences.entrySet().stream() .map(entry -> new WordCombination(entry.getKey(), entry.getValue())) .sorted() - .toList(); + .collect(Collectors.toList()); WordCombination candidate = this.findCandidate(sortedWordCombinations); String groupCandidateStr = String.format(Locale.ROOT, "%d,%d", candidate.wordFreq(), candidate.sameFreqCount()); - this.logIdGroupCandidateMap.put(tokens.getLast(), groupCandidateStr); + this.logIdGroupCandidateMap.put(tokens.get(tokens.size() - 1), groupCandidateStr); this.updateGroupTokenFreqMap(tokens, groupCandidateStr); } } @@ -238,7 +240,7 @@ private void calculateGroupTokenFreq(List> preprocessedLogs) { * @return parsed log pattern that is a list of string */ public List parseLogPattern(List tokens) { - String logId = tokens.getLast(); + String logId = tokens.get(tokens.size() - 1); String groupCandidateStr = this.logIdGroupCandidateMap.get(logId); String[] groupCandidate = groupCandidateStr.split(","); Long repFreq = Long.parseLong(groupCandidate[0]); // representative frequency of the group @@ -299,7 +301,7 @@ public Map> parseAllLogPatterns(List logMessages) { Map> logPatternMap = new HashMap<>(); for (List processedMessage : processedMessages) { - String logId = processedMessage.getLast(); + String logId = processedMessage.get(processedMessage.size() - 1); List logPattern = this.parseLogPattern(processedMessage); String patternKey = String.join(" ", logPattern); logPatternMap.computeIfAbsent(patternKey, k -> new ArrayList<>()).add(logId); @@ -357,7 +359,7 @@ private WordCombination findCandidate(List sortedWordCombinatio return wordCombination; } } - return sortedWordCombinations.getFirst(); + return sortedWordCombinations.get(0); } private void updateGroupTokenFreqMap(List tokens, String groupCandidateStr) { @@ -371,8 +373,22 @@ private void updateGroupTokenFreqMap(List tokens, String groupCandidateS } } - private record WordCombination(Long wordFreq, Integer sameFreqCount) - implements Comparable { + private static final class WordCombination implements Comparable { + private final Long wordFreq; + private final Integer sameFreqCount; + + public WordCombination(Long wordFreq, Integer sameFreqCount) { + this.wordFreq = wordFreq; + this.sameFreqCount = sameFreqCount; + } + + public Long wordFreq() { + return this.wordFreq; + } + + public Integer sameFreqCount() { + return this.sameFreqCount; + } @Override public int compareTo(WordCombination other) { @@ -381,9 +397,26 @@ public int compareTo(WordCombination other) { if (wordFreqComparison != 0) { return wordFreqComparison; } - // If sameFreqCount are the same, compare by wordFreq in descending order return other.wordFreq.compareTo(this.wordFreq); } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null || getClass() != obj.getClass()) { + return false; + } + WordCombination other = (WordCombination) obj; + return Objects.equals(wordFreq, other.wordFreq) + && Objects.equals(sameFreqCount, other.sameFreqCount); + } + + @Override + public int hashCode() { + return Objects.hash(wordFreq, sameFreqCount); + } } } diff --git a/common/src/test/java/org/opensearch/sql/common/patterns/BrainLogParserTest.java b/common/src/test/java/org/opensearch/sql/common/patterns/BrainLogParserTest.java new file mode 100644 index 0000000000..78a69269c2 --- /dev/null +++ b/common/src/test/java/org/opensearch/sql/common/patterns/BrainLogParserTest.java @@ -0,0 +1,273 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.common.patterns; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertThrows; +import static org.junit.Assert.assertTrue; + +import java.util.Arrays; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; +import org.junit.Before; +import org.junit.FixMethodOrder; +import org.junit.Test; +import org.junit.runners.MethodSorters; + +@FixMethodOrder(MethodSorters.NAME_ASCENDING) +public class BrainLogParserTest { + + private static final List TEST_HDFS_LOGS = + Arrays.asList( + "BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.251.31.85:50010 is added to" + + " blk_-7017553867379051457 size 67108864", + "BLOCK* NameSystem.allocateBlock:" + + " /user/root/sortrand/_temporary/_task_200811092030_0002_r_000296_0/part-00296." + + " blk_-6620182933895093708", + "BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.250.7.244:50010 is added to" + + " blk_-6956067134432991406 size 67108864", + "BLOCK* NameSystem.allocateBlock:" + + " /user/root/sortrand/_temporary/_task_200811092030_0002_r_000230_0/part-00230." + + " blk_559204981722276126", + "BLOCK* NameSystem.allocateBlock:" + + " /user/root/sortrand/_temporary/_task_200811092030_0002_r_000169_0/part-00169." + + " blk_-7105305952901940477", + "BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.251.107.19:50010 is added to" + + " blk_-3249711809227781266 size 67108864", + "BLOCK* NameSystem.allocateBlock:" + + " /user/root/sortrand/_temporary/_task_200811092030_0002_r_000318_0/part-00318." + + " blk_-207775976836691685", + "BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.250.6.4:50010 is added to" + + " blk_5114010683183383297 size 67108864", + "BLOCK* NameSystem.allocateBlock:" + + " /user/root/sortrand/_temporary/_task_200811092030_0002_r_000318_0/part-00318." + + " blk_2096692261399680562", + "BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.250.15.240:50010 is added to" + + " blk_-1055254430948037872 size 67108864", + "BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.250.7.146:50010 is added to" + + " blk_278357163850888 size 67108864", + "BLOCK* NameSystem.allocateBlock:" + + " /user/root/sortrand/_temporary/_task_200811092030_0002_r_000138_0/part-00138." + + " blk_-210021574616486609", + "Verification succeeded for blk_-1547954353065580372", + "BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.251.39.242:50010 is added to" + + " blk_-4110733372292809607 size 67108864", + "BLOCK* NameSystem.allocateBlock:" + + " /user/root/randtxt/_temporary/_task_200811092030_0003_m_000382_0/part-00382." + + " blk_8935202950442998446", + "BLOCK* NameSystem.allocateBlock:" + + " /user/root/randtxt/_temporary/_task_200811092030_0003_m_000392_0/part-00392." + + " blk_-3010126661650043258", + "BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.251.25.237:50010 is added to" + + " blk_541463031152673662 size 67108864", + "Verification succeeded for blk_6996194389878584395", + "PacketResponder failed for blk_6996194389878584395", + "PacketResponder failed for blk_-1547954353065580372"); + private BrainLogParser parser; + + @Before + public void setUp() throws Exception { + parser = new BrainLogParser(); + } + + @Test + public void testNewParserWithIllegalArgument() { + String exceptionMessage = "Threshold percentage must be between 0.0 and 1.0"; + Throwable throwable = + assertThrows(IllegalArgumentException.class, () -> new BrainLogParser(2, -1.0f)); + assertEquals(exceptionMessage, throwable.getMessage()); + throwable = assertThrows(IllegalArgumentException.class, () -> new BrainLogParser(2, 1.1f)); + assertEquals(exceptionMessage, throwable.getMessage()); + } + + @Test + public void testPreprocess() { + String logMessage = "127.0.0.1 - 1234 something"; + String logId = "log1"; + List expectedResult = Arrays.asList("<*IP*>", "-", "<*>", "something", "log1"); + List result = parser.preprocess(logMessage, logId); + assertEquals(expectedResult, result); + // Test with different delimiter + logMessage = "127.0.0.1=1234 something"; + logId = "log2"; + expectedResult = Arrays.asList("<*IP*>=<*>", "something", "log2"); + result = parser.preprocess(logMessage, logId); + assertEquals(expectedResult, result); + } + + @Test + public void testPreprocessWithIllegalInput() { + String logMessage = "127.0.0.1 - 1234 something"; + String logId = "log1"; + String exceptionMessage = "log message or logId must not be null"; + Throwable throwable = + assertThrows(IllegalArgumentException.class, () -> parser.preprocess(null, logId)); + assertEquals(exceptionMessage, throwable.getMessage()); + throwable = + assertThrows(IllegalArgumentException.class, () -> parser.preprocess(logMessage, null)); + assertEquals(exceptionMessage, throwable.getMessage()); + throwable = assertThrows(IllegalArgumentException.class, () -> parser.preprocess(null, null)); + assertEquals(exceptionMessage, throwable.getMessage()); + } + + @Test + public void testPreprocessAllLogs() { + List logMessages = + Arrays.asList("127.0.0.1 - 1234 something", "192.168.0.1 - 5678 something_else"); + List> result = parser.preprocessAllLogs(logMessages); + assertEquals(2, result.size()); + assertEquals(Arrays.asList("<*IP*>", "-", "<*>", "something", "0"), result.get(0)); + assertEquals(Arrays.asList("<*IP*>", "-", "<*>", "something_else", "1"), result.get(1)); + } + + @Test + public void testProcessTokenHistogram() { + String something = String.format(Locale.ROOT, "%d-%s", 0, "something"); + String up = String.format(Locale.ROOT, "%d-%s", 1, "up"); + List firstTokens = Arrays.asList("something", "up", "0"); + parser.processTokenHistogram(firstTokens); + assertEquals(1L, parser.getTokenFreqMap().get(something).longValue()); + assertEquals(1L, parser.getTokenFreqMap().get(up).longValue()); + List secondTokens = Arrays.asList("something", "down", "1"); + parser.processTokenHistogram(secondTokens); + assertEquals(2L, parser.getTokenFreqMap().get(something).longValue()); + assertEquals(1L, parser.getTokenFreqMap().get(up).longValue()); + } + + @Test + public void testCalculateGroupTokenFreq() { + List logMessages = + Arrays.asList( + "127.0.0.1 - 1234 something", + "192.168.0.1:5678 something_else", + "0.0.0.0:42 something_else"); + List logIds = Arrays.asList("0", "1", "2"); + List> preprocessedLogs = parser.preprocessAllLogs(logMessages); + for (String logId : logIds) { + String groupCandidate = parser.getLogIdGroupCandidateMap().get(logId); + assertNotNull(groupCandidate); + } + assertTrue(parser.getGroupTokenSetMap().containsValue(Set.of("something"))); + assertTrue(parser.getGroupTokenSetMap().containsValue(Set.of("something_else"))); + String sampleGroupTokenKey = + String.format(Locale.ROOT, "%d-%s-%d", 4, parser.getLogIdGroupCandidateMap().get("0"), 3); + assertTrue(parser.getGroupTokenSetMap().get(sampleGroupTokenKey).contains("something")); + } + + @Test + public void testCalculateGroupTokenFreqWithIllegalInput() { + List> preprocessedLogs = Arrays.asList(List.of()); + String exceptionMessage = "Sorted word combinations must be non empty"; + Throwable throwable = + assertThrows( + IllegalArgumentException.class, () -> parser.calculateGroupTokenFreq(preprocessedLogs)); + assertEquals(exceptionMessage, throwable.getMessage()); + } + + @Test + public void testParseLogPattern() { + List> preprocessedLogs = parser.preprocessAllLogs(TEST_HDFS_LOGS); + List expectedLogPattern = + Arrays.asList( + "BLOCK*", + "NameSystem.addStoredBlock:", + "blockMap", + "updated:", + "<*IP*>", + "is", + "added", + "to", + "blk_<*>", + "size", + "<*>"); + List logPattern = parser.parseLogPattern(preprocessedLogs.get(0)); + assertEquals(expectedLogPattern, logPattern); + } + + @Test + public void testParseAllLogPatterns() { + Map> logPatternMap = parser.parseAllLogPatterns(TEST_HDFS_LOGS); + Map expectedResult = + Map.of( + "PacketResponder failed for blk_<*>", + 2, + "Verification succeeded for blk_<*>", + 2, + "BLOCK* NameSystem.addStoredBlock: blockMap updated: <*IP*> is added to blk_<*> size" + + " <*>", + 8, + "BLOCK* NameSystem.allocateBlock:" + + " /user/root/sortrand/_temporary/_task_<*>_<*>_r_<*>_<*>/part<*> blk_<*>", + 6, + "BLOCK* NameSystem.allocateBlock:" + + " /user/root/randtxt/_temporary/_task_<*>_<*>_m_<*>_<*>/part<*> blk_<*>", + 2); + Map logPatternByCountMap = + logPatternMap.entrySet().stream() + .collect(Collectors.toMap(Map.Entry::getKey, entry -> entry.getValue().size())); + assertEquals(expectedResult, logPatternByCountMap); + } + + @Test + public void testParseLogPatternWhenLowerFrequencyTokenIsVariable() { + int testVariableCountThreshold = 3; + parser = new BrainLogParser(testVariableCountThreshold, 0.0f); + List logMessages = + Arrays.asList( + "Verification succeeded a blk_-1547954353065580372", + "Verification succeeded b blk_6996194389878584395", + "Verification succeeded c blk_6996194389878584395", + "Verification succeeded d blk_6996194389878584395"); + Map> expectedResult = + Map.of("Verification succeeded <*> blk_<*>", Arrays.asList("0", "1", "2", "3")); + Map> logPatternMap = parser.parseAllLogPatterns(logMessages); + assertEquals(expectedResult, logPatternMap); + /* + * 'a', 'b', 'c' and 'd' token is on the 3rd position in the group 2,3, their frequency is lower than + * representative frequency. Since that position's distinct token number exceeds the variable count threshold, + * the third position in this log group is treated as variable + */ + assertTrue( + parser.getTokenFreqMap().get("2-a") < parser.getTokenFreqMap().get("0-Verification")); + assertTrue( + parser.getTokenFreqMap().get("2-b") < parser.getTokenFreqMap().get("0-Verification")); + assertTrue(testVariableCountThreshold <= parser.getGroupTokenSetMap().get("4-4,3-2").size()); + } + + @Test + public void testParseLogPatternWhenHigherFrequencyTokenIsVariable() { + List logMessages = + Arrays.asList( + "Verification succeeded for blk_-1547954353065580372", + "Verification succeeded for blk_6996194389878584395", + "Test succeeded for blk_6996194389878584395", + "Verification", + "Verification"); + Map> expectedResult = + Map.of( + "<*> succeeded for blk_<*>", + Arrays.asList("0", "1"), + "Test succeeded for blk_<*>", + Arrays.asList("2"), + "Verification", + Arrays.asList("3", "4")); + Map> logPatternMap = parser.parseAllLogPatterns(logMessages); + assertEquals(expectedResult, logPatternMap); + /* + * 'Verification' and 'Test' token is on the 1st position in the group 3,3, 'Verification' frequency is higher than + * representative frequency because there are other groups which have 'Verification' token on the 1st position as well. + * Since first position's distinct token number is not unique, 'Verification' is treated as variable eventually. + */ + assertTrue( + parser.getTokenFreqMap().get("0-Verification") + > parser.getTokenFreqMap().get("1-succeeded")); + assertTrue(parser.getGroupTokenSetMap().get("4-3,3-0").size() > 1); + } +} diff --git a/core/src/main/java/org/opensearch/sql/analysis/Analyzer.java b/core/src/main/java/org/opensearch/sql/analysis/Analyzer.java index d3e6e666eb..5f6f8b1bf2 100644 --- a/core/src/main/java/org/opensearch/sql/analysis/Analyzer.java +++ b/core/src/main/java/org/opensearch/sql/analysis/Analyzer.java @@ -36,7 +36,6 @@ import org.opensearch.sql.analysis.symbol.Namespace; import org.opensearch.sql.analysis.symbol.Symbol; import org.opensearch.sql.ast.AbstractNodeVisitor; -import org.opensearch.sql.ast.dsl.AstDSL; import org.opensearch.sql.ast.expression.Argument; import org.opensearch.sql.ast.expression.Field; import org.opensearch.sql.ast.expression.Let; @@ -59,7 +58,6 @@ import org.opensearch.sql.ast.tree.ML; import org.opensearch.sql.ast.tree.Paginate; import org.opensearch.sql.ast.tree.Parse; -import org.opensearch.sql.ast.tree.Pattern; import org.opensearch.sql.ast.tree.Project; import org.opensearch.sql.ast.tree.RareTopN; import org.opensearch.sql.ast.tree.Relation; @@ -71,6 +69,7 @@ import org.opensearch.sql.ast.tree.Trendline; import org.opensearch.sql.ast.tree.UnresolvedPlan; import org.opensearch.sql.ast.tree.Values; +import org.opensearch.sql.ast.tree.Window; import org.opensearch.sql.common.antlr.SyntaxCheckException; import org.opensearch.sql.data.model.ExprMissingValue; import org.opensearch.sql.data.type.ExprCoreType; @@ -475,25 +474,17 @@ public LogicalPlan visitParse(Parse node, AnalysisContext context) { } @Override - public LogicalPlan visitPattern(Pattern node, AnalysisContext context) { + public LogicalPlan visitWindow(Window node, AnalysisContext context) { LogicalPlan child = node.getChild().get(0).accept(this, context); WindowExpressionAnalyzer windowAnalyzer = new WindowExpressionAnalyzer(expressionAnalyzer, child); - child = windowAnalyzer.analyze(node.getPatternWindowFunction(), context); - java.util.Map arguments = node.getArguments(); - Literal alias = arguments.getOrDefault("new_field", AstDSL.stringLiteral("patterns_field")); + child = windowAnalyzer.analyze(node.getWindowFunction(), context); TypeEnvironment curEnv = context.peek(); - if (child instanceof LogicalWindow patternWindow) { - NamedExpression namedExpression = - new NamedExpression( - patternWindow.getWindowFunction().getNameOrAlias(), - new ReferenceExpression( - patternWindow.getWindowFunction().getNameOrAlias(), - patternWindow.getWindowFunction().getDelegated().type())); + if (child instanceof LogicalWindow window) { curEnv.define( - new Symbol(Namespace.FIELD_NAME, namedExpression.getNameOrAlias()), - namedExpression.type()); + new Symbol(Namespace.FIELD_NAME, window.getWindowFunction().getNameOrAlias()), + window.getWindowFunction().getDelegated().type()); } return child; diff --git a/core/src/main/java/org/opensearch/sql/analysis/ExpressionAnalyzer.java b/core/src/main/java/org/opensearch/sql/analysis/ExpressionAnalyzer.java index eab0eff03c..fa2478d53a 100644 --- a/core/src/main/java/org/opensearch/sql/analysis/ExpressionAnalyzer.java +++ b/core/src/main/java/org/opensearch/sql/analysis/ExpressionAnalyzer.java @@ -24,6 +24,7 @@ import org.opensearch.sql.ast.expression.AggregateFunction; import org.opensearch.sql.ast.expression.AllFields; import org.opensearch.sql.ast.expression.And; +import org.opensearch.sql.ast.expression.Argument; import org.opensearch.sql.ast.expression.Between; import org.opensearch.sql.ast.expression.Case; import org.opensearch.sql.ast.expression.Cast; @@ -400,6 +401,11 @@ public Expression visitUnresolvedArgument(UnresolvedArgument node, AnalysisConte return new NamedArgumentExpression(node.getArgName(), node.getValue().accept(this, context)); } + @Override + public Expression visitArgument(Argument node, AnalysisContext context) { + return new NamedArgumentExpression(node.getArgName(), node.getValue().accept(this, context)); + } + /** * If QualifiedName is actually a reserved metadata field, return the expr type associated with * the metadata field. diff --git a/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java b/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java index 5b2e9f35b4..7971e37921 100644 --- a/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java @@ -53,7 +53,6 @@ import org.opensearch.sql.ast.tree.ML; import org.opensearch.sql.ast.tree.Paginate; import org.opensearch.sql.ast.tree.Parse; -import org.opensearch.sql.ast.tree.Pattern; import org.opensearch.sql.ast.tree.Project; import org.opensearch.sql.ast.tree.RareTopN; import org.opensearch.sql.ast.tree.Relation; @@ -63,6 +62,7 @@ import org.opensearch.sql.ast.tree.TableFunction; import org.opensearch.sql.ast.tree.Trendline; import org.opensearch.sql.ast.tree.Values; +import org.opensearch.sql.ast.tree.Window; /** AST nodes visitor Defines the traverse path. */ public abstract class AbstractNodeVisitor { @@ -216,10 +216,6 @@ public T visitParse(Parse node, C context) { return visitChildren(node, context); } - public T visitPattern(Pattern node, C context) { - return visitChildren(node, context); - } - public T visitLet(Let node, C context) { return visitChildren(node, context); } @@ -331,4 +327,8 @@ public T visitCloseCursor(CloseCursor closeCursor, C context) { public T visitFillNull(FillNull fillNull, C context) { return visitChildren(fillNull, context); } + + public T visitWindow(Window window, C context) { + return visitChildren(window, context); + } } diff --git a/core/src/main/java/org/opensearch/sql/ast/dsl/AstDSL.java b/core/src/main/java/org/opensearch/sql/ast/dsl/AstDSL.java index 1c99697ad4..41400a426b 100644 --- a/core/src/main/java/org/opensearch/sql/ast/dsl/AstDSL.java +++ b/core/src/main/java/org/opensearch/sql/ast/dsl/AstDSL.java @@ -6,8 +6,10 @@ package org.opensearch.sql.ast.dsl; import com.google.common.collect.ImmutableList; +import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.Locale; import java.util.Optional; import java.util.stream.Collectors; import lombok.experimental.UtilityClass; @@ -55,7 +57,6 @@ import org.opensearch.sql.ast.tree.Head; import org.opensearch.sql.ast.tree.Limit; import org.opensearch.sql.ast.tree.Parse; -import org.opensearch.sql.ast.tree.Pattern; import org.opensearch.sql.ast.tree.Project; import org.opensearch.sql.ast.tree.RareTopN; import org.opensearch.sql.ast.tree.RareTopN.CommandType; @@ -68,6 +69,7 @@ import org.opensearch.sql.ast.tree.Trendline; import org.opensearch.sql.ast.tree.UnresolvedPlan; import org.opensearch.sql.ast.tree.Values; +import org.opensearch.sql.ast.tree.Window; /** Class of static methods to create specific node instances. */ @UtilityClass @@ -491,19 +493,23 @@ public static Parse parse( return new Parse(parseMethod, sourceField, pattern, arguments, input); } - public static Pattern pattern( + public static Window window( UnresolvedPlan input, PatternMethod patternMethod, UnresolvedExpression sourceField, String alias, - java.util.Map arguments) { - return new Pattern( + List arguments) { + List funArgs = new ArrayList<>(); + funArgs.add(sourceField); + funArgs.addAll(arguments); + return new Window( new Alias( - "patterns_field", + alias, new WindowFunction( - new Function(patternMethod.getName(), List.of(sourceField)), List.of(), List.of()), + new Function(patternMethod.name().toLowerCase(Locale.ROOT), funArgs), + List.of(), + List.of()), alias), - arguments, input); } diff --git a/core/src/main/java/org/opensearch/sql/ast/expression/PatternMethod.java b/core/src/main/java/org/opensearch/sql/ast/expression/PatternMethod.java index 19029ad989..e75210f9fa 100644 --- a/core/src/main/java/org/opensearch/sql/ast/expression/PatternMethod.java +++ b/core/src/main/java/org/opensearch/sql/ast/expression/PatternMethod.java @@ -1,9 +1,6 @@ /* + * Copyright OpenSearch Contributors * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. */ package org.opensearch.sql.ast.expression; @@ -13,7 +10,7 @@ @RequiredArgsConstructor public enum PatternMethod { - SIMPLE("simple_pattern"), + SIMPLE_PATTERN("simple_pattern"), BRAIN("brain"); @Getter final String name; diff --git a/core/src/main/java/org/opensearch/sql/ast/tree/Pattern.java b/core/src/main/java/org/opensearch/sql/ast/tree/Window.java similarity index 63% rename from core/src/main/java/org/opensearch/sql/ast/tree/Pattern.java rename to core/src/main/java/org/opensearch/sql/ast/tree/Window.java index 08bf670159..d2c8fe5a8e 100644 --- a/core/src/main/java/org/opensearch/sql/ast/tree/Pattern.java +++ b/core/src/main/java/org/opensearch/sql/ast/tree/Window.java @@ -1,16 +1,12 @@ /* + * Copyright OpenSearch Contributors * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. */ package org.opensearch.sql.ast.tree; import com.google.common.collect.ImmutableList; import java.util.List; -import java.util.Map; import lombok.AllArgsConstructor; import lombok.EqualsAndHashCode; import lombok.Getter; @@ -18,7 +14,6 @@ import lombok.Setter; import lombok.ToString; import org.opensearch.sql.ast.AbstractNodeVisitor; -import org.opensearch.sql.ast.expression.Literal; import org.opensearch.sql.ast.expression.UnresolvedExpression; @Getter @@ -27,16 +22,14 @@ @EqualsAndHashCode(callSuper = false) @RequiredArgsConstructor @AllArgsConstructor -public class Pattern extends UnresolvedPlan { +public class Window extends UnresolvedPlan { - private final UnresolvedExpression patternWindowFunction; - - private final Map arguments; + private final UnresolvedExpression windowFunction; private UnresolvedPlan child; @Override - public Pattern attach(UnresolvedPlan child) { + public Window attach(UnresolvedPlan child) { this.child = child; return this; } @@ -48,6 +41,6 @@ public List getChild() { @Override public T accept(AbstractNodeVisitor nodeVisitor, C context) { - return nodeVisitor.visitPattern(this, context); + return nodeVisitor.visitWindow(this, context); } } diff --git a/core/src/main/java/org/opensearch/sql/expression/window/WindowFunctions.java b/core/src/main/java/org/opensearch/sql/expression/window/WindowFunctions.java index 6a44d28521..d2de8302a9 100644 --- a/core/src/main/java/org/opensearch/sql/expression/window/WindowFunctions.java +++ b/core/src/main/java/org/opensearch/sql/expression/window/WindowFunctions.java @@ -6,10 +6,12 @@ package org.opensearch.sql.expression.window; import static java.util.Collections.emptyList; +import static org.opensearch.sql.data.type.ExprCoreType.DOUBLE; +import static org.opensearch.sql.data.type.ExprCoreType.INTEGER; import static org.opensearch.sql.data.type.ExprCoreType.STRING; +import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; -import java.util.Collections; import java.util.function.Supplier; import lombok.experimental.UtilityClass; import org.opensearch.sql.expression.function.BuiltinFunctionName; @@ -56,22 +58,36 @@ private DefaultFunctionResolver denseRank() { private DefaultFunctionResolver brain() { FunctionName functionName = BuiltinFunctionName.BRAIN.getName(); - FunctionSignature functionSignature = - new FunctionSignature(functionName, Collections.singletonList(STRING)); FunctionBuilder functionBuilder = (functionProperties, arguments) -> new BufferPatternWindowFunction(arguments); return new DefaultFunctionResolver( - functionName, ImmutableMap.of(functionSignature, functionBuilder)); + functionName, + ImmutableMap.of( + new FunctionSignature(functionName, ImmutableList.of(STRING)), functionBuilder, + new FunctionSignature(functionName, ImmutableList.of(STRING, STRING)), functionBuilder, + new FunctionSignature(functionName, ImmutableList.of(STRING, INTEGER)), functionBuilder, + new FunctionSignature(functionName, ImmutableList.of(STRING, DOUBLE)), functionBuilder, + new FunctionSignature(functionName, ImmutableList.of(STRING, STRING, INTEGER)), + functionBuilder, + new FunctionSignature(functionName, ImmutableList.of(STRING, STRING, DOUBLE)), + functionBuilder, + new FunctionSignature(functionName, ImmutableList.of(STRING, INTEGER, DOUBLE)), + functionBuilder, + new FunctionSignature(functionName, ImmutableList.of(STRING, STRING, INTEGER, DOUBLE)), + functionBuilder)); } private DefaultFunctionResolver simplePattern() { FunctionName functionName = BuiltinFunctionName.SIMPLE_PATTERN.getName(); - FunctionSignature functionSignature = - new FunctionSignature(functionName, Collections.singletonList(STRING)); FunctionBuilder functionBuilder = (functionProperties, arguments) -> new StreamPatternWindowFunction(arguments); return new DefaultFunctionResolver( - functionName, ImmutableMap.of(functionSignature, functionBuilder)); + functionName, + ImmutableMap.of( + new FunctionSignature(functionName, ImmutableList.of(STRING)), functionBuilder, + new FunctionSignature(functionName, ImmutableList.of(STRING, STRING)), functionBuilder, + new FunctionSignature(functionName, ImmutableList.of(STRING, STRING, STRING)), + functionBuilder)); } private DefaultFunctionResolver rankingFunction( diff --git a/core/src/main/java/org/opensearch/sql/expression/window/frame/BufferPatternRowsWindowFrame.java b/core/src/main/java/org/opensearch/sql/expression/window/frame/BufferPatternRowsWindowFrame.java index 5212d08f63..139d167d19 100644 --- a/core/src/main/java/org/opensearch/sql/expression/window/frame/BufferPatternRowsWindowFrame.java +++ b/core/src/main/java/org/opensearch/sql/expression/window/frame/BufferPatternRowsWindowFrame.java @@ -11,6 +11,7 @@ import com.google.common.collect.PeekingIterator; import java.util.ArrayList; import java.util.List; +import java.util.stream.Collectors; import lombok.EqualsAndHashCode; import lombok.Getter; import lombok.ToString; @@ -52,7 +53,7 @@ public void load(PeekingIterator it) { ExprValue value = sourceField.valueOf(exprValue.bindingTuples()); return value.stringValue(); }) - .toList(); + .collect(Collectors.toList()); this.preprocessedMessages.addAll(logParser.preprocessAllLogs(logMessages)); } diff --git a/core/src/main/java/org/opensearch/sql/expression/window/frame/PeerRowsWindowFrame.java b/core/src/main/java/org/opensearch/sql/expression/window/frame/PeerRowsWindowFrame.java index b5dcfbc599..adda2e2108 100644 --- a/core/src/main/java/org/opensearch/sql/expression/window/frame/PeerRowsWindowFrame.java +++ b/core/src/main/java/org/opensearch/sql/expression/window/frame/PeerRowsWindowFrame.java @@ -37,7 +37,7 @@ public class PeerRowsWindowFrame implements WindowFrame { protected int position; /** Does row at current position represents a new partition. */ - protected boolean isNewPartition = true; + private boolean isNewPartition = true; /** If any more pre-fetched rows not returned to window operator yet. */ @Override @@ -95,11 +95,6 @@ public void load(PeekingIterator it) { loadAllRows(it); } - @Override - public boolean isNewPartition() { - return isNewPartition; - } - protected void loadAllRows(PeekingIterator it) { // Reset state: reset new partition before clearing peers isNewPartition = !isSamePartition(it.peek()); @@ -118,7 +113,12 @@ protected void loadAllRows(PeekingIterator it) { } } - protected boolean isPeer(ExprValue next) { + @Override + public boolean isNewPartition() { + return isNewPartition; + } + + private boolean isPeer(ExprValue next) { List sortFields = windowDefinition.getSortList().stream().map(Pair::getRight).collect(Collectors.toList()); @@ -126,7 +126,7 @@ protected boolean isPeer(ExprValue next) { return resolve(sortFields, last).equals(resolve(sortFields, next)); } - protected boolean isSamePartition(ExprValue next) { + private boolean isSamePartition(ExprValue next) { if (peers.isEmpty()) { return false; } diff --git a/core/src/main/java/org/opensearch/sql/expression/window/frame/StreamPatternRowWindowFrame.java b/core/src/main/java/org/opensearch/sql/expression/window/frame/StreamPatternRowWindowFrame.java deleted file mode 100644 index 76e36a24a5..0000000000 --- a/core/src/main/java/org/opensearch/sql/expression/window/frame/StreamPatternRowWindowFrame.java +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright OpenSearch Contributors - * SPDX-License-Identifier: Apache-2.0 - */ - -package org.opensearch.sql.expression.window.frame; - -import lombok.EqualsAndHashCode; -import lombok.Getter; -import lombok.ToString; -import org.opensearch.sql.expression.parse.PatternsExpression; -import org.opensearch.sql.expression.window.WindowDefinition; - -@Getter -@EqualsAndHashCode(callSuper = true) -@ToString -public class StreamPatternRowWindowFrame extends CurrentRowWindowFrame { - - final PatternsExpression patternsExpression; - - public StreamPatternRowWindowFrame( - WindowDefinition windowDefinition, PatternsExpression patternsExpression) { - super(windowDefinition); - this.patternsExpression = patternsExpression; - } -} diff --git a/core/src/main/java/org/opensearch/sql/expression/window/patterns/BufferPatternWindowFunction.java b/core/src/main/java/org/opensearch/sql/expression/window/patterns/BufferPatternWindowFunction.java index 2ed5b5346a..aaa9e4d29e 100644 --- a/core/src/main/java/org/opensearch/sql/expression/window/patterns/BufferPatternWindowFunction.java +++ b/core/src/main/java/org/opensearch/sql/expression/window/patterns/BufferPatternWindowFunction.java @@ -17,13 +17,13 @@ import org.opensearch.sql.data.type.ExprType; import org.opensearch.sql.expression.Expression; import org.opensearch.sql.expression.FunctionExpression; -import org.opensearch.sql.expression.NamedArgumentExpression; import org.opensearch.sql.expression.env.Environment; import org.opensearch.sql.expression.function.BuiltinFunctionName; import org.opensearch.sql.expression.window.WindowDefinition; import org.opensearch.sql.expression.window.WindowFunctionExpression; import org.opensearch.sql.expression.window.frame.BufferPatternRowsWindowFrame; import org.opensearch.sql.expression.window.frame.WindowFrame; +import org.opensearch.sql.utils.FunctionUtils; @EqualsAndHashCode(callSuper = true) public class BufferPatternWindowFunction extends FunctionExpression @@ -36,30 +36,12 @@ public BufferPatternWindowFunction(List arguments) { @Override public WindowFrame createWindowFrame(WindowDefinition definition) { int variableCountThreshold = - getArguments().stream() - .filter( - expression -> - expression instanceof NamedArgumentExpression - && ((NamedArgumentExpression) expression) - .getArgName() - .equalsIgnoreCase("variable_count_threshold")) - .map( - expression -> - ((NamedArgumentExpression) expression).getValue().valueOf().integerValue()) - .findFirst() + FunctionUtils.getNamedArgumentValue(getArguments(), "variable_count_threshold") + .map(ExprValue::integerValue) .orElse(5); float thresholdPercentage = - getArguments().stream() - .filter( - expression -> - expression instanceof NamedArgumentExpression - && ((NamedArgumentExpression) expression) - .getArgName() - .equalsIgnoreCase("frequency_threshold_percentage")) - .map( - expression -> - ((NamedArgumentExpression) expression).getValue().valueOf().floatValue()) - .findFirst() + FunctionUtils.getNamedArgumentValue(getArguments(), "frequency_threshold_percentage") + .map(ExprValue::floatValue) .orElse(0.3f); return new BufferPatternRowsWindowFrame( definition, diff --git a/core/src/main/java/org/opensearch/sql/expression/window/patterns/StreamPatternWindowFunction.java b/core/src/main/java/org/opensearch/sql/expression/window/patterns/StreamPatternWindowFunction.java index 1e88775caa..2af7b8ec11 100644 --- a/core/src/main/java/org/opensearch/sql/expression/window/patterns/StreamPatternWindowFunction.java +++ b/core/src/main/java/org/opensearch/sql/expression/window/patterns/StreamPatternWindowFunction.java @@ -20,52 +20,45 @@ import org.opensearch.sql.expression.Expression; import org.opensearch.sql.expression.FunctionExpression; import org.opensearch.sql.expression.LiteralExpression; -import org.opensearch.sql.expression.NamedArgumentExpression; import org.opensearch.sql.expression.env.Environment; import org.opensearch.sql.expression.function.BuiltinFunctionName; import org.opensearch.sql.expression.parse.PatternsExpression; import org.opensearch.sql.expression.window.WindowDefinition; import org.opensearch.sql.expression.window.WindowFunctionExpression; -import org.opensearch.sql.expression.window.frame.StreamPatternRowWindowFrame; +import org.opensearch.sql.expression.window.frame.CurrentRowWindowFrame; import org.opensearch.sql.expression.window.frame.WindowFrame; +import org.opensearch.sql.utils.FunctionUtils; @EqualsAndHashCode(callSuper = true) public class StreamPatternWindowFunction extends FunctionExpression implements WindowFunctionExpression { + private final PatternsExpression patternsExpression; + public StreamPatternWindowFunction(List arguments) { super(BuiltinFunctionName.SIMPLE_PATTERN.getName(), arguments); - } - - @Override - public WindowFrame createWindowFrame(WindowDefinition definition) { String pattern = - getArguments().stream() - .filter( - expression -> - expression instanceof NamedArgumentExpression - && ((NamedArgumentExpression) expression) - .getArgName() - .equalsIgnoreCase("pattern")) - .map( - expression -> - ((NamedArgumentExpression) expression).getValue().valueOf().stringValue()) - .findFirst() + FunctionUtils.getNamedArgumentValue(getArguments(), "pattern") + .map(ExprValue::stringValue) .orElse(""); - return new StreamPatternRowWindowFrame( - definition, + this.patternsExpression = new PatternsExpression( getArguments().get(0), new LiteralExpression(new ExprStringValue(pattern)), - new LiteralExpression(new ExprStringValue("")))); + new LiteralExpression(new ExprStringValue(""))); + } + + @Override + public WindowFrame createWindowFrame(WindowDefinition definition) { + return new CurrentRowWindowFrame(definition); } @Override public ExprValue valueOf(Environment valueEnv) { - StreamPatternRowWindowFrame frame = (StreamPatternRowWindowFrame) valueEnv; + CurrentRowWindowFrame frame = (CurrentRowWindowFrame) valueEnv; ExprValue sourceFieldValue = - frame.getPatternsExpression().getSourceField().valueOf(frame.current().bindingTuples()); - return frame.getPatternsExpression().parseValue(sourceFieldValue); + patternsExpression.getSourceField().valueOf(frame.current().bindingTuples()); + return patternsExpression.parseValue(sourceFieldValue); } @Override diff --git a/core/src/main/java/org/opensearch/sql/utils/FunctionUtils.java b/core/src/main/java/org/opensearch/sql/utils/FunctionUtils.java new file mode 100644 index 0000000000..425cd03f12 --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/utils/FunctionUtils.java @@ -0,0 +1,28 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.utils; + +import java.util.List; +import java.util.Optional; +import org.opensearch.sql.data.model.ExprValue; +import org.opensearch.sql.expression.Expression; +import org.opensearch.sql.expression.NamedArgumentExpression; + +public final class FunctionUtils { + + public static Optional getNamedArgumentValue( + List arguments, String argName) { + return arguments.stream() + .filter( + expression -> + expression instanceof NamedArgumentExpression + && ((NamedArgumentExpression) expression) + .getArgName() + .equalsIgnoreCase(argName)) + .map(expression -> ((NamedArgumentExpression) expression).getValue().valueOf()) + .findFirst(); + } +} diff --git a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java index 980031f9be..415b2bb17d 100644 --- a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java +++ b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java @@ -33,12 +33,12 @@ import java.util.List; import java.util.Locale; import java.util.Optional; +import java.util.concurrent.atomic.AtomicReference; import java.util.stream.Collectors; import lombok.RequiredArgsConstructor; import org.antlr.v4.runtime.ParserRuleContext; import org.antlr.v4.runtime.Token; import org.antlr.v4.runtime.tree.ParseTree; -import org.opensearch.sql.ast.dsl.AstDSL; import org.opensearch.sql.ast.expression.Alias; import org.opensearch.sql.ast.expression.Argument; import org.opensearch.sql.ast.expression.Field; @@ -61,7 +61,6 @@ import org.opensearch.sql.ast.tree.Kmeans; import org.opensearch.sql.ast.tree.ML; import org.opensearch.sql.ast.tree.Parse; -import org.opensearch.sql.ast.tree.Pattern; import org.opensearch.sql.ast.tree.Project; import org.opensearch.sql.ast.tree.RareTopN; import org.opensearch.sql.ast.tree.RareTopN.CommandType; @@ -71,6 +70,7 @@ import org.opensearch.sql.ast.tree.TableFunction; import org.opensearch.sql.ast.tree.Trendline; import org.opensearch.sql.ast.tree.UnresolvedPlan; +import org.opensearch.sql.ast.tree.Window; import org.opensearch.sql.common.utils.StringUtils; import org.opensearch.sql.expression.function.BuiltinFunctionName; import org.opensearch.sql.ppl.antlr.parser.OpenSearchPPLParser; @@ -291,33 +291,32 @@ public UnresolvedPlan visitPatternsCommand(OpenSearchPPLParser.PatternsCommandCo UnresolvedExpression sourceField = internalVisitExpression(ctx.source_field); List unresolvedArguments = new ArrayList<>(); unresolvedArguments.add(sourceField); - ImmutableMap.Builder builder = ImmutableMap.builder(); + AtomicReference alias = new AtomicReference<>("patterns_field"); ctx.patternsParameter() .forEach( x -> { - builder.put( - x.children.get(0).toString(), - (Literal) internalVisitExpression(x.children.get(2))); - unresolvedArguments.add( - new Argument( - x.children.get(0).toString(), - (Literal) internalVisitExpression(x.children.get(2)))); + String argName = x.children.get(0).toString(); + Literal value = (Literal) internalVisitExpression(x.children.get(2)); + if ("new_field".equalsIgnoreCase(argName)) { + alias.set((String) value.getValue()); + } + unresolvedArguments.add(new Argument(argName, value)); }); - java.util.Map arguments = builder.build(); - Literal alias = arguments.getOrDefault("new_field", AstDSL.stringLiteral("patterns_field")); - return new Pattern( + return new Window( new Alias( - "patterns_field", + alias.get(), new WindowFunction( new Function( ctx.pattern_method != null - ? ctx.pattern_method.getText().toLowerCase(Locale.ROOT) - : BuiltinFunctionName.BRAIN.name(), // By default, use new algorithm + ? StringUtils.unquoteIdentifier(ctx.pattern_method.getText()) + .toLowerCase(Locale.ROOT) + : BuiltinFunctionName.BRAIN + .name() + .toLowerCase(Locale.ROOT), // By default, use new algorithm unresolvedArguments), List.of(), // ignore partition by list for now as we haven't seen such requirement List.of()), // ignore sort by list for now as we haven't seen such requirement - (String) alias.getValue()), - arguments); + alias.get())); } /** Top command. */ diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java index 683da11657..4686164948 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java @@ -42,6 +42,7 @@ import static org.opensearch.sql.ast.dsl.AstDSL.tableFunction; import static org.opensearch.sql.ast.dsl.AstDSL.trendline; import static org.opensearch.sql.ast.dsl.AstDSL.unresolvedArg; +import static org.opensearch.sql.ast.dsl.AstDSL.window; import static org.opensearch.sql.ast.tree.Trendline.TrendlineType.SMA; import static org.opensearch.sql.utils.SystemIndexUtils.DATASOURCES_TABLE_NAME; import static org.opensearch.sql.utils.SystemIndexUtils.mappingTable; @@ -55,10 +56,12 @@ import org.junit.Test; import org.junit.rules.ExpectedException; import org.opensearch.sql.ast.Node; +import org.opensearch.sql.ast.expression.Argument; import org.opensearch.sql.ast.expression.DataType; import org.opensearch.sql.ast.expression.Field; import org.opensearch.sql.ast.expression.Literal; import org.opensearch.sql.ast.expression.ParseMethod; +import org.opensearch.sql.ast.expression.PatternMethod; import org.opensearch.sql.ast.expression.SpanUnit; import org.opensearch.sql.ast.tree.AD; import org.opensearch.sql.ast.tree.FillNull; @@ -821,6 +824,41 @@ public void testShowDataSourcesCommand() { assertEqual("show datasources", relation(DATASOURCES_TABLE_NAME)); } + @Test + public void testPatternsCommand() { + assertEqual( + "source=t | patterns new_field=\"custom_field\" pattern=\"custom_pattern\" raw" + + " SIMPLE_PATTERN", + window( + relation("t"), + PatternMethod.SIMPLE_PATTERN, + field("raw"), + "custom_field", + Arrays.asList( + new Argument("new_field", new Literal("custom_field", DataType.STRING)), + new Argument("pattern", new Literal("custom_pattern", DataType.STRING))))); + + assertEqual( + "source=t | patterns variable_count_threshold=2 frequency_threshold_percentage=0.1 raw", + window( + relation("t"), + PatternMethod.BRAIN, + field("raw"), + "patterns_field", + Arrays.asList( + new Argument("variable_count_threshold", new Literal(2, DataType.INTEGER)), + new Argument( + "frequency_threshold_percentage", new Literal(0.1, DataType.DOUBLE))))); + } + + @Test + public void testPatternsWithoutArguments() { + assertEqual( + "source=t | patterns raw", + window( + relation("t"), PatternMethod.BRAIN, field("raw"), "patterns_field", Arrays.asList())); + } + protected void assertEqual(String query, Node expectedPlan) { Node actualPlan = plan(query); assertEquals(expectedPlan, actualPlan); From 9c6327a4c0f06cde44011da05ae2007ef440f7d3 Mon Sep 17 00:00:00 2001 From: Songkan Tang Date: Tue, 28 Jan 2025 21:32:25 +0800 Subject: [PATCH 4/6] Add more unit tests Signed-off-by: Songkan Tang --- .../org/opensearch/sql/expression/DSL.java | 8 +- .../opensearch/sql/analysis/AnalyzerTest.java | 117 ++++++++++++++++++ .../parse/PatternsExpressionTest.java | 89 +++++++++++++ .../BufferPatternRowsWindowFrameTest.java | 88 +++++++++++++ .../BufferPatternRowsWindowFunctionTest.java | 77 ++++++++++++ .../StreamPatternWindowFunctionTest.java | 77 ++++++++++++ 6 files changed, 452 insertions(+), 4 deletions(-) create mode 100644 core/src/test/java/org/opensearch/sql/expression/parse/PatternsExpressionTest.java create mode 100644 core/src/test/java/org/opensearch/sql/expression/window/frame/BufferPatternRowsWindowFrameTest.java create mode 100644 core/src/test/java/org/opensearch/sql/expression/window/patterns/BufferPatternRowsWindowFunctionTest.java create mode 100644 core/src/test/java/org/opensearch/sql/expression/window/patterns/StreamPatternWindowFunctionTest.java diff --git a/core/src/main/java/org/opensearch/sql/expression/DSL.java b/core/src/main/java/org/opensearch/sql/expression/DSL.java index fc449d74b5..3c146c906b 100644 --- a/core/src/main/java/org/opensearch/sql/expression/DSL.java +++ b/core/src/main/java/org/opensearch/sql/expression/DSL.java @@ -973,12 +973,12 @@ public static FunctionExpression utc_timestamp( return compile(functionProperties, BuiltinFunctionName.UTC_TIMESTAMP, args); } - public static FunctionExpression brain() { - return compile(FunctionProperties.None, BuiltinFunctionName.BRAIN); + public static FunctionExpression brain(Expression... args) { + return compile(FunctionProperties.None, BuiltinFunctionName.BRAIN, args); } - public static FunctionExpression simple_pattern() { - return compile(FunctionProperties.None, BuiltinFunctionName.SIMPLE_PATTERN); + public static FunctionExpression simple_pattern(Expression... args) { + return compile(FunctionProperties.None, BuiltinFunctionName.SIMPLE_PATTERN, args); } @SuppressWarnings("unchecked") diff --git a/core/src/test/java/org/opensearch/sql/analysis/AnalyzerTest.java b/core/src/test/java/org/opensearch/sql/analysis/AnalyzerTest.java index aa3499bd44..3318c66d98 100644 --- a/core/src/test/java/org/opensearch/sql/analysis/AnalyzerTest.java +++ b/core/src/test/java/org/opensearch/sql/analysis/AnalyzerTest.java @@ -80,6 +80,7 @@ import org.opensearch.sql.ast.expression.HighlightFunction; import org.opensearch.sql.ast.expression.Literal; import org.opensearch.sql.ast.expression.ParseMethod; +import org.opensearch.sql.ast.expression.PatternMethod; import org.opensearch.sql.ast.expression.ScoreFunction; import org.opensearch.sql.ast.expression.SpanUnit; import org.opensearch.sql.ast.tree.AD; @@ -1819,4 +1820,120 @@ public void visit_close_cursor() { () -> assertEquals("pewpew", ((LogicalFetchCursor) analyzed.getChild().get(0)).getCursor())); } + + @Test + public void simple_pattern_window_function_with_no_additional_args() { + UnresolvedPlan unresolvedWindow = + AstDSL.project( + AstDSL.window( + AstDSL.relation("schema"), + PatternMethod.SIMPLE_PATTERN, + AstDSL.field("string_value"), + "patterns_field", + ImmutableList.of()), + AstDSL.field("string_value")); + LogicalPlan expectedPlan = + LogicalPlanDSL.project( + LogicalPlanDSL.window( + LogicalPlanDSL.relation("schema", table), + DSL.named( + "patterns_field", + DSL.simple_pattern(DSL.ref("string_value", STRING)), + "patterns_field"), + new WindowDefinition(ImmutableList.of(), ImmutableList.of())), + DSL.named("string_value", DSL.ref("string_value", STRING))); + + assertAnalyzeEqual(expectedPlan, unresolvedWindow); + } + + @Test + public void simple_pattern_window_function() { + UnresolvedPlan unresolvedWindow = + AstDSL.project( + AstDSL.window( + AstDSL.relation("schema"), + PatternMethod.SIMPLE_PATTERN, + AstDSL.field("string_value"), + "custom_field", + ImmutableList.of( + new Argument( + "pattern", AstDSL.stringLiteral("[0-9]")))), // with pattern argument + AstDSL.field("string_value")); + LogicalPlan expectedPlan = + LogicalPlanDSL.project( + LogicalPlanDSL.window( + LogicalPlanDSL.relation("schema", table), + DSL.named( + "custom_field", + DSL.simple_pattern( + DSL.ref("string_value", STRING), + DSL.namedArgument( + "pattern", DSL.literal("[0-9]"))), // with additional pattern argument + "custom_field"), + new WindowDefinition(ImmutableList.of(), ImmutableList.of())), + DSL.named("string_value", DSL.ref("string_value", STRING))); + + assertAnalyzeEqual(expectedPlan, unresolvedWindow); + } + + @Test + public void brain_window_function_with_no_additional_args() { + UnresolvedPlan unresolvedWindow = + AstDSL.project( + AstDSL.window( + AstDSL.relation("schema"), + PatternMethod.BRAIN, + AstDSL.field("string_value"), + "patterns_field", + ImmutableList.of()), + AstDSL.field("string_value")); + LogicalPlan expectedPlan = + LogicalPlanDSL.project( + LogicalPlanDSL.window( + LogicalPlanDSL.relation("schema", table), + DSL.named( + "patterns_field", DSL.brain(DSL.ref("string_value", STRING)), "patterns_field"), + new WindowDefinition(ImmutableList.of(), ImmutableList.of())), + DSL.named("string_value", DSL.ref("string_value", STRING))); + + assertAnalyzeEqual(expectedPlan, unresolvedWindow); + } + + @Test + public void brain_window_function() { + UnresolvedPlan unresolvedWindow = + AstDSL.project( + AstDSL.window( + AstDSL.relation("schema"), + PatternMethod.BRAIN, + AstDSL.field("string_value"), + "custom_field", + ImmutableList.of( + new Argument( + "variable_count_threshold", AstDSL.intLiteral(10)), // with integer argument + new Argument( + "frequency_threshold_percentage", + AstDSL.doubleLiteral(0.1)) // with double argument + )), + AstDSL.field("string_value")); + LogicalPlan expectedPlan = + LogicalPlanDSL.project( + LogicalPlanDSL.window( + LogicalPlanDSL.relation("schema", table), + DSL.named( + "custom_field", + DSL.brain( + DSL.ref("string_value", STRING), + DSL.namedArgument( + "variable_count_threshold", + DSL.literal(10)), // with additional integer argument + DSL.namedArgument( + "frequency_threshold_percentage", + DSL.literal(0.1))), // with additional double argument + "custom_field"), + new WindowDefinition(ImmutableList.of(), ImmutableList.of())), + DSL.named("string_value", DSL.ref("string_value", STRING))); + + assertAnalyzeEqual(expectedPlan, unresolvedWindow); + } } diff --git a/core/src/test/java/org/opensearch/sql/expression/parse/PatternsExpressionTest.java b/core/src/test/java/org/opensearch/sql/expression/parse/PatternsExpressionTest.java new file mode 100644 index 0000000000..7237f0673b --- /dev/null +++ b/core/src/test/java/org/opensearch/sql/expression/parse/PatternsExpressionTest.java @@ -0,0 +1,89 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.expression.parse; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.mockito.Mockito.when; +import static org.opensearch.sql.config.TestConfig.STRING_TYPE_MISSING_VALUE_FIELD; +import static org.opensearch.sql.config.TestConfig.STRING_TYPE_NULL_VALUE_FIELD; +import static org.opensearch.sql.data.model.ExprValueUtils.LITERAL_NULL; +import static org.opensearch.sql.data.model.ExprValueUtils.stringValue; +import static org.opensearch.sql.data.type.ExprCoreType.BOOLEAN; +import static org.opensearch.sql.data.type.ExprCoreType.STRING; + +import org.junit.jupiter.api.DisplayNameGeneration; +import org.junit.jupiter.api.DisplayNameGenerator; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; +import org.opensearch.sql.data.model.ExprValue; +import org.opensearch.sql.exception.SemanticCheckException; +import org.opensearch.sql.expression.DSL; +import org.opensearch.sql.expression.Expression; +import org.opensearch.sql.expression.ExpressionTestBase; +import org.opensearch.sql.expression.env.Environment; + +@ExtendWith(MockitoExtension.class) +@DisplayNameGeneration(DisplayNameGenerator.ReplaceUnderscores.class) +class PatternsExpressionTest extends ExpressionTestBase { + + @Mock Environment env; + + @Test + public void resolve_value() { + when(DSL.ref("log_value", STRING).valueOf(env)) + .thenReturn( + stringValue( + "145.128.75.121 - - [29/Aug/2022:13:26:44 -0700] \"GET /deliverables HTTP/2.0\" 501" + + " 2721")); + assertEquals( + stringValue("... - - [//::: -] \" / /.\" "), + DSL.patterns(DSL.ref("log_value", STRING), DSL.literal(""), DSL.literal("punct_field")) + .valueOf(env)); + assertEquals( + stringValue("... - - [/Aug/::: -] \"GET /deliverables HTTP/.\" "), + DSL.patterns(DSL.ref("log_value", STRING), DSL.literal("[0-9]"), DSL.literal("regex_field")) + .valueOf(env)); + } + + @Test + public void resolve_null_and_missing_values() { + assertEquals( + LITERAL_NULL, + DSL.patterns( + DSL.ref(STRING_TYPE_NULL_VALUE_FIELD, STRING), + DSL.literal("pattern"), + DSL.literal("patterns_field")) + .valueOf(valueEnv())); + assertEquals( + LITERAL_NULL, + DSL.patterns( + DSL.ref(STRING_TYPE_MISSING_VALUE_FIELD, STRING), + DSL.literal("pattern"), + DSL.literal("patterns_field")) + .valueOf(valueEnv())); + } + + @Test + public void resolve_type() { + assertEquals( + STRING, + DSL.patterns(DSL.ref("string_value", STRING), DSL.literal("pattern"), DSL.literal("group")) + .type()); + } + + @Test + public void throws_semantic_exception_if_value_type_is_not_string() { + assertThrows( + SemanticCheckException.class, + () -> + DSL.patterns( + DSL.ref("boolean_value", BOOLEAN), DSL.literal("pattern"), DSL.literal("group")) + .valueOf(valueEnv())); + } +} diff --git a/core/src/test/java/org/opensearch/sql/expression/window/frame/BufferPatternRowsWindowFrameTest.java b/core/src/test/java/org/opensearch/sql/expression/window/frame/BufferPatternRowsWindowFrameTest.java new file mode 100644 index 0000000000..f946e1b0c7 --- /dev/null +++ b/core/src/test/java/org/opensearch/sql/expression/window/frame/BufferPatternRowsWindowFrameTest.java @@ -0,0 +1,88 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.expression.window.frame; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.opensearch.sql.data.model.ExprTupleValue.fromExprValueMap; +import static org.opensearch.sql.data.type.ExprCoreType.STRING; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Iterators; +import com.google.common.collect.PeekingIterator; +import org.junit.jupiter.api.DisplayNameGeneration; +import org.junit.jupiter.api.DisplayNameGenerator; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.junit.jupiter.MockitoExtension; +import org.opensearch.sql.common.patterns.BrainLogParser; +import org.opensearch.sql.data.model.ExprStringValue; +import org.opensearch.sql.data.model.ExprValue; +import org.opensearch.sql.expression.NamedArgumentExpression; +import org.opensearch.sql.expression.ReferenceExpression; +import org.opensearch.sql.expression.window.WindowDefinition; + +@DisplayNameGeneration(DisplayNameGenerator.ReplaceUnderscores.class) +@ExtendWith(MockitoExtension.class) +public class BufferPatternRowsWindowFrameTest { + + private final BufferPatternRowsWindowFrame windowFrame = + new BufferPatternRowsWindowFrame( + new WindowDefinition(ImmutableList.of(), ImmutableList.of()), + LOG_PARSER, + new NamedArgumentExpression("message", new ReferenceExpression("message", STRING))); + + @Test + void test_single_partition_for_all_rows() { + PeekingIterator tuples = + Iterators.peekingIterator( + Iterators.forArray( + tuple(TEST_MESSAGE_1), tuple(TEST_MESSAGE_2), tuple(TEST_MESSAGE_3))); + + windowFrame.load(tuples); + assertTrue(windowFrame.isNewPartition()); + assertEquals( + LOG_PARSER.preprocess(TEST_MESSAGE_1, "0"), windowFrame.currentPreprocessedMessage()); + assertEquals( + ImmutableList.of(tuple(TEST_MESSAGE_1), tuple(TEST_MESSAGE_2), tuple(TEST_MESSAGE_3)), + windowFrame.next()); + assertTrue(windowFrame.hasNext()); + + windowFrame.load(tuples); + assertFalse(windowFrame.isNewPartition()); + assertEquals( + LOG_PARSER.preprocess(TEST_MESSAGE_2, "1"), windowFrame.currentPreprocessedMessage()); + assertEquals(ImmutableList.of(), windowFrame.next()); + assertTrue(windowFrame.hasNext()); + + windowFrame.load(tuples); + assertFalse(windowFrame.isNewPartition()); + assertEquals( + LOG_PARSER.preprocess(TEST_MESSAGE_3, "2"), windowFrame.currentPreprocessedMessage()); + assertEquals(ImmutableList.of(), windowFrame.next()); + assertFalse(windowFrame.hasNext()); + } + + private ExprValue tuple(String message) { + return fromExprValueMap(ImmutableMap.of("message", new ExprStringValue(message))); + } + + private static final String TEST_MESSAGE_1 = + "12.132.31.17 - - [2018-07-22T05:36:25.812Z] \\\"GET /opensearch HTTP/1.1\\\" 200 9797" + + " \\\"-\\\" \\\"Mozilla/5.0 (X11; Linux x86_64; rv:6.0a1) Gecko/20110421" + + " Firefox/6.0a1\\\""; + private static final String TEST_MESSAGE_2 = + "129.138.185.193 - - [2018-07-22T05:39:39.668Z] \\\"GET /opensearch HTTP/1.1\\\" 404 9920" + + " \\\"-\\\" \\\"Mozilla/5.0 (X11; Linux x86_64; rv:6.0a1) Gecko/20110421" + + " Firefox/6.0a1\\\""; + private static final String TEST_MESSAGE_3 = + "240.58.187.246 - - [2018-07-22T06:02:46.006Z] \\\"GET /opensearch HTTP/1.1\\\" 500 6936" + + " \\\"-\\\" \\\"Mozilla/5.0 (X11; Linux i686) AppleWebKit/534.24 (KHTML, like Gecko)" + + " Chrome/11.0.696.50 Safari/534.24\\\""; + private static final BrainLogParser LOG_PARSER = new BrainLogParser(); +} diff --git a/core/src/test/java/org/opensearch/sql/expression/window/patterns/BufferPatternRowsWindowFunctionTest.java b/core/src/test/java/org/opensearch/sql/expression/window/patterns/BufferPatternRowsWindowFunctionTest.java new file mode 100644 index 0000000000..2747adb113 --- /dev/null +++ b/core/src/test/java/org/opensearch/sql/expression/window/patterns/BufferPatternRowsWindowFunctionTest.java @@ -0,0 +1,77 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.expression.window.patterns; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.opensearch.sql.data.model.ExprTupleValue.fromExprValueMap; +import static org.opensearch.sql.data.type.ExprCoreType.STRING; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Iterators; +import com.google.common.collect.PeekingIterator; +import java.util.Arrays; +import java.util.List; +import org.junit.jupiter.api.Test; +import org.opensearch.sql.common.patterns.BrainLogParser; +import org.opensearch.sql.data.model.ExprStringValue; +import org.opensearch.sql.data.model.ExprValue; +import org.opensearch.sql.expression.DSL; +import org.opensearch.sql.expression.NamedArgumentExpression; +import org.opensearch.sql.expression.ReferenceExpression; +import org.opensearch.sql.expression.window.WindowDefinition; +import org.opensearch.sql.expression.window.frame.BufferPatternRowsWindowFrame; + +public class BufferPatternRowsWindowFunctionTest { + + private final BufferPatternRowsWindowFrame windowFrame = + new BufferPatternRowsWindowFrame( + new WindowDefinition(ImmutableList.of(), ImmutableList.of()), + LOG_PARSER, + new NamedArgumentExpression("message", new ReferenceExpression("message", STRING))); + + @Test + void test_value_of() { + PeekingIterator tuples = + Iterators.peekingIterator( + Iterators.forArray( + tuple(TEST_MESSAGE_1), tuple(TEST_MESSAGE_2), tuple(TEST_MESSAGE_3))); + + BufferPatternWindowFunction brain = + (BufferPatternWindowFunction) DSL.brain(DSL.ref("message", STRING)); + List> preprocessedMessages = + LOG_PARSER.preprocessAllLogs(Arrays.asList(TEST_MESSAGE_1, TEST_MESSAGE_2, TEST_MESSAGE_3)); + windowFrame.load(tuples); + + assertEquals( + String.join(" ", LOG_PARSER.parseLogPattern(preprocessedMessages.get(0))), + brain.valueOf(windowFrame).stringValue()); + assertEquals( + String.join(" ", LOG_PARSER.parseLogPattern(preprocessedMessages.get(1))), + brain.valueOf(windowFrame).stringValue()); + assertEquals( + String.join(" ", LOG_PARSER.parseLogPattern(preprocessedMessages.get(2))), + brain.valueOf(windowFrame).stringValue()); + } + + private ExprValue tuple(String message) { + return fromExprValueMap(ImmutableMap.of("message", new ExprStringValue(message))); + } + + private static final String TEST_MESSAGE_1 = + "12.132.31.17 - - [2018-07-22T05:36:25.812Z] \\\"GET /opensearch HTTP/1.1\\\" 200 9797" + + " \\\"-\\\" \\\"Mozilla/5.0 (X11; Linux x86_64; rv:6.0a1) Gecko/20110421" + + " Firefox/6.0a1\\\""; + private static final String TEST_MESSAGE_2 = + "129.138.185.193 - - [2018-07-22T05:39:39.668Z] \\\"GET /opensearch HTTP/1.1\\\" 404 9920" + + " \\\"-\\\" \\\"Mozilla/5.0 (X11; Linux x86_64; rv:6.0a1) Gecko/20110421" + + " Firefox/6.0a1\\\""; + private static final String TEST_MESSAGE_3 = + "240.58.187.246 - - [2018-07-22T06:02:46.006Z] \\\"GET /opensearch HTTP/1.1\\\" 500 6936" + + " \\\"-\\\" \\\"Mozilla/5.0 (X11; Linux i686) AppleWebKit/534.24 (KHTML, like Gecko)" + + " Chrome/11.0.696.50 Safari/534.24\\\""; + private static final BrainLogParser LOG_PARSER = new BrainLogParser(); +} diff --git a/core/src/test/java/org/opensearch/sql/expression/window/patterns/StreamPatternWindowFunctionTest.java b/core/src/test/java/org/opensearch/sql/expression/window/patterns/StreamPatternWindowFunctionTest.java new file mode 100644 index 0000000000..2b1492b72e --- /dev/null +++ b/core/src/test/java/org/opensearch/sql/expression/window/patterns/StreamPatternWindowFunctionTest.java @@ -0,0 +1,77 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.expression.window.patterns; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.opensearch.sql.data.model.ExprTupleValue.fromExprValueMap; +import static org.opensearch.sql.data.type.ExprCoreType.STRING; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Iterators; +import com.google.common.collect.PeekingIterator; +import org.junit.jupiter.api.Test; +import org.opensearch.sql.common.patterns.BrainLogParser; +import org.opensearch.sql.data.model.ExprStringValue; +import org.opensearch.sql.data.model.ExprValue; +import org.opensearch.sql.expression.DSL; +import org.opensearch.sql.expression.LiteralExpression; +import org.opensearch.sql.expression.parse.PatternsExpression; +import org.opensearch.sql.expression.window.WindowDefinition; +import org.opensearch.sql.expression.window.frame.CurrentRowWindowFrame; + +public class StreamPatternWindowFunctionTest { + + private final CurrentRowWindowFrame windowFrame = + new CurrentRowWindowFrame(new WindowDefinition(ImmutableList.of(), ImmutableList.of())); + + @Test + void test_value_of() { + PeekingIterator tuples = + Iterators.peekingIterator( + Iterators.forArray( + tuple(TEST_MESSAGE_1), tuple(TEST_MESSAGE_2), tuple(TEST_MESSAGE_3))); + + StreamPatternWindowFunction simplePattern = + (StreamPatternWindowFunction) DSL.simple_pattern(DSL.ref("message", STRING)); + PatternsExpression patternsExpression = + DSL.patterns( + DSL.ref("message", STRING), + new LiteralExpression(new ExprStringValue("")), + new LiteralExpression(new ExprStringValue(""))); + + windowFrame.load(tuples); + assertEquals( + patternsExpression.parseValue(new ExprStringValue(TEST_MESSAGE_1)), + simplePattern.valueOf(windowFrame)); + windowFrame.load(tuples); + assertEquals( + patternsExpression.parseValue(new ExprStringValue(TEST_MESSAGE_2)), + simplePattern.valueOf(windowFrame)); + windowFrame.load(tuples); + assertEquals( + patternsExpression.parseValue(new ExprStringValue(TEST_MESSAGE_3)), + simplePattern.valueOf(windowFrame)); + } + + private ExprValue tuple(String message) { + return fromExprValueMap(ImmutableMap.of("message", new ExprStringValue(message))); + } + + private static final String TEST_MESSAGE_1 = + "12.132.31.17 - - [2018-07-22T05:36:25.812Z] \\\"GET /opensearch HTTP/1.1\\\" 200 9797" + + " \\\"-\\\" \\\"Mozilla/5.0 (X11; Linux x86_64; rv:6.0a1) Gecko/20110421" + + " Firefox/6.0a1\\\""; + private static final String TEST_MESSAGE_2 = + "129.138.185.193 - - [2018-07-22T05:39:39.668Z] \\\"GET /opensearch HTTP/1.1\\\" 404 9920" + + " \\\"-\\\" \\\"Mozilla/5.0 (X11; Linux x86_64; rv:6.0a1) Gecko/20110421" + + " Firefox/6.0a1\\\""; + private static final String TEST_MESSAGE_3 = + "240.58.187.246 - - [2018-07-22T06:02:46.006Z] \\\"GET /opensearch HTTP/1.1\\\" 500 6936" + + " \\\"-\\\" \\\"Mozilla/5.0 (X11; Linux i686) AppleWebKit/534.24 (KHTML, like Gecko)" + + " Chrome/11.0.696.50 Safari/534.24\\\""; + private static final BrainLogParser LOG_PARSER = new BrainLogParser(); +} From bda53df4a4c975552091cf0d184bdf1dc9a184c6 Mon Sep 17 00:00:00 2001 From: Songkan Tang Date: Tue, 28 Jan 2025 22:11:57 +0800 Subject: [PATCH 5/6] Amend patterns user facing doc Signed-off-by: Songkan Tang --- docs/user/ppl/cmd/patterns.rst | 72 ++++++++++++++++++++++++++++++---- 1 file changed, 65 insertions(+), 7 deletions(-) diff --git a/docs/user/ppl/cmd/patterns.rst b/docs/user/ppl/cmd/patterns.rst index 13f08d0aa6..fa5b6178ae 100644 --- a/docs/user/ppl/cmd/patterns.rst +++ b/docs/user/ppl/cmd/patterns.rst @@ -12,24 +12,45 @@ patterns Description ============ | The ``patterns`` command extracts log patterns from a text field and appends the results to the search result. Grouping logs by their patterns makes it easier to aggregate stats from large volumes of log data for analysis and troubleshooting. +| ``patterns`` command now allows user to select different log parsing algorithms to get high log pattern grouping accuracy. Two pattern methods are supported, aka ``simple_pattern`` and ``brain``. +| ``simple_pattern`` algorithm is basically a regex parsing method vs ``brain`` algorithm is an automatic log grouping algorithm with high accuracy and keeps semantic meaning. Syntax ============ -patterns [new_field=] [pattern=] +patterns [new_field=] (algorithm parameters...) + +Simple Pattern +============ +patterns [new_field=] [pattern=] SIMPLE_PATTERN * new-field-name: optional string. The name of the new field for extracted patterns, default is ``patterns_field``. If the name already exists, it will replace the original field. * pattern: optional string. The regex pattern of characters that should be filtered out from the text field. If absent, the default pattern is alphanumeric characters (``[a-zA-Z\d]``). * field: mandatory. The field must be a text field. +* SIMPLE_PATTERN: Specify pattern method to be simple_pattern. By default, it's brain if not specified. + +Brain +============ +patterns [new_field=] [variable_count_threshold=] [frequency_threshold_percentage=] BRAIN -Example 1: Create the new field +or + +patterns [new_field=] [variable_count_threshold=] [frequency_threshold_percentage=] + +* new-field-name: optional string. The name of the new field for extracted patterns, default is ``patterns_field``. If the name already exists, it will replace the original field. +* variable_count_threshold: optional integer. Number of tokens in the group per position as variable threshold in case of word tokens appear rarely. +* frequency_threshold_percentage: optional double. To select longest word combination frequency, it needs a lower bound of frequency. The representative frequency of longest word combination should be >= highest token frequency of log * threshold percentage +* field: mandatory. The field must be a text field. +* BRAIN: Specify pattern method to be simple_pattern. By default, it's brain if not specified. + +Simple Pattern Example 1: Create the new field =============================== The example shows how to use extract punctuations in ``email`` for each document. Parsing a null field will return an empty string. PPL query:: - os> source=accounts | patterns email | fields email, patterns_field ; + os> source=accounts | patterns email SIMPLE_PATTERN | fields email, patterns_field ; fetched rows / total rows = 4/4 +-----------------------+----------------+ | email | patterns_field | @@ -40,14 +61,14 @@ PPL query:: | daleadams@boink.com | @. | +-----------------------+----------------+ -Example 2: Extract log patterns +Simple Pattern Example 2: Extract log patterns =============================== The example shows how to extract punctuations from a raw log field using the default patterns. PPL query:: - os> source=apache | patterns message | fields message, patterns_field ; + os> source=apache | patterns message SIMPLE_PATTERN | fields message, patterns_field ; fetched rows / total rows = 4/4 +-----------------------------------------------------------------------------------------------------------------------------+---------------------------------+ | message | patterns_field | @@ -58,14 +79,14 @@ PPL query:: | 210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481 | ... - - [//::: -] " / /." | +-----------------------------------------------------------------------------------------------------------------------------+---------------------------------+ -Example 3: Extract log patterns with custom regex pattern +Simple Pattern Example 3: Extract log patterns with custom regex pattern ========================================================= The example shows how to extract punctuations from a raw log field using user defined patterns. PPL query:: - os> source=apache | patterns new_field='no_numbers' pattern='[0-9]' message | fields message, no_numbers ; + os> source=apache | patterns new_field='no_numbers' pattern='[0-9]' message SIMPLE_PATTERN | fields message, no_numbers ; fetched rows / total rows = 4/4 +-----------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------+ | message | no_numbers | @@ -76,6 +97,43 @@ PPL query:: | 210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481 | ... - - [/Sep/::: -] "POST /users HTTP/." | +-----------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------+ +Brain Example 1: Extract log patterns +=============================== + +The example shows how to extract semantic meaningful log patterns from a raw log field using the brain algorithm. The default variable count threshold is 5. + +PPL query:: + + os> source=apache | patterns message | fields message, patterns_field ; + fetched rows / total rows = 4/4 + +-----------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+ + | message | patterns_field | + |-----------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------| + | 177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] "HEAD /e-business/mindshare HTTP/<*><*>" 404 <*> | + | 127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] "GET /architectures/convergence/niches/mindshare HTTP/<*><*>" 100 <*> | + | 118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439 | <*IP*> - - [<*>/Sep/<*>:<*>:<*>:<*> <*>] "PATCH /strategize/out-of-the-box HTTP/<*><*>" 401 <*> | + | 210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481 | <*IP*> - - [<*>/Sep/<*>:<*>:<*>:<*> <*>] "POST /users HTTP/<*><*>" 301 <*> | + +-----------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+ + +Brain Example 3: Extract log patterns with custom parameters +=============================== + +The example shows how to extract semantic meaningful log patterns from a raw log field using defined parameter of brain algorithm. + +PPL query:: + + os> source=apache | patterns variable_count_threshold=2 message | fields message, patterns_field ; + fetched rows / total rows = 4/4 + +-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------+ + | message | patterns_field | + |-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------| + | 177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] <*> <*> HTTP/<*><*>" <*> <*> | + | 127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] <*> <*> HTTP/<*><*>" <*> <*> | + | 118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] <*> <*> HTTP/<*><*>" <*> <*> | + | 210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] <*> <*> HTTP/<*><*>" <*> <*> | + +-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------+ + + Limitation ========== From 786dcee272da227ac4e41f327488db67804c07e6 Mon Sep 17 00:00:00 2001 From: Songkan Tang Date: Wed, 29 Jan 2025 17:31:01 +0800 Subject: [PATCH 6/6] Add average time benchmark for patterns window functions Signed-off-by: Songkan Tang --- .../PatternsWindowFunctionBenchmark.java | 98 +++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 benchmarks/src/jmh/java/org/opensearch/sql/expression/operator/predicate/PatternsWindowFunctionBenchmark.java diff --git a/benchmarks/src/jmh/java/org/opensearch/sql/expression/operator/predicate/PatternsWindowFunctionBenchmark.java b/benchmarks/src/jmh/java/org/opensearch/sql/expression/operator/predicate/PatternsWindowFunctionBenchmark.java new file mode 100644 index 0000000000..a4ec4e1c55 --- /dev/null +++ b/benchmarks/src/jmh/java/org/opensearch/sql/expression/operator/predicate/PatternsWindowFunctionBenchmark.java @@ -0,0 +1,98 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.expression.operator.predicate; + +import static org.opensearch.sql.data.model.ExprTupleValue.fromExprValueMap; +import static org.opensearch.sql.data.type.ExprCoreType.STRING; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Iterators; +import com.google.common.collect.PeekingIterator; +import java.util.concurrent.TimeUnit; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.opensearch.sql.common.patterns.BrainLogParser; +import org.opensearch.sql.data.model.ExprStringValue; +import org.opensearch.sql.data.model.ExprValue; +import org.opensearch.sql.expression.DSL; +import org.opensearch.sql.expression.Expression; +import org.opensearch.sql.expression.NamedArgumentExpression; +import org.opensearch.sql.expression.ReferenceExpression; +import org.opensearch.sql.expression.window.WindowDefinition; +import org.opensearch.sql.expression.window.frame.BufferPatternRowsWindowFrame; +import org.opensearch.sql.expression.window.frame.CurrentRowWindowFrame; +import org.opensearch.sql.expression.window.frame.WindowFrame; + +@Warmup(iterations = 1) +@Measurement(iterations = 3) +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.MILLISECONDS) +@State(Scope.Thread) +@Fork(value = 1) +public class PatternsWindowFunctionBenchmark { + + private static final String TEST_MESSAGE_1 = + "12.132.31.17 - - [2018-07-22T05:36:25.812Z] \\\"GET /opensearch HTTP/1.1\\\" 200 9797" + + " \\\"-\\\" \\\"Mozilla/5.0 (X11; Linux x86_64; rv:6.0a1) Gecko/20110421" + + " Firefox/6.0a1\\\""; + private static final String TEST_MESSAGE_2 = + "129.138.185.193 - - [2018-07-22T05:39:39.668Z] \\\"GET /opensearch HTTP/1.1\\\" 404 9920" + + " \\\"-\\\" \\\"Mozilla/5.0 (X11; Linux x86_64; rv:6.0a1) Gecko/20110421" + + " Firefox/6.0a1\\\""; + private static final String TEST_MESSAGE_3 = + "240.58.187.246 - - [2018-07-22T06:02:46.006Z] \\\"GET /opensearch HTTP/1.1\\\" 500 6936" + + " \\\"-\\\" \\\"Mozilla/5.0 (X11; Linux i686) AppleWebKit/534.24 (KHTML, like Gecko)" + + " Chrome/11.0.696.50 Safari/534.24\\\""; + + private PeekingIterator tuples; + private final BufferPatternRowsWindowFrame bufferWindowFrame = + new BufferPatternRowsWindowFrame( + new WindowDefinition(ImmutableList.of(), ImmutableList.of()), + new BrainLogParser(), + new NamedArgumentExpression("message", new ReferenceExpression("message", STRING))); + + @Benchmark + public void testSimplePattern() { + CurrentRowWindowFrame windowFrame = + new CurrentRowWindowFrame(new WindowDefinition(ImmutableList.of(), ImmutableList.of())); + + run(windowFrame, DSL.simple_pattern(DSL.ref("message", STRING))); + } + + @Benchmark + public void testBrain() { + BufferPatternRowsWindowFrame windowFrame = + new BufferPatternRowsWindowFrame( + new WindowDefinition(ImmutableList.of(), ImmutableList.of()), + new BrainLogParser(), + new NamedArgumentExpression("message", new ReferenceExpression("message", STRING))); + + run(windowFrame, DSL.brain(DSL.ref("message", STRING))); + } + + private void run(WindowFrame windowFrame, Expression windowFunction) { + tuples = + Iterators.peekingIterator( + Iterators.forArray( + tuple(TEST_MESSAGE_1), tuple(TEST_MESSAGE_2), tuple(TEST_MESSAGE_3))); + while (tuples.hasNext() || windowFrame.hasNext()) { + windowFrame.load(tuples); + windowFunction.valueOf(windowFrame); + } + } + + private ExprValue tuple(String message) { + return fromExprValueMap(ImmutableMap.of("message", new ExprStringValue(message))); + } +}