Skip to content

Commit b7cf658

Browse files
committed
Thinking Blocks Support
Introduced the Thinking Block abstraction. Defined static thinking tags. Introduced dynamic thinking tag discovery. Added an SPI to extract Thinking Blocks with restricted access.
1 parent 1e1ef06 commit b7cf658

6 files changed

Lines changed: 860 additions & 0 deletions

File tree

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
/*
2+
* Copyright 2024-2025 Embabel Software, Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package com.embabel.common.core.thinking
17+
18+
/**
 * One piece of reasoning text taken from LLM output, paired with a description
 * of the markup it was found in.
 *
 * @property content the reasoning text on its own, with the surrounding markup removed
 * @property tagType the kind of thinking pattern this block came from; see [ThinkingTagType]
 * @property tagValue the identifier of the concrete marker (e.g., "think", "analysis", "THINKING")
 *
 * @see ThinkingTagType for the pattern classifications
 * @see ThinkingTags.TAG_DEFINITIONS for the supported tag formats
 */
data class ThinkingBlock(
    /** Inner reasoning text only; the markup tags are not part of it. */
    val content: String,

    /**
     * Kind of thinking pattern that was detected.
     * @see ThinkingTagType
     */
    val tagType: ThinkingTagType,

    /**
     * Identifier of the marker that was used:
     * - [ThinkingTagType.TAG]: the tag name (e.g., "think", "analysis")
     * - [ThinkingTagType.PREFIX]: the prefix identifier (e.g., "THINKING")
     * - [ThinkingTagType.NO_PREFIX]: empty string
     */
    val tagValue: String
)
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
/*
2+
* Copyright 2024-2025 Embabel Software, Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package com.embabel.common.core.thinking
17+
18+
/**
 * The kinds of thinking content pattern that processing distinguishes between.
 */
enum class ThinkingTagType {
    /** Content enclosed by a start/end marker pair, such as `<think>...</think>`. */
    TAG,

    /** Content introduced by a line prefix, such as `//THINKING:`. */
    PREFIX,

    /** Content that carries no marker at all. */
    NO_PREFIX
}
26+
27+
/**
 * Single home for the thinking-content markers used by different LLM providers.
 */
object ThinkingTags {

    /**
     * Maps a tag key to its `(start marker, end marker)` pair.
     *
     * The XML-style entries are derived from their names (`<name>` / `</name>`); the
     * remaining entries use bespoke markers. Iteration order is the declaration order:
     * XML-style names first, then `reasoning`, `legacy_prefix`, `no_prefix`.
     * The end marker of `no_prefix` is a regex lookahead rather than literal text.
     */
    val TAG_DEFINITIONS: Map<String, Pair<String, String>> =
        listOf("think", "analysis", "thought", "final", "scratchpad", "chain_of_thought")
            .associateWith { name -> "<$name>" to "</$name>" } +
            mapOf(
                "reasoning" to ("[REASONING]" to "[/REASONING]"),
                "legacy_prefix" to ("//THINKING:" to ""),
                "no_prefix" to ("" to "(?=\\{)")
            )
}
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
/*
2+
* Copyright 2024-2025 Embabel Software, Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package com.embabel.common.core.thinking.spi
17+
18+
/**
 * Opt-in marker for the internal thinking-processing utilities.
 *
 * Declarations carrying this annotation are meant for converters and other
 * internal processing components rather than end-user code, and may change
 * without notice. Because the level is [RequiresOptIn.Level.ERROR], using such
 * a declaration without opting in is reported as a compilation error.
 */
@Target(AnnotationTarget.CLASS, AnnotationTarget.FUNCTION, AnnotationTarget.PROPERTY)
@RequiresOptIn(
    level = RequiresOptIn.Level.ERROR,
    message = "This is an internal thinking extraction API. Use with caution as it may change without notice."
)
annotation class InternalThinkingApi
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
/*
2+
* Copyright 2024-2025 Embabel Software, Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package com.embabel.common.core.thinking.spi
17+
18+
import com.embabel.common.core.thinking.ThinkingBlock
19+
import com.embabel.common.core.thinking.ThinkingTagType
20+
import com.embabel.common.core.thinking.ThinkingTags
21+
22+
/**
 * Finds XML-style thinking tags that are not covered by the blocks already extracted.
 *
 * Every match of [DynamicTagPatterns.COMPLETE_TAG_PATTERN] in [input] is inspected and
 * turned into a [ThinkingBlock] of type [ThinkingTagType.TAG] when all of the following hold:
 * - the opening and closing tag names are equal,
 * - the tag name is not the `tagValue` of a TAG-type block in [existingBlocks],
 * - the trimmed text between the tags is not empty.
 *
 * @param input The text to search for dynamic tags
 * @param existingBlocks Blocks extracted earlier; only their TAG-type entries are consulted
 * @return The newly discovered blocks, in the order they were matched
 */
internal fun dynamicTagsDiscoveryAndExtraction(input: String, existingBlocks: List<ThinkingBlock>): List<ThinkingBlock> {
    // Tag names that already produced a block; PREFIX / NO_PREFIX blocks cannot clash with XML tags.
    val alreadyExtracted = existingBlocks
        .asSequence()
        .filter { it.tagType == ThinkingTagType.TAG }
        .map { it.tagValue }
        .toSet()

    return DynamicTagPatterns.COMPLETE_TAG_PATTERN
        .findAll(input)
        .mapNotNull { match ->
            // Group 1: opening tag name, group 2: text between the tags, group 3: closing tag name.
            val (openName, rawBody, closeName) = match.destructured
            val body = rawBody.trim()

            val rejected = openName != closeName || openName in alreadyExtracted || body.isEmpty()
            if (rejected) {
                null
            } else {
                ThinkingBlock(
                    content = body,
                    tagType = ThinkingTagType.TAG,
                    tagValue = openName
                )
            }
        }
        .toList()
}
74+
75+
/**
 * Static patterns for dynamic tag discovery.
 */
private object DynamicTagPatterns {
    /**
     * Pattern for complete XML tags: `<tagname optional-attrs>content</tagname>`.
     *
     * - Group 1: opening tag name
     * - Group 2: content between the tags (may span lines)
     * - Group 3: closing tag name; always equal to group 1
     *
     * The closing tag is tied to the opening one with a backreference. Without it the lazy
     * content stops at the first closing tag of *any* name, so `<plan>a <b>x</b> c</plan>`
     * would be matched as `<plan>...</b>`, rejected as mismatched by the caller, and the
     * real `plan` block would be lost because its text had already been consumed.
     *
     * The negative lookahead after the name prevents the engine from backtracking to a
     * shorter name (e.g. reading `<plan2>` as tag `plan` followed by attribute text `2`).
     */
    val COMPLETE_TAG_PATTERN =
        """<([a-zA-Z][a-zA-Z0-9_-]*)(?![a-zA-Z0-9_-])[^>]*>(.*?)</(\1)>""".toRegex(RegexOption.DOT_MATCHES_ALL)
}
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
/*
2+
* Copyright 2024-2025 Embabel Software, Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package com.embabel.common.core.thinking.spi
17+
18+
import com.embabel.common.core.thinking.ThinkingBlock
19+
import com.embabel.common.core.thinking.ThinkingTagType
20+
import com.embabel.common.core.thinking.ThinkingTags
21+
22+
/**
 * Extract all thinking blocks from input text.
 *
 * Three kinds of thinking content are recognised, processed in this order:
 * 1. Delimited blocks: the start/end pairs in [ThinkingTags.TAG_DEFINITIONS], followed by
 *    any further XML-style tags reported by [dynamicTagsDiscoveryAndExtraction].
 * 2. Lines carrying the `//THINKING:` prefix.
 * 3. Untagged text that precedes the first `{` once the content found in steps 1 and 2
 *    has been stripped from the input.
 *
 * The result is ordered by where each block's markup starts in [input]. The offset is
 * recorded at match time instead of being looked up afterwards via the block's content,
 * because a content search puts blocks with repeated or empty content at the same
 * position, so they come back in extraction order rather than input order.
 *
 * @param input raw text to scan for thinking content
 * @return the extracted blocks in input order; empty when nothing was found
 */
@InternalThinkingApi
fun extractAllThinkingBlocks(input: String): List<ThinkingBlock> {
    // Each block is paired with the offset in `input` used as its sort key.
    val located = mutableListOf<Pair<Int, ThinkingBlock>>()

    // 1a. Predefined delimiter pairs. The prefix / no-prefix entries are not delimiter
    //     pairs and are handled by steps 2 and 3 below.
    val nonDelimitedKeys = setOf("legacy_prefix", "no_prefix")
    ThinkingTags.TAG_DEFINITIONS.forEach { (tagKey, tagPair) ->
        val (startTag, endTag) = tagPair
        if (tagKey !in nonDelimitedKeys && startTag.isNotEmpty() && endTag.isNotEmpty()) {
            val pattern = "${Regex.escape(startTag)}(.*?)${Regex.escape(endTag)}"
                .toRegex(RegexOption.DOT_MATCHES_ALL)

            pattern.findAll(input).forEach { match ->
                located.add(
                    match.range.first to ThinkingBlock(
                        content = match.groupValues[1].trim(),
                        tagType = ThinkingTagType.TAG,
                        tagValue = tagKey
                    )
                )
            }
        }
    }

    // 1b. Additional XML-style tags that are not in the predefined list. The helper returns
    //     blocks in input order but without offsets, so each one is anchored at the next
    //     occurrence of its opening tag, searching forward from the previous anchor.
    var searchFrom = 0
    dynamicTagsDiscoveryAndExtraction(input, located.map { it.second }).forEach { block ->
        val openingTag = "<${Regex.escape(block.tagValue)}(?![a-zA-Z0-9_-])[^>]*>".toRegex()
        val openingAt = openingTag.find(input, searchFrom)?.range?.first
        if (openingAt != null) {
            searchFrom = openingAt + 1
        }
        // Fallback keeps the previous content-based lookup if the opening tag cannot be found.
        located.add((openingAt ?: input.indexOf(block.content)) to block)
    }

    // 2. `//THINKING:` prefix lines; `.` does not cross line breaks, so each match is one line.
    val prefixPattern = "//THINKING:(.*)".toRegex(RegexOption.MULTILINE)
    prefixPattern.findAll(input).forEach { match ->
        located.add(
            match.range.first to ThinkingBlock(
                content = match.groupValues[1].trim(),
                tagType = ThinkingTagType.PREFIX,
                tagValue = "THINKING"
            )
        )
    }

    // 3. Fallback: untagged text before the JSON payload. Everything extracted above is
    //    removed first so it cannot be reported a second time as NO_PREFIX content.
    var remainingInput = input
    located.map { it.second }.filter { it.tagType == ThinkingTagType.TAG }.forEach { block ->
        val tagDefinition = ThinkingTags.TAG_DEFINITIONS[block.tagValue]
        val removalPattern = if (tagDefinition != null) {
            // Predefined tag: use its own delimiters,
            // e.g. <think>content</think> or [REASONING]content[/REASONING].
            val (startTag, endTag) = tagDefinition
            "${Regex.escape(startTag)}.*?${Regex.escape(endTag)}"
        } else {
            // Dynamic tag: standard XML form <tagname optional-attrs>content</tagname>.
            val escapedTagName = Regex.escape(block.tagValue)
            "<$escapedTagName[^>]*>.*?</$escapedTagName>"
        }
        remainingInput = remainingInput.replace(removalPattern.toRegex(RegexOption.DOT_MATCHES_ALL), "")
    }

    // Remove all prefix lines.
    remainingInput = remainingInput.replace("//THINKING:.*".toRegex(RegexOption.MULTILINE), "")

    // Whatever is left in front of the first `{` is treated as untagged thinking.
    val noPrefixPattern = "^(.*?)(?=\\{)".toRegex(RegexOption.DOT_MATCHES_ALL)
    noPrefixPattern.find(remainingInput.trim())?.let { match ->
        val content = match.groupValues[1].trim()
        if (content.isNotEmpty()) {
            // This text comes from the stripped input, so it has no match offset in `input`.
            // It is anchored by a text search; when removals have joined separate pieces the
            // search yields -1 and the block sorts first.
            located.add(
                input.indexOf(content) to ThinkingBlock(
                    content = content,
                    tagType = ThinkingTagType.NO_PREFIX,
                    tagValue = ""
                )
            )
        }
    }

    // sortedBy is stable, so blocks sharing an offset keep their extraction order.
    return located.sortedBy { it.first }.map { it.second }
}

0 commit comments

Comments
 (0)