Commit cd21e22

Implement T-SQL parser for simple SELECT statements
Add a lexer and parser that can parse the OptimizerHintsTests110 test case:

- SELECT * FROM t1 OPTION (IGNORE_NONCLUSTERED_COLUMNSTORE_INDEX)

This implements:

- Lexer with support for keywords, identifiers, operators, and comments
- Parser for SELECT statements with FROM clause and OPTION hints
- OptimizerHint AST type and JSON marshaling
- Enable the OptimizerHintsTests110 test case
1 parent 7541fc0 commit cd21e22

5 files changed: +685 additions, −6 deletions
ast/ast.go

Lines changed: 9 additions & 1 deletion
@@ -28,12 +28,20 @@ type Statement interface {
 
 // SelectStatement represents a SELECT statement.
 type SelectStatement struct {
-    QueryExpression QueryExpression `json:"QueryExpression,omitempty"`
+    QueryExpression QueryExpression  `json:"QueryExpression,omitempty"`
+    OptimizerHints  []*OptimizerHint `json:"OptimizerHints,omitempty"`
 }
 
 func (*SelectStatement) node() {}
 func (*SelectStatement) statement() {}
 
+// OptimizerHint represents an optimizer hint in an OPTION clause.
+type OptimizerHint struct {
+    HintKind string `json:"HintKind,omitempty"`
+}
+
+func (*OptimizerHint) node() {}
+
 // QueryExpression is the interface for query expressions.
 type QueryExpression interface {
     Node
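
For orientation, a minimal sketch (not part of the commit) of how the new OptimizerHints field serializes. The import path follows the module in go.mod below; the HintKind string is illustrative, not taken from the commit:

package main

import (
    "encoding/json"
    "fmt"

    "github.com/kyleconroy/teesql/ast" // module path from go.mod below
)

func main() {
    // A SelectStatement as the parser might build it for
    // SELECT * FROM t1 OPTION (IGNORE_NONCLUSTERED_COLUMNSTORE_INDEX).
    // The HintKind value here is hypothetical.
    stmt := &ast.SelectStatement{
        OptimizerHints: []*ast.OptimizerHint{
            {HintKind: "IgnoreNonClusteredColumnStoreIndex"},
        },
    }
    out, err := json.Marshal(stmt)
    if err != nil {
        panic(err)
    }
    fmt.Println(string(out))
    // {"OptimizerHints":[{"HintKind":"IgnoreNonClusteredColumnStoreIndex"}]}
}

Note that QueryExpression is dropped from the output: the field is a nil interface here, so the omitempty tag elides it.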

go.mod

Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
 module github.com/kyleconroy/teesql
 
-go 1.25
+go 1.21

parser/lexer.go

Lines changed: 291 additions & 0 deletions
@@ -0,0 +1,291 @@ (new file)

package parser

import (
    "strings"
    "unicode"
)

// TokenType represents the type of a token.
type TokenType int

const (
    TokenEOF TokenType = iota
    TokenError
    TokenIdent
    TokenNumber
    TokenString
    TokenStar
    TokenComma
    TokenDot
    TokenLParen
    TokenRParen
    TokenLBracket
    TokenRBracket
    TokenSemicolon
    TokenEquals
    TokenLessThan
    TokenGreaterThan
    TokenPlus
    TokenMinus

    // Keywords
    TokenSelect
    TokenFrom
    TokenWhere
    TokenAnd
    TokenOr
    TokenAs
    TokenOption
    TokenAll
    TokenDistinct
)

// Token represents a lexical token.
type Token struct {
    Type    TokenType
    Literal string
    Pos     int
}

// Lexer tokenizes T-SQL input.
type Lexer struct {
    input   string
    pos     int
    readPos int
    ch      byte
}

// NewLexer creates a new Lexer for the given input.
func NewLexer(input string) *Lexer {
    l := &Lexer{input: input}
    l.readChar()
    return l
}

func (l *Lexer) readChar() {
    if l.readPos >= len(l.input) {
        l.ch = 0
    } else {
        l.ch = l.input[l.readPos]
    }
    l.pos = l.readPos
    l.readPos++
}

func (l *Lexer) peekChar() byte {
    if l.readPos >= len(l.input) {
        return 0
    }
    return l.input[l.readPos]
}

// NextToken returns the next token from the input.
func (l *Lexer) NextToken() Token {
    l.skipWhitespaceAndComments()

    tok := Token{Pos: l.pos}

    switch l.ch {
    case 0:
        tok.Type = TokenEOF
        tok.Literal = ""
    case '*':
        tok.Type = TokenStar
        tok.Literal = "*"
        l.readChar()
    case ',':
        tok.Type = TokenComma
        tok.Literal = ","
        l.readChar()
    case '.':
        tok.Type = TokenDot
        tok.Literal = "."
        l.readChar()
    case '(':
        tok.Type = TokenLParen
        tok.Literal = "("
        l.readChar()
    case ')':
        tok.Type = TokenRParen
        tok.Literal = ")"
        l.readChar()
    case '[':
        tok = l.readBracketedIdentifier()
    case ']':
        tok.Type = TokenRBracket
        tok.Literal = "]"
        l.readChar()
    case ';':
        tok.Type = TokenSemicolon
        tok.Literal = ";"
        l.readChar()
    case '=':
        tok.Type = TokenEquals
        tok.Literal = "="
        l.readChar()
    case '<':
        tok.Type = TokenLessThan
        tok.Literal = "<"
        l.readChar()
    case '>':
        tok.Type = TokenGreaterThan
        tok.Literal = ">"
        l.readChar()
    case '+':
        tok.Type = TokenPlus
        tok.Literal = "+"
        l.readChar()
    case '-':
        tok.Type = TokenMinus
        tok.Literal = "-"
        l.readChar()
    case '\'':
        tok = l.readString()
    default:
        if isLetter(l.ch) || l.ch == '_' || l.ch == '@' || l.ch == '#' {
            tok = l.readIdentifier()
        } else if isDigit(l.ch) {
            tok = l.readNumber()
        } else {
            tok.Type = TokenError
            tok.Literal = string(l.ch)
            l.readChar()
        }
    }

    return tok
}

func (l *Lexer) skipWhitespaceAndComments() {
    for {
        // Skip whitespace
        for l.ch != 0 && (l.ch == ' ' || l.ch == '\t' || l.ch == '\n' || l.ch == '\r') {
            l.readChar()
        }

        // Skip line comments (-- ...)
        if l.ch == '-' && l.peekChar() == '-' {
            for l.ch != 0 && l.ch != '\n' {
                l.readChar()
            }
            continue
        }

        // Skip block comments (/* ... */)
        if l.ch == '/' && l.peekChar() == '*' {
            l.readChar() // skip /
            l.readChar() // skip *
            for l.ch != 0 {
                if l.ch == '*' && l.peekChar() == '/' {
                    l.readChar() // skip *
                    l.readChar() // skip /
                    break
                }
                l.readChar()
            }
            continue
        }

        break
    }
}

func (l *Lexer) readIdentifier() Token {
    startPos := l.pos
    for isLetter(l.ch) || isDigit(l.ch) || l.ch == '_' || l.ch == '@' || l.ch == '#' {
        l.readChar()
    }
    literal := l.input[startPos:l.pos]
    return Token{
        Type:    lookupKeyword(literal),
        Literal: literal,
        Pos:     startPos,
    }
}

func (l *Lexer) readBracketedIdentifier() Token {
    startPos := l.pos
    l.readChar() // skip opening [
    for l.ch != 0 && l.ch != ']' {
        l.readChar()
    }
    if l.ch == ']' {
        l.readChar() // skip closing ]
    }
    return Token{
        Type:    TokenIdent,
        Literal: l.input[startPos:l.pos],
        Pos:     startPos,
    }
}

func (l *Lexer) readString() Token {
    startPos := l.pos
    l.readChar() // skip opening quote
    for l.ch != 0 {
        if l.ch == '\'' {
            if l.peekChar() == '\'' {
                // Escaped quote
                l.readChar()
                l.readChar()
                continue
            }
            break
        }
        l.readChar()
    }
    if l.ch == '\'' {
        l.readChar() // skip closing quote
    }
    return Token{
        Type:    TokenString,
        Literal: l.input[startPos:l.pos],
        Pos:     startPos,
    }
}

func (l *Lexer) readNumber() Token {
    startPos := l.pos
    for isDigit(l.ch) {
        l.readChar()
    }
    // Handle decimal point
    if l.ch == '.' && isDigit(l.peekChar()) {
        l.readChar()
        for isDigit(l.ch) {
            l.readChar()
        }
    }
    return Token{
        Type:    TokenNumber,
        Literal: l.input[startPos:l.pos],
        Pos:     startPos,
    }
}

func isLetter(ch byte) bool {
    return unicode.IsLetter(rune(ch))
}

func isDigit(ch byte) bool {
    return ch >= '0' && ch <= '9'
}

var keywords = map[string]TokenType{
    "SELECT":   TokenSelect,
    "FROM":     TokenFrom,
    "WHERE":    TokenWhere,
    "AND":      TokenAnd,
    "OR":       TokenOr,
    "AS":       TokenAs,
    "OPTION":   TokenOption,
    "ALL":      TokenAll,
    "DISTINCT": TokenDistinct,
}

func lookupKeyword(ident string) TokenType {
    if tok, ok := keywords[strings.ToUpper(ident)]; ok {
        return tok
    }
    return TokenIdent
}
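
As a quick usage sketch (not part of the commit), the lexer can be exercised directly against the statement from the enabled test case; NewLexer, NextToken, and TokenEOF are as defined above, and the module path comes from go.mod:

package main

import (
    "fmt"

    "github.com/kyleconroy/teesql/parser" // module path from go.mod above
)

func main() {
    l := parser.NewLexer("SELECT * FROM t1 OPTION (IGNORE_NONCLUSTERED_COLUMNSTORE_INDEX)")
    // Drain tokens until EOF; each Token carries a type, literal, and byte offset.
    for tok := l.NextToken(); tok.Type != parser.TokenEOF; tok = l.NextToken() {
        fmt.Printf("%3d  %q\n", tok.Pos, tok.Literal)
    }
}

The hint name lexes as a plain TokenIdent, since only the nine entries in the keywords map are reserved; attaching OptimizerHint meaning to it is the parser's job.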
