package parser

import (
	"strings"
	"unicode"
)

// TokenType represents the type of a token.
type TokenType int

const (
	TokenEOF TokenType = iota
	TokenError
	TokenIdent
	TokenNumber
	TokenString
	TokenStar
	TokenComma
	TokenDot
	TokenLParen
	TokenRParen
	TokenLBracket
	TokenRBracket
	TokenSemicolon
	TokenEquals
	TokenLessThan
	TokenGreaterThan
	TokenPlus
	TokenMinus

	// Keywords
	TokenSelect
	TokenFrom
	TokenWhere
	TokenAnd
	TokenOr
	TokenAs
	TokenOption
	TokenAll
	TokenDistinct
)

// Token represents a lexical token.
type Token struct {
	Type    TokenType
	Literal string
	Pos     int
}

// Lexer tokenizes T-SQL input.
type Lexer struct {
	input   string
	pos     int  // index of the current byte
	readPos int  // index of the next byte to read
	ch      byte // current byte; 0 signals end of input
}

// NewLexer creates a new Lexer for the given input.
func NewLexer(input string) *Lexer {
	l := &Lexer{input: input}
	l.readChar()
	return l
}
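
// A minimal usage sketch (illustrative; the driver loop is assumed, not part
// of the original file):
//
//	lex := NewLexer("SELECT * FROM Users;")
//	for tok := lex.NextToken(); tok.Type != TokenEOF; tok = lex.NextToken() {
//		fmt.Printf("%d %q\n", tok.Type, tok.Literal)
//	}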

func (l *Lexer) readChar() {
	if l.readPos >= len(l.input) {
		l.ch = 0
	} else {
		l.ch = l.input[l.readPos]
	}
	l.pos = l.readPos
	l.readPos++
}

func (l *Lexer) peekChar() byte {
	if l.readPos >= len(l.input) {
		return 0
	}
	return l.input[l.readPos]
}

// NextToken returns the next token from the input.
func (l *Lexer) NextToken() Token {
	l.skipWhitespaceAndComments()

	tok := Token{Pos: l.pos}

	switch l.ch {
	case 0:
		tok.Type = TokenEOF
		tok.Literal = ""
	case '*':
		tok.Type = TokenStar
		tok.Literal = "*"
		l.readChar()
	case ',':
		tok.Type = TokenComma
		tok.Literal = ","
		l.readChar()
	case '.':
		tok.Type = TokenDot
		tok.Literal = "."
		l.readChar()
	case '(':
		tok.Type = TokenLParen
		tok.Literal = "("
		l.readChar()
	case ')':
		tok.Type = TokenRParen
		tok.Literal = ")"
		l.readChar()
	case '[':
		tok = l.readBracketedIdentifier()
	case ']':
		tok.Type = TokenRBracket
		tok.Literal = "]"
		l.readChar()
	case ';':
		tok.Type = TokenSemicolon
		tok.Literal = ";"
		l.readChar()
	case '=':
		tok.Type = TokenEquals
		tok.Literal = "="
		l.readChar()
	case '<':
		tok.Type = TokenLessThan
		tok.Literal = "<"
		l.readChar()
	case '>':
		tok.Type = TokenGreaterThan
		tok.Literal = ">"
		l.readChar()
	case '+':
		tok.Type = TokenPlus
		tok.Literal = "+"
		l.readChar()
	case '-':
		tok.Type = TokenMinus
		tok.Literal = "-"
		l.readChar()
	case '\'':
		tok = l.readString()
	default:
		if isLetter(l.ch) || l.ch == '_' || l.ch == '@' || l.ch == '#' {
			tok = l.readIdentifier()
		} else if isDigit(l.ch) {
			tok = l.readNumber()
		} else {
			tok.Type = TokenError
			tok.Literal = string(l.ch)
			l.readChar()
		}
	}

	return tok
}
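
// Worked example (added for illustration): lexing
//
//	SELECT * FROM [dbo].[Users] WHERE Age > 21;
//
// produces, in order:
//
//	TokenSelect "SELECT", TokenStar "*", TokenFrom "FROM",
//	TokenIdent "[dbo]", TokenDot ".", TokenIdent "[Users]",
//	TokenWhere "WHERE", TokenIdent "Age", TokenGreaterThan ">",
//	TokenNumber "21", TokenSemicolon ";", TokenEOF ""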

func (l *Lexer) skipWhitespaceAndComments() {
	for {
		// Skip whitespace.
		for l.ch == ' ' || l.ch == '\t' || l.ch == '\n' || l.ch == '\r' {
			l.readChar()
		}

		// Skip line comments (-- ...).
		if l.ch == '-' && l.peekChar() == '-' {
			for l.ch != 0 && l.ch != '\n' {
				l.readChar()
			}
			continue
		}

		// Skip block comments (/* ... */). T-SQL block comments nest, so
		// track the nesting depth instead of stopping at the first */.
		if l.ch == '/' && l.peekChar() == '*' {
			l.readChar() // skip /
			l.readChar() // skip *
			depth := 1
			for l.ch != 0 && depth > 0 {
				if l.ch == '*' && l.peekChar() == '/' {
					l.readChar() // skip *
					l.readChar() // skip /
					depth--
				} else if l.ch == '/' && l.peekChar() == '*' {
					l.readChar() // skip /
					l.readChar() // skip *
					depth++
				} else {
					l.readChar()
				}
			}
			continue
		}

		break
	}
}

func (l *Lexer) readIdentifier() Token {
	startPos := l.pos
	for isLetter(l.ch) || isDigit(l.ch) || l.ch == '_' || l.ch == '@' || l.ch == '#' {
		l.readChar()
	}
	literal := l.input[startPos:l.pos]
	return Token{
		Type:    lookupKeyword(literal),
		Literal: literal,
		Pos:     startPos,
	}
}

func (l *Lexer) readBracketedIdentifier() Token {
	startPos := l.pos
	l.readChar() // skip opening [
	for l.ch != 0 {
		if l.ch == ']' {
			if l.peekChar() == ']' {
				// Doubled ]] escapes a literal ] inside a bracketed name.
				l.readChar()
				l.readChar()
				continue
			}
			break
		}
		l.readChar()
	}
	if l.ch != ']' {
		// Unterminated bracketed identifier: surface it as an error token.
		return Token{Type: TokenError, Literal: l.input[startPos:l.pos], Pos: startPos}
	}
	l.readChar() // skip closing ]
	// The literal keeps its surrounding brackets, e.g. "[Users]".
	return Token{
		Type:    TokenIdent,
		Literal: l.input[startPos:l.pos],
		Pos:     startPos,
	}
}

func (l *Lexer) readString() Token {
	startPos := l.pos
	l.readChar() // skip opening quote
	for l.ch != 0 {
		if l.ch == '\'' {
			if l.peekChar() == '\'' {
				// Doubled '' escapes a literal quote inside the string.
				l.readChar()
				l.readChar()
				continue
			}
			break
		}
		l.readChar()
	}
	if l.ch != '\'' {
		// Unterminated string literal: surface it as an error token.
		return Token{Type: TokenError, Literal: l.input[startPos:l.pos], Pos: startPos}
	}
	l.readChar() // skip closing quote
	// The literal keeps its quotes and any doubling, e.g. "'O''Brien'".
	return Token{
		Type:    TokenString,
		Literal: l.input[startPos:l.pos],
		Pos:     startPos,
	}
}

func (l *Lexer) readNumber() Token {
	startPos := l.pos
	for isDigit(l.ch) {
		l.readChar()
	}
	// Consume a fractional part only when a digit follows the dot, so that
	// "1.x" lexes as number, dot, identifier.
	if l.ch == '.' && isDigit(l.peekChar()) {
		l.readChar()
		for isDigit(l.ch) {
			l.readChar()
		}
	}
	return Token{
		Type:    TokenNumber,
		Literal: l.input[startPos:l.pos],
		Pos:     startPos,
	}
}

func isLetter(ch byte) bool {
	// The lexer works on raw bytes, so restrict letters to ASCII: passing a
	// UTF-8 continuation byte through rune(ch) would misread it as Latin-1
	// and could wrongly classify it as a letter.
	return ch < 0x80 && unicode.IsLetter(rune(ch))
}

func isDigit(ch byte) bool {
	return ch >= '0' && ch <= '9'
}

var keywords = map[string]TokenType{
	"SELECT":   TokenSelect,
	"FROM":     TokenFrom,
	"WHERE":    TokenWhere,
	"AND":      TokenAnd,
	"OR":       TokenOr,
	"AS":       TokenAs,
	"OPTION":   TokenOption,
	"ALL":      TokenAll,
	"DISTINCT": TokenDistinct,
}

func lookupKeyword(ident string) TokenType {
	if tok, ok := keywords[strings.ToUpper(ident)]; ok {
		return tok
	}
	return TokenIdent
}
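
// Tokenize is a convenience sketch added for illustration (a hypothetical
// helper, not part of the original file): it drains a fresh lexer into a
// slice, stopping after the first EOF or error token so callers can inspect
// the whole stream at once.
func Tokenize(input string) []Token {
	l := NewLexer(input)
	var toks []Token
	for {
		tok := l.NextToken()
		toks = append(toks, tok)
		if tok.Type == TokenEOF || tok.Type == TokenError {
			return toks
		}
	}
}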