package tokenizer

import (
	"regexp"
	"sort"
	"strings"
	"unicode/utf8"
)

// Token is a single classified span of text within one line.
type Token struct {
	Type    string `json:"type"`    // "keyword", "string", "comment", etc.
	Content string `json:"content"` // actual text
	Start   int    `json:"start"`   // byte offset within the line
	End     int    `json:"end"`     // byte offset within the line
}

type TokenizedLine struct {
	LineNumber int     `json:"lineNumber"`
	Tokens     []Token `json:"tokens"`
	Raw        string  `json:"raw"` // original line content
}

type TokenizedDocument struct {
	Language string          `json:"language"`
	Lines    []TokenizedLine `json:"lines"`
	Metadata DocumentMeta    `json:"metadata"`
}

type DocumentMeta struct {
	TotalLines int    `json:"totalLines"`
	FileSize   int64  `json:"fileSize"`
	Language   string `json:"language"`
}

type Tokenizer struct {
	languages map[string]*Language
}

type Language struct {
	Name  string
	Rules []Rule
}

type Rule struct {
	Pattern  *regexp.Regexp
	Token    string
	Priority int // higher-priority rules are checked first
}

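// Illustrative rule set, in the shape tokenizeLine expects (a sketch only;
// the real patterns live in initializeLanguages). Anchoring with ^ is an
// optimization: tokenizeLine only accepts matches that start at position 0.
//
//	{Pattern: regexp.MustCompile(`^//.*`), Token: "comment", Priority: 100}
//	{Pattern: regexp.MustCompile(`^"(?:[^"\\]|\\.)*"`), Token: "string", Priority: 90}
//	{Pattern: regexp.MustCompile(`^(func|return|package|import)\b`), Token: "keyword", Priority: 80}
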
// NewTokenizer builds a tokenizer with the built-in language definitions.
func NewTokenizer() *Tokenizer {
	t := &Tokenizer{
		languages: make(map[string]*Language),
	}

	// Initialize with basic languages.
	t.initializeLanguages()

	// Sort each language's rules by descending Priority so tokenizeLine
	// really does try higher-priority rules first, as Rule documents.
	for _, lang := range t.languages {
		sort.SliceStable(lang.Rules, func(i, j int) bool {
			return lang.Rules[i].Priority > lang.Rules[j].Priority
		})
	}

	return t
}

// TokenizeDocument splits content into lines and tokenizes each line using
// the rules registered for language, falling back to plain text when the
// language is unknown.
func (t *Tokenizer) TokenizeDocument(content, language string) *TokenizedDocument {
	lines := strings.Split(content, "\n")
	tokenizedLines := make([]TokenizedLine, len(lines))

	lang := t.languages[language]
	if lang == nil {
		lang = t.languages["text"] // fallback
	}

	for i, line := range lines {
		tokenizedLines[i] = TokenizedLine{
			LineNumber: i + 1,
			Tokens:     t.tokenizeLine(line, lang),
			Raw:        line,
		}
	}

	return &TokenizedDocument{
		Language: language,
		Lines:    tokenizedLines,
		Metadata: DocumentMeta{
			TotalLines: len(lines),
			FileSize:   int64(len(content)),
			Language:   language,
		},
	}
}

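// Usage sketch for the API above (assumes initializeLanguages registers a
// "text" fallback; "go" as a language key is hypothetical here). The json
// struct tags make the result directly serializable for a client:
//
//	doc := NewTokenizer().TokenizeDocument("x := 1 // init\n", "go")
//	payload, err := json.Marshal(doc)
//	if err != nil {
//		// handle error
//	}
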
// tokenizeLine scans a single line left to right, emitting a token for the
// first rule that matches at the current position.
func (t *Tokenizer) tokenizeLine(line string, lang *Language) []Token {
	if len(line) == 0 {
		return []Token{}
	}

	var tokens []Token
	remaining := line
	offset := 0

	for len(remaining) > 0 {
		matched := false

		// Try each rule in priority order (sorted in NewTokenizer).
		for _, rule := range lang.Rules {
			if loc := rule.Pattern.FindStringIndex(remaining); loc != nil && loc[0] == 0 {
				// Found a match at the beginning of the remaining input.
				matchText := remaining[loc[0]:loc[1]]
				tokens = append(tokens, Token{
					Type:    rule.Token,
					Content: matchText,
					Start:   offset,
					End:     offset + len(matchText),
				})

				remaining = remaining[loc[1]:]
				offset += loc[1]
				matched = true
				break
			}
		}

		if !matched {
			// No rule matched: skip one full rune rather than one byte so
			// matching never resumes midway through a UTF-8 sequence.
			_, size := utf8.DecodeRuneInString(remaining)
			remaining = remaining[size:]
			offset += size
		}
	}

	return tokens
}
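
// Worked example of tokenizeLine, assuming the illustrative string and
// comment rules sketched near the Rule type above:
//
//	tokenizeLine(`s := "hi" // done`, lang)
//
// would emit (byte offsets):
//
//	{Type: "string", Content: `"hi"`, Start: 5, End: 9}
//	{Type: "comment", Content: "// done", Start: 10, End: 17}
//
// The unmatched identifier and := bytes are skipped one rune at a time and
// produce no tokens.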