glancr/internal/tokenizer/tokenizer.go

package tokenizer

import (
	"regexp"
	"strings"
)
type Token struct {
	Type    string `json:"type"`    // "keyword", "string", "comment", etc.
	Content string `json:"content"` // actual text
	Start   int    `json:"start"`   // byte offset within the line
	End     int    `json:"end"`     // byte offset within the line (exclusive)
}
type TokenizedLine struct {
	LineNumber int     `json:"lineNumber"`
	Tokens     []Token `json:"tokens"`
	Raw        string  `json:"raw"` // original line content
}

type TokenizedDocument struct {
	Language string          `json:"language"`
	Lines    []TokenizedLine `json:"lines"`
	Metadata DocumentMeta    `json:"metadata"`
}

type DocumentMeta struct {
	TotalLines int    `json:"totalLines"`
	FileSize   int64  `json:"fileSize"`
	Language   string `json:"language"`
}

type Tokenizer struct {
	languages map[string]*Language
}

type Language struct {
	Name  string
	Rules []Rule
}

type Rule struct {
	Pattern  *regexp.Regexp
	Token    string
	Priority int // higher-priority rules are checked first
}
func NewTokenizer() *Tokenizer {
	t := &Tokenizer{
		languages: make(map[string]*Language),
	}
	// Initialize with basic languages
	t.initializeLanguages()
	return t
}
func (t *Tokenizer) TokenizeDocument(content, language string) *TokenizedDocument {
	lines := strings.Split(content, "\n")
	tokenizedLines := make([]TokenizedLine, len(lines))

	lang := t.languages[language]
	if lang == nil {
		lang = t.languages["text"] // fall back to plain text
	}

	for i, line := range lines {
		tokenizedLines[i] = TokenizedLine{
			LineNumber: i + 1,
			Tokens:     t.tokenizeLine(line, lang),
			Raw:        line,
		}
	}

	return &TokenizedDocument{
		Language: language,
		Lines:    tokenizedLines,
		Metadata: DocumentMeta{
			TotalLines: len(lines),
			FileSize:   int64(len(content)),
			Language:   language,
		},
	}
}
func (t *Tokenizer) tokenizeLine(line string, lang *Language) []Token {
	// Guard against an empty line or a language that was never registered
	// (e.g. neither the requested language nor the "text" fallback exists).
	if len(line) == 0 || lang == nil {
		return []Token{}
	}

	var tokens []Token
	remaining := line
	offset := 0

	for len(remaining) > 0 {
		matched := false

		// Try each rule in priority order.
		for _, rule := range lang.Rules {
			if loc := rule.Pattern.FindStringIndex(remaining); loc != nil && loc[0] == 0 {
				// Found a match at the beginning of the remaining text.
				matchText := remaining[loc[0]:loc[1]]
				tokens = append(tokens, Token{
					Type:    rule.Token,
					Content: matchText,
					Start:   offset,
					End:     offset + len(matchText),
				})
				remaining = remaining[loc[1]:]
				offset += loc[1]
				matched = true
				break
			}
		}

		if !matched {
			// No rule matched: skip one byte. Skipped bytes do not appear
			// in the token stream; the Raw field preserves the full line.
			remaining = remaining[1:]
			offset++
		}
	}

	return tokens
}
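
// NOTE: the remainder of the file, including initializeLanguages, is not
// shown above. The method below is an illustrative sketch, not the actual
// implementation: it assumes a "text" fallback (which TokenizeDocument
// relies on) plus a small "go" rule set, with rules listed highest
// priority first because tokenizeLine walks Rules in slice order.
// The "go" key, the patterns, and the "number"/"identifier" token names
// are assumptions.
func (t *Tokenizer) initializeLanguages() {
	// Plain-text fallback: a single rule that consumes the whole line.
	t.languages["text"] = &Language{
		Name: "text",
		Rules: []Rule{
			{Pattern: regexp.MustCompile(`.+`), Token: "text", Priority: 0},
		},
	}

	// Minimal Go rules, already ordered by descending Priority.
	t.languages["go"] = &Language{
		Name: "go",
		Rules: []Rule{
			{Pattern: regexp.MustCompile(`//.*`), Token: "comment", Priority: 100},
			{Pattern: regexp.MustCompile(`"(?:[^"\\]|\\.)*"`), Token: "string", Priority: 90},
			{Pattern: regexp.MustCompile("`[^`]*`"), Token: "string", Priority: 90},
			{Pattern: regexp.MustCompile(`\b(func|package|import|type|struct|return|if|else|for|range|var|const)\b`), Token: "keyword", Priority: 80},
			{Pattern: regexp.MustCompile(`\b\d+(\.\d+)?\b`), Token: "number", Priority: 70},
			{Pattern: regexp.MustCompile(`\w+`), Token: "identifier", Priority: 10},
		},
	}
}

// Example usage (with the sketched rules above):
//
//	doc := NewTokenizer().TokenizeDocument("package main", "go")
//	// doc.Lines[0].Tokens:
//	//   {Type: "keyword",    Content: "package", Start: 0, End: 7}
//	//   {Type: "identifier", Content: "main",    Start: 8, End: 12}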