package tokenizer

import (
	"regexp"
	"strings"
	"unicode/utf8"
)

// Token is a single lexical element found on a line.
type Token struct {
	Type    string `json:"type"`    // "keyword", "string", "comment", etc.
	Content string `json:"content"` // actual text
	Start   int    `json:"start"`   // byte offset of the token within the line
	End     int    `json:"end"`     // byte offset just past the end of the token
}

// TokenizedLine holds the tokens produced for one line of input.
type TokenizedLine struct {
	LineNumber int     `json:"lineNumber"`
	Tokens     []Token `json:"tokens"`
	Raw        string  `json:"raw"` // original line content
}

// TokenizedDocument is the result of tokenizing an entire document.
type TokenizedDocument struct {
	Language string          `json:"language"`
	Lines    []TokenizedLine `json:"lines"`
	Metadata DocumentMeta    `json:"metadata"`
}

// DocumentMeta carries summary information about the tokenized document.
type DocumentMeta struct {
	TotalLines int    `json:"totalLines"`
	FileSize   int64  `json:"fileSize"`
	Language   string `json:"language"`
}

// Tokenizer maps language names to their tokenization rules.
type Tokenizer struct {
	languages map[string]*Language
}

// Language is a named set of tokenization rules.
type Language struct {
	Name  string
	Rules []Rule
}

// Rule matches one kind of token. Rules with higher Priority should appear
// earlier in Language.Rules, since tokenizeLine tries them in slice order.
type Rule struct {
	Pattern  *regexp.Regexp
	Token    string
	Priority int // higher-priority rules are checked first
}

// NewTokenizer returns a Tokenizer preloaded with the built-in languages.
func NewTokenizer() *Tokenizer {
	t := &Tokenizer{
		languages: make(map[string]*Language),
	}

	// Initialize with basic languages.
	t.initializeLanguages()

	return t
}

// TokenizeDocument splits content into lines and tokenizes each line using the
// rules for the requested language, falling back to "text" when it is unknown.
func (t *Tokenizer) TokenizeDocument(content, language string) *TokenizedDocument {
	lines := strings.Split(content, "\n")
	tokenizedLines := make([]TokenizedLine, len(lines))

	lang := t.languages[language]
	if lang == nil {
		lang = t.languages["text"] // fallback
	}

	for i, line := range lines {
		tokenizedLines[i] = TokenizedLine{
			LineNumber: i + 1,
			Tokens:     t.tokenizeLine(line, lang),
			Raw:        line,
		}
	}

	return &TokenizedDocument{
		Language: language,
		Lines:    tokenizedLines,
		Metadata: DocumentMeta{
			TotalLines: len(lines),
			FileSize:   int64(len(content)),
			Language:   language,
		},
	}
}

// tokenizeLine scans a single line, emitting a token for the first rule that
// matches at the current position and skipping one rune when nothing matches.
func (t *Tokenizer) tokenizeLine(line string, lang *Language) []Token {
	if len(line) == 0 {
		return []Token{}
	}

	var tokens []Token
	remaining := line
	offset := 0

	for len(remaining) > 0 {
		matched := false

		// Try each rule in priority order.
		for _, rule := range lang.Rules {
			if loc := rule.Pattern.FindStringIndex(remaining); loc != nil && loc[0] == 0 {
				// Found a match at the beginning of the remaining text.
				matchText := remaining[loc[0]:loc[1]]
				tokens = append(tokens, Token{
					Type:    rule.Token,
					Content: matchText,
					Start:   offset,
					End:     offset + len(matchText),
				})
				remaining = remaining[loc[1]:]
				offset += loc[1]
				matched = true
				break
			}
		}

		if !matched {
			// No rule matched: skip one rune (not one byte) so multi-byte
			// UTF-8 characters are not split mid-sequence.
			_, size := utf8.DecodeRuneInString(remaining)
			remaining = remaining[size:]
			offset += size
		}
	}

	return tokens
}
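
// initializeLanguages registers the built-in language definitions.
//
// NOTE: this method is called by NewTokenizer but its body is not part of the
// original source; what follows is a minimal illustrative sketch, not the
// actual implementation. The only hard requirement implied by the code above
// is that a "text" entry exists, because TokenizeDocument falls back to it.
// The "go" entry and its patterns are hypothetical examples. Rules are listed
// in descending priority so that tokenizeLine, which tries them in slice
// order, sees higher-priority rules first.
func (t *Tokenizer) initializeLanguages() {
	// Plain-text fallback: a single rule that consumes any run of
	// non-whitespace characters as a generic "text" token.
	t.languages["text"] = &Language{
		Name: "text",
		Rules: []Rule{
			{Pattern: regexp.MustCompile(`\S+`), Token: "text", Priority: 0},
		},
	}

	// Hypothetical Go-like language definition with a few example rules:
	// line comments, double-quoted strings, and a handful of keywords.
	t.languages["go"] = &Language{
		Name: "go",
		Rules: []Rule{
			{Pattern: regexp.MustCompile(`//.*`), Token: "comment", Priority: 3},
			{Pattern: regexp.MustCompile(`"(?:[^"\\]|\\.)*"`), Token: "string", Priority: 2},
			{Pattern: regexp.MustCompile(`\b(func|package|import|return|var|type|for|if|else|range)\b`), Token: "keyword", Priority: 1},
		},
	}
}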