Aided stab at basic tokenizer and api setup.
internal/tokenizer/languages.go  (new file, 106 lines)
@@ -0,0 +1,106 @@
package tokenizer

import "regexp"

func (t *Tokenizer) initializeLanguages() {
	t.addGoLanguage()
	t.addJavaScriptLanguage()
	t.addMarkdownLanguage()
	t.addTextLanguage()
}

func (t *Tokenizer) addGoLanguage() {
	t.languages["go"] = &Language{
		Name: "go",
		Rules: []Rule{
			// Comments
			{regexp.MustCompile(`//.*$`), "comment", 100},
			{regexp.MustCompile(`/\*[\s\S]*?\*/`), "comment", 100},

			// Strings
			{regexp.MustCompile(`"(?:[^"\\]|\\.)*"`), "string", 90},
			{regexp.MustCompile("`[^`]*`"), "string", 90},

			// Keywords
			{regexp.MustCompile(`\b(package|import|func|var|const|type|struct|interface|if|else|for|range|switch|case|default|return|break|continue|go|defer|chan|select)\b`), "keyword", 80},

			// Types
			{regexp.MustCompile(`\b(int|int8|int16|int32|int64|uint|uint8|uint16|uint32|uint64|float32|float64|string|bool|byte|rune|error)\b`), "type", 70},

			// Numbers
			{regexp.MustCompile(`\b\d+(\.\d+)?\b`), "number", 60},

			// Functions
			{regexp.MustCompile(`\b([a-zA-Z_][a-zA-Z0-9_]*)\s*\(`), "function", 50},

			// Default
			{regexp.MustCompile(`[a-zA-Z_][a-zA-Z0-9_]*`), "identifier", 10},
		},
	}
}

func (t *Tokenizer) addMarkdownLanguage() {
	t.languages["markdown"] = &Language{
		Name: "markdown",
		Rules: []Rule{
			// Headers
			{regexp.MustCompile(`^#{1,6}\s+.*$`), "header", 100},

			// Code blocks
			{regexp.MustCompile("```[\\s\\S]*?```"), "code-block", 95},
			{regexp.MustCompile("`[^`]+`"), "code-inline", 90},

			// Bold/Italic
			{regexp.MustCompile(`\*\*[^*]+\*\*`), "bold", 85},
			{regexp.MustCompile(`\*[^*]+\*`), "italic", 80},

			// Links
			{regexp.MustCompile(`\[([^\]]+)\]\([^)]+\)`), "link", 75},

			// Lists
			{regexp.MustCompile(`^\s*[-*+]\s+`), "list-marker", 70},
			{regexp.MustCompile(`^\s*\d+\.\s+`), "list-marker", 70},
		},
	}
}

func (t *Tokenizer) addJavaScriptLanguage() {
	t.languages["javascript"] = &Language{
		Name: "javascript",
		Rules: []Rule{
			// Comments
			{regexp.MustCompile(`//.*$`), "comment", 100},
			{regexp.MustCompile(`/\*[\s\S]*?\*/`), "comment", 100},

			// Strings
			{regexp.MustCompile(`"(?:[^"\\]|\\.)*"`), "string", 90},
			{regexp.MustCompile(`'(?:[^'\\]|\\.)*'`), "string", 90},
			{regexp.MustCompile("`(?:[^`\\\\]|\\\\.)*`"), "string", 90},

			// Keywords
			{regexp.MustCompile(`\b(const|let|var|function|return|if|else|for|while|do|switch|case|default|break|continue|try|catch|finally|throw|new|this|class|extends|import|export|from|async|await)\b`), "keyword", 80},

			// Types/Built-ins
			{regexp.MustCompile(`\b(Object|Array|String|Number|Boolean|Date|RegExp|Error|Promise)\b`), "type", 70},

			// Numbers
			{regexp.MustCompile(`\b\d+(\.\d+)?\b`), "number", 60},

			// Functions
			{regexp.MustCompile(`\b([a-zA-Z_$][a-zA-Z0-9_$]*)\s*\(`), "function", 50},

			// Default
			{regexp.MustCompile(`[a-zA-Z_$][a-zA-Z0-9_$]*`), "identifier", 10},
		},
	}
}

func (t *Tokenizer) addTextLanguage() {
	t.languages["text"] = &Language{
		Name: "text",
		Rules: []Rule{
			// Just return everything as text
			{regexp.MustCompile(`.+`), "text", 1},
		},
	}
}

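Additional grammars can be registered the same way. A minimal sketch for a hypothetical YAML rule set is shown below; the addYAMLLanguage name, the "yaml" key, and the regexes are illustrative and not part of this commit. It would live in this same file and also need a call from initializeLanguages().

// Hypothetical example (not in this commit): a minimal YAML rule set
// following the same addXxxLanguage pattern used above.
func (t *Tokenizer) addYAMLLanguage() {
	t.languages["yaml"] = &Language{
		Name: "yaml",
		Rules: []Rule{
			// Comments
			{regexp.MustCompile(`#.*$`), "comment", 100},
			// Quoted strings
			{regexp.MustCompile(`"(?:[^"\\]|\\.)*"`), "string", 90},
			// Keys at the start of a line ("key:")
			{regexp.MustCompile(`^\s*[A-Za-z_][\w-]*\s*:`), "keyword", 80},
			// Everything else
			{regexp.MustCompile(`\S+`), "text", 10},
		},
	}
}

Note the declaration order: because tokenizeLine tries rules in slice order, each new language should list its rules from highest to lowest priority.
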
internal/tokenizer/tokenizer.go  (new file, 125 lines)
@@ -0,0 +1,125 @@
package tokenizer

import (
	"regexp"
	"strings"
)

type Token struct {
	Type    string `json:"type"`    // "keyword", "string", "comment", etc.
	Content string `json:"content"` // actual text
	Start   int    `json:"start"`   // byte offset within the line
	End     int    `json:"end"`     // byte offset within the line (exclusive)
}

type TokenizedLine struct {
	LineNumber int     `json:"lineNumber"`
	Tokens     []Token `json:"tokens"`
	Raw        string  `json:"raw"` // original line content
}

type TokenizedDocument struct {
	Language string          `json:"language"`
	Lines    []TokenizedLine `json:"lines"`
	Metadata DocumentMeta    `json:"metadata"`
}

type DocumentMeta struct {
	TotalLines int    `json:"totalLines"`
	FileSize   int64  `json:"fileSize"`
	Language   string `json:"language"`
}

type Tokenizer struct {
	languages map[string]*Language
}

type Language struct {
	Name  string
	Rules []Rule
}

type Rule struct {
	Pattern  *regexp.Regexp
	Token    string
	Priority int // rules are declared from highest to lowest priority; this field documents that order
}

func NewTokenizer() *Tokenizer {
	t := &Tokenizer{
		languages: make(map[string]*Language),
	}

	// Initialize with basic languages
	t.initializeLanguages()
	return t
}

func (t *Tokenizer) TokenizeDocument(content, language string) *TokenizedDocument {
	lines := strings.Split(content, "\n")
	tokenizedLines := make([]TokenizedLine, len(lines))

	lang := t.languages[language]
	if lang == nil {
		lang = t.languages["text"] // fallback
	}

	for i, line := range lines {
		tokenizedLines[i] = TokenizedLine{
			LineNumber: i + 1,
			Tokens:     t.tokenizeLine(line, lang),
			Raw:        line,
		}
	}

	return &TokenizedDocument{
		Language: language,
		Lines:    tokenizedLines,
		Metadata: DocumentMeta{
			TotalLines: len(lines),
			FileSize:   int64(len(content)),
			Language:   language,
		},
	}
}

func (t *Tokenizer) tokenizeLine(line string, lang *Language) []Token {
	if len(line) == 0 {
		return []Token{}
	}

	var tokens []Token
	remaining := line
	offset := 0

	for len(remaining) > 0 {
		matched := false

		// Try each rule in declaration order (rules are listed from highest to lowest priority)
		for _, rule := range lang.Rules {
			if loc := rule.Pattern.FindStringIndex(remaining); loc != nil && loc[0] == 0 {
				// Found a match at the beginning
				matchText := remaining[loc[0]:loc[1]]
				tokens = append(tokens, Token{
					Type:    rule.Token,
					Content: matchText,
					Start:   offset,
					End:     offset + len(matchText),
				})

				remaining = remaining[loc[1]:]
				offset += loc[1]
				matched = true
				break
			}
		}

		if !matched {
			// No rule matched, skip one byte and keep scanning
			remaining = remaining[1:]
			offset++
		}
	}

	return tokens
}

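Assuming the two files above compile as the tokenizer package, a minimal usage sketch might look like the following. The calling package, the import path, and the sample input are illustrative only and not part of this commit.

package main

import (
	"encoding/json"
	"fmt"

	"example.com/project/internal/tokenizer" // import path is a placeholder
)

func main() {
	tok := tokenizer.NewTokenizer()

	src := "package main\n\nfunc greet() { fmt.Println(\"hi\") }"
	doc := tok.TokenizeDocument(src, "go")

	// Unknown languages fall back to the plain "text" rules.
	_ = tok.TokenizeDocument("just some prose", "cobol")

	// The JSON tags on the structs make the result straightforward to return from an API.
	out, _ := json.MarshalIndent(doc, "", "  ")
	fmt.Println(string(out))
}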