Aided stab at basic tokenizer and api setup.

2025-12-19 18:46:03 -07:00
parent 1e90cbbb39
commit 7686b1b9f0
9 changed files with 446 additions and 1 deletion

internal/api/handlers.go

@@ -0,0 +1,69 @@
package api
import (
"encoding/json"
"net/http"
"os"
"path/filepath"
"strconv"
"github.com/smjklake/glancr/internal/tokenizer"
)
type API struct {
tokenizer *tokenizer.Tokenizer
}
func NewAPI() *API {
return &API{
tokenizer: tokenizer.NewTokenizer(),
}
}
// GET /api/file?path=example.go&start=1&end=100
func (a *API) HandleFile(w http.ResponseWriter, r *http.Request) {
filePath := r.URL.Query().Get("path")
startLine := r.URL.Query().Get("start")
endLine := r.URL.Query().Get("end")
// Read the file from disk (the path is used as-is for now, with no sanitization)
content, err := os.ReadFile(filePath)
if err != nil {
http.Error(w, "File not found", http.StatusNotFound)
return
}
// Detect language from extension
language := detectLanguage(filepath.Ext(filePath))
// Tokenize
doc := a.tokenizer.TokenizeDocument(string(content), language)
// Handle range requests for virtual scrolling
if startLine != "" && endLine != "" {
start, errStart := strconv.Atoi(startLine)
end, errEnd := strconv.Atoi(endLine)
// Only slice when both bounds parse and form a valid 1-based, inclusive range
if errStart == nil && errEnd == nil && start > 0 && start <= end && end <= len(doc.Lines) {
doc.Lines = doc.Lines[start-1 : end]
}
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(doc)
}
func detectLanguage(ext string) string {
switch ext {
case ".go":
return "go"
case ".js", ".jsx":
return "javascript"
case ".ts", ".tsx":
return "javascript" // Can add typescript later
case ".md":
return "markdown"
default:
return "text"
}
}
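
For reference, a minimal sketch of how this handler could be served. The entry-point program, mux setup, and port are assumptions for illustration (only the /api/file route comes from the comment in HandleFile); none of this is part of the commit, and it would have to live inside the glancr module since internal packages are not importable from outside.

package main

import (
	"log"
	"net/http"

	"github.com/smjklake/glancr/internal/api"
)

func main() {
	a := api.NewAPI()
	mux := http.NewServeMux()
	// Route taken from the handler's comment; everything else here is assumed.
	mux.HandleFunc("/api/file", a.HandleFile)
	log.Println("glancr API listening on :8080")
	log.Fatal(http.ListenAndServe(":8080", mux))
}

A request would then look like: curl "http://localhost:8080/api/file?path=internal/api/handlers.go&start=1&end=20"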


@@ -0,0 +1,106 @@
package tokenizer
import "regexp"
func (t *Tokenizer) initializeLanguages() {
t.addGoLanguage()
t.addJavaScriptLanguage()
t.addMarkdownLanguage()
t.addTextLanguage()
}
func (t *Tokenizer) addGoLanguage() {
t.languages["go"] = &Language{
Name: "go",
Rules: []Rule{
// Comments
{regexp.MustCompile(`//.*$`), "comment", 100},
{regexp.MustCompile(`/\*[\s\S]*?\*/`), "comment", 100},
// Strings
{regexp.MustCompile(`"(?:[^"\\]|\\.)*"`), "string", 90},
{regexp.MustCompile("`[^`]*`"), "string", 90},
// Keywords
{regexp.MustCompile(`\b(package|import|func|var|const|type|map|struct|interface|if|else|for|range|switch|case|default|fallthrough|return|break|continue|goto|go|defer|chan|select)\b`), "keyword", 80}, // all 25 Go keywords
// Types
{regexp.MustCompile(`\b(int|int8|int16|int32|int64|uint|uint8|uint16|uint32|uint64|float32|float64|string|bool|byte|rune|error)\b`), "type", 70},
// Numbers
{regexp.MustCompile(`\b\d+(\.\d+)?\b`), "number", 60},
// Functions
{regexp.MustCompile(`\b([a-zA-Z_][a-zA-Z0-9_]*)\s*\(`), "function", 50},
// Default
{regexp.MustCompile(`[a-zA-Z_][a-zA-Z0-9_]*`), "identifier", 10},
},
}
}
func (t *Tokenizer) addMarkdownLanguage() {
t.languages["markdown"] = &Language{
Name: "markdown",
Rules: []Rule{
// Headers
{regexp.MustCompile(`^#{1,6}\s+.*$`), "header", 100},
// Code blocks
{regexp.MustCompile("```[\\s\\S]*?```"), "code-block", 95},
{regexp.MustCompile("`[^`]+`"), "code-inline", 90},
// Bold/Italic
{regexp.MustCompile(`\*\*[^*]+\*\*`), "bold", 85},
{regexp.MustCompile(`\*[^*]+\*`), "italic", 80},
// Links
{regexp.MustCompile(`\[([^\]]+)\]\([^)]+\)`), "link", 75},
// Lists
{regexp.MustCompile(`^\s*[-*+]\s+`), "list-marker", 70},
{regexp.MustCompile(`^\s*\d+\.\s+`), "list-marker", 70},
},
}
}
func (t *Tokenizer) addJavaScriptLanguage() {
t.languages["javascript"] = &Language{
Name: "javascript",
Rules: []Rule{
// Comments
{regexp.MustCompile(`//.*$`), "comment", 100},
{regexp.MustCompile(`/\*[\s\S]*?\*/`), "comment", 100},
// Strings
{regexp.MustCompile(`"(?:[^"\\]|\\.)*"`), "string", 90},
{regexp.MustCompile(`'(?:[^'\\]|\\.)*'`), "string", 90},
{regexp.MustCompile("`(?:[^`\\\\]|\\\\.)*`"), "string", 90}, // template literals; backslashes doubled in the interpreted string so the character class stays closed
// Keywords
{regexp.MustCompile(`\b(const|let|var|function|return|if|else|for|while|do|switch|case|default|break|continue|try|catch|finally|throw|new|this|class|extends|import|export|from|async|await)\b`), "keyword", 80},
// Types/Built-ins
{regexp.MustCompile(`\b(Object|Array|String|Number|Boolean|Date|RegExp|Error|Promise)\b`), "type", 70},
// Numbers
{regexp.MustCompile(`\b\d+(\.\d+)?\b`), "number", 60},
// Functions
{regexp.MustCompile(`\b([a-zA-Z_$][a-zA-Z0-9_$]*)\s*\(`), "function", 50},
// Default
{regexp.MustCompile(`[a-zA-Z_$][a-zA-Z0-9_$]*`), "identifier", 10},
},
}
}
func (t *Tokenizer) addTextLanguage() {
t.languages["text"] = &Language{
Name: "text",
Rules: []Rule{
// Just return everything as text
{regexp.MustCompile(`.+`), "text", 1},
},
}
}
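
As a possible follow-up to the "Can add typescript later" note in detectLanguage, a rough, hypothetical sketch of a dedicated TypeScript rule set (not part of this commit). It reuses the JavaScript rules, so it would have to run after addJavaScriptLanguage in initializeLanguages:

func (t *Tokenizer) addTypeScriptLanguage() {
	// Prepend TS-only keywords; because tokenizeLine only accepts matches at
	// position 0, the comment and string rules still win on lines that start
	// with // or a quote even though they appear later in the slice.
	rules := []Rule{
		{regexp.MustCompile(`\b(interface|enum|implements|namespace|declare|readonly|public|private|protected)\b`), "keyword", 85},
	}
	rules = append(rules, t.languages["javascript"].Rules...)
	t.languages["typescript"] = &Language{Name: "typescript", Rules: rules}
}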


@@ -0,0 +1,125 @@
package tokenizer
import (
"regexp"
"strings"
)
type Token struct {
Type string `json:"type"` // "keyword", "string", "comment", etc.
Content string `json:"content"` // actual text
Start int `json:"start"` // byte offset within the line (inclusive)
End int `json:"end"` // byte offset within the line (exclusive)
}
type TokenizedLine struct {
LineNumber int `json:"lineNumber"`
Tokens []Token `json:"tokens"`
Raw string `json:"raw"` // original line content
}
type TokenizedDocument struct {
Language string `json:"language"`
Lines []TokenizedLine `json:"lines"`
Metadata DocumentMeta `json:"metadata"`
}
type DocumentMeta struct {
TotalLines int `json:"totalLines"`
FileSize int64 `json:"fileSize"`
Language string `json:"language"`
}
type Tokenizer struct {
languages map[string]*Language
}
type Language struct {
Name string
Rules []Rule
}
type Rule struct {
Pattern *regexp.Regexp
Token string
Priority int // informational for now; rules are evaluated in slice order, so higher-priority rules should be listed first
}
func NewTokenizer() *Tokenizer {
t := &Tokenizer{
languages: make(map[string]*Language),
}
// Initialize with basic languages
t.initializeLanguages()
return t
}
func (t *Tokenizer) TokenizeDocument(content, language string) *TokenizedDocument {
lines := strings.Split(content, "\n")
tokenizedLines := make([]TokenizedLine, len(lines))
lang := t.languages[language]
if lang == nil {
lang = t.languages["text"] // fallback
}
for i, line := range lines {
tokenizedLines[i] = TokenizedLine{
LineNumber: i + 1,
Tokens: t.tokenizeLine(line, lang),
Raw: line,
}
}
return &TokenizedDocument{
Language: language,
Lines: tokenizedLines,
Metadata: DocumentMeta{
TotalLines: len(lines),
FileSize: int64(len(content)),
Language: language,
},
}
}
func (t *Tokenizer) tokenizeLine(line string, lang *Language) []Token {
if len(line) == 0 {
return []Token{}
}
var tokens []Token
remaining := line
offset := 0
for len(remaining) > 0 {
matched := false
// Try each rule in slice order (rule sets list higher-priority rules first)
for _, rule := range lang.Rules {
if loc := rule.Pattern.FindStringIndex(remaining); loc != nil && loc[0] == 0 {
// Found a match at the beginning
matchText := remaining[loc[0]:loc[1]]
tokens = append(tokens, Token{
Type: rule.Token,
Content: matchText,
Start: offset,
End: offset + len(matchText),
})
remaining = remaining[loc[1]:]
offset += loc[1]
matched = true
break
}
}
if !matched {
// No rule matched: skip one byte; skipped characters are not emitted as tokens and survive only in the line's Raw field
remaining = remaining[1:]
offset++
}
}
return tokens
}
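
End to end, the tokenizer can be exercised on its own. A minimal sketch (a throwaway program that would also have to live inside the glancr module; the input line is made up for illustration):

package main

import (
	"encoding/json"
	"fmt"

	"github.com/smjklake/glancr/internal/tokenizer"
)

func main() {
	t := tokenizer.NewTokenizer()
	doc := t.TokenizeDocument("x := 42 // answer", "go")
	out, _ := json.MarshalIndent(doc, "", "  ")
	fmt.Println(string(out))
	// The single line comes back as one TokenizedLine whose tokens are roughly:
	//   identifier "x", number "42", comment "// answer"
	// The ":=" and the spaces are skipped by tokenizeLine and survive only in "raw".
}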