aboutsummaryrefslogtreecommitdiff
path: root/tokenizer/tokenizer.go
diff options
context:
space:
mode:
authorJuan J. Martinez <jjm@usebox.net>2022-07-18 07:45:58 +0100
committerJuan J. Martinez <jjm@usebox.net>2022-07-18 07:45:58 +0100
commit8bb321f8b032dfaeffbe3d1b8dfeb215c12d3642 (patch)
treec53977d1284347bb1d5963ddb4dc7723c40c6e55 /tokenizer/tokenizer.go
downloadmicro-lang-8bb321f8b032dfaeffbe3d1b8dfeb215c12d3642.tar.gz
micro-lang-8bb321f8b032dfaeffbe3d1b8dfeb215c12d3642.zip
First public release
Diffstat (limited to 'tokenizer/tokenizer.go')
-rw-r--r--tokenizer/tokenizer.go356
1 files changed, 356 insertions, 0 deletions
diff --git a/tokenizer/tokenizer.go b/tokenizer/tokenizer.go
new file mode 100644
index 0000000..78616c6
--- /dev/null
+++ b/tokenizer/tokenizer.go
@@ -0,0 +1,356 @@
+package tokenizer
+
+import (
+ "bufio"
+ goerr "errors"
+ "fmt"
+ "io"
+ "strconv"
+ "strings"
+ "unicode"
+
+ "usebox.net/lang/errors"
+ "usebox.net/lang/tokens"
+)
+
// rTokenIdMap maps a single rune (operator character) to its token id.
type rTokenIdMap map[rune]tokens.TokenId

// sTokenIdMap maps a string (keyword) to its token id.
type sTokenIdMap map[string]tokens.TokenId
+
var (
	// keywords maps reserved words to their token ids; identifiers not
	// found here are emitted as tokens.Ident (see token()).
	keywords = sTokenIdMap{
		"true":     tokens.True,
		"false":    tokens.False,
		"var":      tokens.Var,
		"const":    tokens.Const,
		"def":      tokens.Def,
		"return":   tokens.Return,
		"number":   tokens.TNumber,
		"bool":     tokens.TBool,
		"string":   tokens.TString,
		"func":     tokens.TFunc,
		"if":       tokens.If,
		"else":     tokens.Else,
		"for":      tokens.For,
		"in":       tokens.In,
		"continue": tokens.Continue,
		"break":    tokens.Break,
	}

	// doubleChar maps the first rune of a two-character operator to the
	// set of accepted second runes and the resulting token ids
	// (e.g. '>' followed by '=' yields tokens.Ge). When the second rune
	// does not match, token() falls through to the singleChar table.
	doubleChar = map[rune]struct {
		second rTokenIdMap
	}{
		'|': {rTokenIdMap{'|': tokens.Or}},
		'&': {rTokenIdMap{'&': tokens.And}},
		'=': {rTokenIdMap{'=': tokens.Eq}},
		'!': {rTokenIdMap{'=': tokens.Ne, '?': tokens.TagE}},
		'>': {rTokenIdMap{'=': tokens.Ge, '>': tokens.BitShr}},
		'<': {rTokenIdMap{'=': tokens.Le, '<': tokens.BitShl}},
	}

	// singleChar maps single-character operators and punctuation to
	// their token ids.
	singleChar = rTokenIdMap{
		'(': tokens.LParen,
		')': tokens.RParen,
		'+': tokens.Add,
		'-': tokens.Sub,
		'*': tokens.Mul,
		'%': tokens.Mod,
		'/': tokens.Div,
		'>': tokens.Gt,
		'<': tokens.Lt,
		'!': tokens.Not,
		'~': tokens.Neg,
		'|': tokens.BitOr,
		'&': tokens.BitAnd,
		'^': tokens.BitXor,
		'=': tokens.Assign,
		'.': tokens.Dot,
		';': tokens.Semicolon,
		',': tokens.Comma,
		'{': tokens.LBrace,
		'}': tokens.RBrace,
		'[': tokens.LBracket,
		']': tokens.RBracket,
		'?': tokens.TestE,
	}
)
+
// Tokenizer reads runes from a buffered input stream and groups them
// into tokens, tracking the source location as it advances.
type Tokenizer struct {
	// input is the buffered reader the tokenizer consumes rune by rune.
	input bufio.Reader

	// loc is the current source position (file, line, column), updated
	// via Inc/Eol/Add as runes are consumed.
	loc tokens.Location
}
+
+func NewTokenizer(filename string, input io.Reader) *Tokenizer {
+ return &Tokenizer{input: *bufio.NewReader(input), loc: tokens.Location{File: filename, Line: 1, Column: 1}}
+}
+
+func (t *Tokenizer) peek() (rune, error) {
+ r, _, err := t.input.ReadRune()
+ if err != nil {
+ t.input.UnreadRune()
+ return rune(0), err
+ }
+ if err := t.input.UnreadRune(); err != nil {
+ return rune(0), err
+ }
+ return r, nil
+}
+
+func (t *Tokenizer) read() (rune, error) {
+ r, _, err := t.input.ReadRune()
+ if err != nil {
+ return rune(0), err
+ }
+ return r, nil
+}
+
// unread pushes the most recently read rune back onto the input so the
// next read/peek returns it again. It delegates to
// bufio.Reader.UnreadRune, so it fails if the last input operation was
// not a successful ReadRune.
func (t *Tokenizer) unread() error {
	return t.input.UnreadRune()
}
+
+func (t *Tokenizer) skipWhitespace() (rune, error) {
+ var r rune
+ var err error
+loop:
+ for {
+ r, err = t.read()
+ if err != nil {
+ break
+ }
+ switch {
+ case !unicode.IsSpace(r):
+ break loop
+ case r == '\n':
+ t.loc.Eol()
+ default:
+ t.loc.Inc()
+ }
+ }
+ return r, err
+}
+
+func (t *Tokenizer) extract(filter func(rune) bool) (string, error) {
+ var buf strings.Builder
+ for {
+ r, err := t.peek()
+ // EOF, error or end of extraction
+ if err == io.EOF || !filter(r) {
+ t.loc.Add(buf.Len())
+ return buf.String(), nil
+ } else if err != nil {
+ return "", err
+ } else {
+ t.read()
+ }
+ if _, err = buf.WriteRune(r); err != nil {
+ return "", err
+ }
+ }
+}
+
+func (t *Tokenizer) extractEscaped(end rune) (string, error) {
+ var buf strings.Builder
+ for {
+ r, err := t.read()
+ t.loc.Inc()
+ if r == end {
+ return buf.String(), nil
+ } else if err == io.EOF {
+ return "", fmt.Errorf("EOF found before closing '%c'", end)
+ } else if err != nil {
+ return "", err
+ } else {
+ if r == '\\' {
+ r, err = t.read()
+ if err == io.EOF {
+ return "", goerr.New("EOF found before completing escape sequence")
+ } else if err != nil {
+ return "", err
+ }
+ switch r {
+ case 'n':
+ r = '\n'
+ case 't':
+ r = '\t'
+ case '\\', end:
+ case 'x':
+ count := 0
+ value, err := t.extract(func(r rune) bool {
+ defer func() {
+ count++
+ }()
+ return unicode.In(r, unicode.ASCII_Hex_Digit) && count != 2
+ })
+ if err != nil {
+ return "", err
+ }
+ if len(value) != 2 {
+ return "", goerr.New("invalid escape sequence")
+ }
+ nval, err := strconv.ParseInt(value, 16, 8)
+ if err != nil {
+ return "", goerr.New("invalid escape sequence")
+ }
+ r = rune(byte(nval))
+ default:
+ return "", goerr.New("invalid escape sequence")
+ }
+ t.loc.Inc()
+ }
+ }
+ if _, err = buf.WriteRune(r); err != nil {
+ return "", err
+ }
+ }
+}
+
// token scans and returns the next token from the input, or an error.
// At end of input it returns a tokens.Eof token (not an error). The
// token's Loc is captured right after whitespace is skipped, so it
// points at the token's first character.
func (t *Tokenizer) token() (tokens.Token, error) {
	var tok tokens.Token
	var r rune
	var err error
	var loc tokens.Location

	// Skip whitespace and line comments until a real token starts.
	for {
		r, err = t.skipWhitespace()
		loc = t.loc
		if err == io.EOF {
			return tokens.Token{Id: tokens.Eof, Loc: loc}, nil
		} else if err != nil {
			return tok, err
		}

		// comments
		if r == '/' {
			if p, err := t.peek(); err == nil && p == '/' {
				// "//" line comment: discard runes up to the newline.
				for {
					r, err = t.read()
					if err != nil {
						// could be EOF
						break
					}
					if r == '\n' {
						t.loc.Eol()
						break
					}
				}
				// check for whitespace
				continue
			}
		}
		// not a comment, skipped whitespace,
		// now process this token
		break
	}

	// identifier or keywords
	if unicode.IsLetter(r) || r == '_' {
		// Put the first rune back so extract sees the whole word.
		t.unread()
		value, err := t.extract(func(r rune) bool {
			return unicode.IsLetter(r) || unicode.IsDigit(r) || r == '_'
		})
		if err != nil {
			return tok, errors.NewErrorWrap(errors.SyntaxError, t.loc, err, err.Error())
		}
		// match keywords
		if tokenId, ok := keywords[value]; ok {
			return tokens.Token{Id: tokenId, Loc: loc, Value: value}, nil
		}
		// otherwise is an identitier
		return tokens.Token{Id: tokens.Ident, Loc: loc, Value: value}, nil
	}

	// character literal
	if r == '\'' {
		t.loc.Inc()
		value, err := t.extractEscaped('\'')
		if err != nil {
			return tok, errors.NewErrorWrap(errors.SyntaxError, t.loc, err, err.Error())
		}
		// exactly one character (after escape decoding)
		if len(value) != 1 {
			return tok, errors.NewError(errors.SyntaxError, t.loc, "invalid character literal")
		}
		return tokens.Token{Id: tokens.Char, Loc: loc, Value: value}, nil
	}

	// numeric literal
	if unicode.IsDigit(r) {
		t.unread()
		// pos tracks the rune index inside the literal so that an 'x'
		// or 'b' is only accepted as the second character ("0x..",
		// "0b.."); hex/bin switch the accepted digit set.
		pos := 0
		hex := false
		bin := false
		value, err := t.extract(func(r rune) bool {
			if pos == 1 && r == 'x' {
				hex = true
				pos++
				return true
			}
			if pos == 1 && r == 'b' {
				bin = true
				pos++
				return true
			}
			if hex && unicode.In(r, unicode.ASCII_Hex_Digit) {
				pos++
				return true
			}
			if bin && r != '0' && r != '1' {
				return false
			}
			pos++
			return unicode.IsDigit(r)
		})
		// a bare "0x"/"0b" prefix with no digits is invalid
		if (hex || bin) && len(value) == 2 {
			err = fmt.Errorf("invalid numeric format '%s'", value)
		}
		if err != nil {
			return tok, errors.NewErrorWrap(errors.SyntaxError, t.loc, err, err.Error())
		}
		return tokens.Token{Id: tokens.Number, Loc: loc, Value: value}, nil
	}

	// string literal
	if r == '"' {
		t.loc.Inc()
		value, err := t.extractEscaped('"')
		if err != nil {
			return tok, errors.NewErrorWrap(errors.SyntaxError, t.loc, err, err.Error())
		}
		return tokens.Token{Id: tokens.String, Loc: loc, Value: value}, nil
	}

	// two character symbols; if the second rune does not match we fall
	// through to the single character table below.
	if d, ok := doubleChar[r]; ok {
		if p, err := t.peek(); err == nil {
			if second, ok := d.second[p]; ok {
				t.read()
				t.loc.Add(2)
				// FIXME: value
				return tokens.Token{Id: second, Loc: loc, Value: string(r) + string(p)}, nil
			}
		}
	}

	// single character symbols
	if tokenId, ok := singleChar[r]; ok {
		t.loc.Inc()
		return tokens.Token{Id: tokenId, Loc: loc, Value: string(r)}, nil
	}

	// invalid token
	return tok, errors.NewError(errors.SyntaxError, t.loc, "invalid character:", fmt.Sprintf("'%c'", r))
}
+
+func (t *Tokenizer) Scan() ([]tokens.Token, error) {
+ var ts []tokens.Token
+
+ for {
+ token, err := t.token()
+ if err != nil {
+ return nil, err
+ }
+ ts = append(ts, token)
+ if token.Id == tokens.Eof {
+ return ts, nil
+ }
+ }
+}