author    Juan J. Martinez <jjm@usebox.net>  2022-07-18 07:45:58 +0100
committer Juan J. Martinez <jjm@usebox.net>  2022-07-18 07:45:58 +0100
commit    8bb321f8b032dfaeffbe3d1b8dfeb215c12d3642 (patch)
tree      c53977d1284347bb1d5963ddb4dc7723c40c6e55 /tokenizer
First public release
Diffstat (limited to 'tokenizer')
-rw-r--r--  tokenizer/tokenizer.go       387
-rw-r--r--  tokenizer/tokenizer_test.go  310
2 files changed, 697 insertions(+), 0 deletions(-)
diff --git a/tokenizer/tokenizer.go b/tokenizer/tokenizer.go
new file mode 100644
index 0000000..78616c6
--- /dev/null
+++ b/tokenizer/tokenizer.go
@@ -0,0 +1,387 @@
+package tokenizer
+
+import (
+ "bufio"
+ goerr "errors"
+ "fmt"
+ "io"
+ "strconv"
+ "strings"
+ "unicode"
+
+ "usebox.net/lang/errors"
+ "usebox.net/lang/tokens"
+)
+
+type rTokenIdMap map[rune]tokens.TokenId
+type sTokenIdMap map[string]tokens.TokenId
+
+var (
+ keywords = sTokenIdMap{
+ "true": tokens.True,
+ "false": tokens.False,
+ "var": tokens.Var,
+ "const": tokens.Const,
+ "def": tokens.Def,
+ "return": tokens.Return,
+ "number": tokens.TNumber,
+ "bool": tokens.TBool,
+ "string": tokens.TString,
+ "func": tokens.TFunc,
+ "if": tokens.If,
+ "else": tokens.Else,
+ "for": tokens.For,
+ "in": tokens.In,
+ "continue": tokens.Continue,
+ "break": tokens.Break,
+ }
+
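+ // doubleChar maps the first rune of a two-character symbol to the
+ // accepted second runes and their resulting token ids.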
+ doubleChar = map[rune]struct {
+ second rTokenIdMap
+ }{
+ '|': {rTokenIdMap{'|': tokens.Or}},
+ '&': {rTokenIdMap{'&': tokens.And}},
+ '=': {rTokenIdMap{'=': tokens.Eq}},
+ '!': {rTokenIdMap{'=': tokens.Ne, '?': tokens.TagE}},
+ '>': {rTokenIdMap{'=': tokens.Ge, '>': tokens.BitShr}},
+ '<': {rTokenIdMap{'=': tokens.Le, '<': tokens.BitShl}},
+ }
+
+ singleChar = rTokenIdMap{
+ '(': tokens.LParen,
+ ')': tokens.RParen,
+ '+': tokens.Add,
+ '-': tokens.Sub,
+ '*': tokens.Mul,
+ '%': tokens.Mod,
+ '/': tokens.Div,
+ '>': tokens.Gt,
+ '<': tokens.Lt,
+ '!': tokens.Not,
+ '~': tokens.Neg,
+ '|': tokens.BitOr,
+ '&': tokens.BitAnd,
+ '^': tokens.BitXor,
+ '=': tokens.Assign,
+ '.': tokens.Dot,
+ ';': tokens.Semicolon,
+ ',': tokens.Comma,
+ '{': tokens.LBrace,
+ '}': tokens.RBrace,
+ '[': tokens.LBracket,
+ ']': tokens.RBracket,
+ '?': tokens.TestE,
+ }
+)
+
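+// Tokenizer reads runes from an input stream and groups them into
+// tokens, tracking the current location (file, line, column).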
+type Tokenizer struct {
+ input *bufio.Reader
+
+ loc tokens.Location
+}
+
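+// NewTokenizer returns a Tokenizer reading from input, reporting token
+// locations against filename starting at line 1, column 1.
+//
+// A minimal usage sketch (mirroring how the tests drive it):
+//
+//	ts, err := NewTokenizer("-", strings.NewReader("1 + 2")).Scan()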
+func NewTokenizer(filename string, input io.Reader) *Tokenizer {
+ return &Tokenizer{input: *bufio.NewReader(input), loc: tokens.Location{File: filename, Line: 1, Column: 1}}
+}
+
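+// peek returns the next rune without consuming it.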
+func (t *Tokenizer) peek() (rune, error) {
+ r, _, err := t.input.ReadRune()
+ if err != nil {
+ return rune(0), err
+ }
+ if err := t.input.UnreadRune(); err != nil {
+ return rune(0), err
+ }
+ return r, nil
+}
+
+func (t *Tokenizer) read() (rune, error) {
+ r, _, err := t.input.ReadRune()
+ if err != nil {
+ return rune(0), err
+ }
+ return r, nil
+}
+
+func (t *Tokenizer) unread() error {
+ return t.input.UnreadRune()
+}
+
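+// skipWhitespace consumes whitespace, updating the location on each
+// newline or other space, and returns the first non-space rune read.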
+func (t *Tokenizer) skipWhitespace() (rune, error) {
+ var r rune
+ var err error
+loop:
+ for {
+ r, err = t.read()
+ if err != nil {
+ break
+ }
+ switch {
+ case !unicode.IsSpace(r):
+ break loop
+ case r == '\n':
+ t.loc.Eol()
+ default:
+ t.loc.Inc()
+ }
+ }
+ return r, err
+}
+
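+// extract accumulates consecutive runes accepted by filter, stopping at
+// (and not consuming) the first rejected rune. It is used to scan
+// identifiers, numbers and the hex digits of \x escapes.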
+func (t *Tokenizer) extract(filter func(rune) bool) (string, error) {
+ var buf strings.Builder
+ for {
+ r, err := t.peek()
+ // a real (non-EOF) error must be reported, not swallowed
+ if err != nil && err != io.EOF {
+ return "", err
+ }
+ // EOF or end of extraction
+ if err == io.EOF || !filter(r) {
+ // columns count runes, not bytes
+ t.loc.Add(utf8.RuneCountInString(buf.String()))
+ return buf.String(), nil
+ }
+ t.read()
+ if _, err = buf.WriteRune(r); err != nil {
+ return "", err
+ }
+ }
+}
+
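+// extractEscaped reads runes until the unescaped end rune is found,
+// resolving the escape sequences \n, \t, \\, \<end> and \xHH (exactly
+// two hex digits). It is used for character and string literals.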
+func (t *Tokenizer) extractEscaped(end rune) (string, error) {
+ var buf strings.Builder
+ for {
+ r, err := t.read()
+ if err == io.EOF {
+ return "", fmt.Errorf("EOF found before closing '%c'", end)
+ } else if err != nil {
+ return "", err
+ }
+ t.loc.Inc()
+ if r == end {
+ return buf.String(), nil
+ }
+ if r == '\\' {
+ r, err = t.read()
+ if err == io.EOF {
+ return "", goerr.New("EOF found before completing escape sequence")
+ } else if err != nil {
+ return "", err
+ }
+ switch r {
+ case 'n':
+ r = '\n'
+ case 't':
+ r = '\t'
+ case '\\', end:
+ case 'x':
+ count := 0
+ value, err := t.extract(func(r rune) bool {
+ defer func() {
+ count++
+ }()
+ return unicode.In(r, unicode.ASCII_Hex_Digit) && count != 2
+ })
+ if err != nil {
+ return "", err
+ }
+ if len(value) != 2 {
+ return "", goerr.New("invalid escape sequence")
+ }
+ // ParseUint, so escapes \x80 to \xff are accepted too
+ nval, err := strconv.ParseUint(value, 16, 8)
+ if err != nil {
+ return "", goerr.New("invalid escape sequence")
+ }
+ r = rune(nval)
+ default:
+ return "", goerr.New("invalid escape sequence")
+ }
+ t.loc.Inc()
+ }
+ if _, err = buf.WriteRune(r); err != nil {
+ return "", err
+ }
+ }
+}
+
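+// token scans and returns the next token. Whitespace and // comments
+// are skipped, then the input is matched against, in order: identifiers
+// and keywords, character literals, numeric literals (decimal, 0x hex,
+// 0b binary), string literals, two-character symbols and single
+// character symbols. Anything else is a syntax error.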
+func (t *Tokenizer) token() (tokens.Token, error) {
+ var tok tokens.Token
+ var r rune
+ var err error
+ var loc tokens.Location
+
+ for {
+ r, err = t.skipWhitespace()
+ loc = t.loc
+ if err == io.EOF {
+ return tokens.Token{Id: tokens.Eof, Loc: loc}, nil
+ } else if err != nil {
+ return tok, err
+ }
+
+ // comments
+ if r == '/' {
+ if p, err := t.peek(); err == nil && p == '/' {
+ for {
+ r, err = t.read()
+ if err != nil {
+ // could be EOF
+ break
+ }
+ if r == '\n' {
+ t.loc.Eol()
+ break
+ }
+ }
+ // loop again to skip whitespace after the comment
+ continue
+ }
+ }
+ // not a comment and whitespace has been skipped,
+ // so process this token
+ break
+ }
+
+ // identifier or keywords
+ if unicode.IsLetter(r) || r == '_' {
+ t.unread()
+ value, err := t.extract(func(r rune) bool {
+ return unicode.IsLetter(r) || unicode.IsDigit(r) || r == '_'
+ })
+ if err != nil {
+ return tok, errors.NewErrorWrap(errors.SyntaxError, t.loc, err, err.Error())
+ }
+ // match keywords
+ if tokenId, ok := keywords[value]; ok {
+ return tokens.Token{Id: tokenId, Loc: loc, Value: value}, nil
+ }
+ // otherwise it is an identifier
+ return tokens.Token{Id: tokens.Ident, Loc: loc, Value: value}, nil
+ }
+
+ // character literal
+ if r == '\'' {
+ t.loc.Inc()
+ value, err := t.extractEscaped('\'')
+ if err != nil {
+ return tok, errors.NewErrorWrap(errors.SyntaxError, t.loc, err, err.Error())
+ }
+ if len(value) != 1 {
+ return tok, errors.NewError(errors.SyntaxError, t.loc, "invalid character literal")
+ }
+ return tokens.Token{Id: tokens.Char, Loc: loc, Value: value}, nil
+ }
+
+ // numeric literal
+ if unicode.IsDigit(r) {
+ t.unread()
+ pos := 0
+ hex := false
+ bin := false
+ value, err := t.extract(func(r rune) bool {
+ if pos == 1 && r == 'x' {
+ hex = true
+ pos++
+ return true
+ }
+ if pos == 1 && r == 'b' {
+ bin = true
+ pos++
+ return true
+ }
+ if hex && unicode.In(r, unicode.ASCII_Hex_Digit) {
+ pos++
+ return true
+ }
+ if bin && r != '0' && r != '1' {
+ return false
+ }
+ pos++
+ return unicode.IsDigit(r)
+ })
+ if (hex || bin) && len(value) == 2 {
+ err = fmt.Errorf("invalid numeric format '%s'", value)
+ }
+ if err != nil {
+ return tok, errors.NewErrorWrap(errors.SyntaxError, t.loc, err, err.Error())
+ }
+ return tokens.Token{Id: tokens.Number, Loc: loc, Value: value}, nil
+ }
+
+ // string literal
+ if r == '"' {
+ t.loc.Inc()
+ value, err := t.extractEscaped('"')
+ if err != nil {
+ return tok, errors.NewErrorWrap(errors.SyntaxError, t.loc, err, err.Error())
+ }
+ return tokens.Token{Id: tokens.String, Loc: loc, Value: value}, nil
+ }
+
+ // two character symbols
+ if d, ok := doubleChar[r]; ok {
+ if p, err := t.peek(); err == nil {
+ if second, ok := d.second[p]; ok {
+ t.read()
+ t.loc.Add(2)
+ // FIXME: value
+ return tokens.Token{Id: second, Loc: loc, Value: string(r) + string(p)}, nil
+ }
+ }
+ }
+
+ // single character symbols
+ if tokenId, ok := singleChar[r]; ok {
+ t.loc.Inc()
+ return tokens.Token{Id: tokenId, Loc: loc, Value: string(r)}, nil
+ }
+
+ // invalid token
+ return tok, errors.NewError(errors.SyntaxError, t.loc, "invalid character:", fmt.Sprintf("'%c'", r))
+}
+
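+// Scan tokenizes the whole input, returning all tokens up to and
+// including Eof, or the first error found.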
+func (t *Tokenizer) Scan() ([]tokens.Token, error) {
+ var ts []tokens.Token
+
+ for {
+ token, err := t.token()
+ if err != nil {
+ return nil, err
+ }
+ ts = append(ts, token)
+ if token.Id == tokens.Eof {
+ return ts, nil
+ }
+ }
+}
diff --git a/tokenizer/tokenizer_test.go b/tokenizer/tokenizer_test.go
new file mode 100644
index 0000000..47afebb
--- /dev/null
+++ b/tokenizer/tokenizer_test.go
@@ -0,0 +1,310 @@
+package tokenizer
+
+import (
+ "strings"
+ "testing"
+
+ "usebox.net/lang/tokens"
+)
+
+func TestSkipWhitespace(t *testing.T) {
+ tzr := NewTokenizer("-", strings.NewReader(" \t\r\n"))
+ tok, err := tzr.token()
+ if err != nil {
+ t.Errorf("unexpected error %s", err)
+ }
+ if tok.Id != tokens.Eof {
+ t.Errorf("Eof expected, got %s", tok)
+ }
+ if tok.Loc.Line != 2 {
+ t.Errorf("line == 2 expected, got %s", tok.Loc)
+ }
+ if tok.Loc.Column != 1 {
+ t.Errorf("column == 1 expected, got %s", tok.Loc)
+ }
+}
+
+func TestComments(t *testing.T) {
+ for _, tt := range []struct {
+ name string
+ input string
+ line int
+ col int
+ }{
+ {"single comment", "// a comment\n", 2, 1},
+ {"ignore tokens", "// 1 + 2 is ignored\n", 2, 1},
+ {"skip whitespace", " // a comment\n", 2, 1},
+ {"comment to eof", "\n// comment to eof", 2, 1},
+ {"multiple comments", "// comment\n// another comment", 2, 1},
+ {"whitespace before comment", "\t// comment with whitespace\n\t// comment\n", 3, 1},
+ {"unicode", "// こんにちは\n", 2, 1},
+ } {
+ t.Run(tt.name, func(t *testing.T) {
+ tzr := NewTokenizer("-", strings.NewReader(tt.input))
+ tok, err := tzr.token()
+ if err != nil {
+ t.Errorf("unexpected error %s", err)
+ }
+ if tok.Id != tokens.Eof {
+ t.Errorf("Eof expected, got %s", tok)
+ }
+ if tok.Loc.Line != tt.line {
+ t.Errorf("line == %d expected, got %s", tt.line, tok.Loc)
+ }
+ if tok.Loc.Column != tt.col {
+ t.Errorf("column == %d expected, got %s", tt.col, tok.Loc)
+ }
+ })
+ }
+}
+
+func TestIdent(t *testing.T) {
+ for _, tt := range []string{
+ "ident",
+ "MyIdent",
+ "ident2",
+ "名前",
+ "__add",
+ } {
+ t.Run(tt, func(t *testing.T) {
+ tzr := NewTokenizer("-", strings.NewReader(tt))
+ tok, err := tzr.token()
+ if err != nil {
+ t.Errorf("unexpected error %s", err)
+ }
+ if tok.Id != tokens.Ident {
+ t.Errorf("Ident expected, got %s", tok)
+ }
+ if tok.Value != tt {
+ t.Errorf("value == %s expected, got %s", tt, tok.Value)
+ }
+ })
+ }
+}
+
+func TestMultipleCalls(t *testing.T) {
+ tzr := NewTokenizer("-", strings.NewReader("// comment\nident // with a comment\nanother\n"))
+ for _, tt := range []string{
+ "ident", "another",
+ } {
+ tok, err := tzr.token()
+ if err != nil {
+ t.Errorf("unexpected error %s", err)
+ }
+ if tok.Id != tokens.Ident {
+ t.Errorf("Ident expected, got %s", tok)
+ }
+ if tok.Value != tt {
+ t.Errorf("value == %s expected, got %s", tt, tok.Value)
+ }
+ }
+}
+
+func TestKeywords(t *testing.T) {
+ tzr := NewTokenizer("-", strings.NewReader("true false var const def return number bool string func if else for in continue break"))
+ for _, tt := range []tokens.TokenId{
+ tokens.True,
+ tokens.False,
+ tokens.Var,
+ tokens.Const,
+ tokens.Def,
+ tokens.Return,
+ tokens.TNumber,
+ tokens.TBool,
+ tokens.TString,
+ tokens.TFunc,
+ tokens.If,
+ tokens.Else,
+ tokens.For,
+ tokens.In,
+ tokens.Continue,
+ tokens.Break,
+ } {
+ tok, err := tzr.token()
+ if err != nil {
+ t.Errorf("unexpected error %s", err)
+ }
+ if tok.Id != tt {
+ t.Errorf("%s expected, got %s", tt, tok)
+ }
+ }
+}
+
+func TestNumber(t *testing.T) {
+ for _, tt := range []string{
+ "1234",
+ "0x4d2",
+ "0b10011010010",
+ } {
+ tzr := NewTokenizer("-", strings.NewReader(tt))
+ tok, err := tzr.token()
+ if err != nil {
+ t.Errorf("unexpected error %s", err)
+ }
+ if tok.Id != tokens.Number {
+ t.Errorf("Number expected, got %s", tok)
+ }
+ if tok.Value != tt {
+ t.Errorf("value == %s expected, got '%s'", tt, tok.Value)
+ }
+ }
+}
+
+func TestCharacter(t *testing.T) {
+ for tt, e := range map[string]string{
+ "'a'": "a",
+ "'0'": "0",
+ "'\\''": "'",
+ "' '": " ",
+ "'\\n'": "\n",
+ "'\\x0d'": "\r",
+ } {
+ tzr := NewTokenizer("-", strings.NewReader(tt))
+ tok, err := tzr.token()
+ if err != nil {
+ t.Errorf("unexpected error %s", err)
+ }
+ if tok.Id != tokens.Char {
+ t.Errorf("Number expected, got %s", tok)
+ }
+ if tok.Value != e {
+ t.Errorf("value == %s expected, got '%s'", e, tok.Value)
+ }
+ }
+}
+
+func TestErrorCharacter(t *testing.T) {
+ for _, tt := range []string{
+ "'12'",
+ "''",
+ "'\\'",
+ "'A",
+ "'世'",
+ "'\\x0'",
+ } {
+ tzr := NewTokenizer("-", strings.NewReader(tt))
+ _, err := tzr.token()
+ if err == nil {
+ t.Errorf("expected error, didn't happen (input: %s)", tt)
+ }
+ }
+}
+
+func TestString(t *testing.T) {
+ for tt, v := range map[string]string{
+ "\"this is a string\"": "this is a string",
+ "\"0.1234\"": "0.1234",
+ "\"\\\"escaped\\\" string\"": "\"escaped\" string",
+ "\"\\n\\x0d\\t\"": "\n\r\t",
+ "\"Multiline\\nstring\"": "Multiline\nstring",
+ } {
+ t.Run(tt, func(t *testing.T) {
+ tzr := NewTokenizer("-", strings.NewReader(tt))
+ tok, err := tzr.token()
+ if err != nil {
+ t.Errorf("unexpected error %s", err)
+ }
+ if tok.Id != tokens.String {
+ t.Errorf("String expected, got \"%s\"", tok)
+ }
+ if tok.Value != v {
+ t.Errorf("value == %s expected, got \"%s\"", tt, tok.Value)
+ }
+ })
+ }
+}
+
+func TestErrorString(t *testing.T) {
+ tzr := NewTokenizer("-", strings.NewReader("\"string and EOF"))
+ _, err := tzr.token()
+ if err == nil {
+ t.Errorf("expected error, didn't happen")
+ }
+}
+
+func TestSingleChar(t *testing.T) {
+ tzr := NewTokenizer("-", strings.NewReader("{ } ( ) [ ] ; , + - * % / . = > < ! ~ | & ^ ?"))
+ for _, tt := range []tokens.TokenId{
+ tokens.LBrace,
+ tokens.RBrace,
+ tokens.LParen,
+ tokens.RParen,
+ tokens.LBracket,
+ tokens.RBracket,
+ tokens.Semicolon,
+ tokens.Comma,
+ tokens.Add,
+ tokens.Sub,
+ tokens.Mul,
+ tokens.Mod,
+ tokens.Div,
+ tokens.Dot,
+ tokens.Assign,
+ tokens.Gt,
+ tokens.Lt,
+ tokens.Not,
+ tokens.Neg,
+ tokens.BitOr,
+ tokens.BitAnd,
+ tokens.BitXor,
+ tokens.TestE,
+ } {
+ tok, err := tzr.token()
+ if err != nil {
+ t.Errorf("unexpected error %s", err)
+ }
+ if tok.Id != tt {
+ t.Errorf("%s expected, got %s", tt, tok)
+ }
+ }
+}
+
+func TestDoubleChar(t *testing.T) {
+ tzr := NewTokenizer("-", strings.NewReader("|| && == != >= <= >> << !?"))
+ for _, tt := range []tokens.TokenId{
+ tokens.Or,
+ tokens.And,
+ tokens.Eq,
+ tokens.Ne,
+ tokens.Ge,
+ tokens.Le,
+ tokens.BitShr,
+ tokens.BitShl,
+ tokens.TagE,
+ } {
+ tok, err := tzr.token()
+ if err != nil {
+ t.Errorf("unexpected error %s", err)
+ }
+ if tok.Id != tt {
+ t.Errorf("%s expected, got %s", tt, tok)
+ }
+ }
+}
+
+func TestScan(t *testing.T) {
+ for _, tt := range []struct {
+ name string
+ input string
+ ntokens int
+ }{
+ {"single line", "1 + 2", 4},
+ {"multiple lines", "1 + 2\nident", 5},
+ {"line starts with whitespace", "1 + 2\n\tident", 5},
+ } {
+ t.Run(tt.name, func(t *testing.T) {
+ tzr := NewTokenizer("-", strings.NewReader(tt.input))
+ ts, err := tzr.Scan()
+ if err != nil {
+ t.Errorf("unexpected error %s", err)
+ }
+ if len(ts) != tt.ntokens {
+ t.Errorf("%d tokens expected, got %d", tt.ntokens, len(ts))
+ }
+ last := ts[len(ts)-1]
+ if last.Id != tokens.Eof {
+ t.Errorf("last token expected to be Eof, got %s", last)
+ }
+ })
+ }
+}
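
A minimal program exercising this tokenizer might look like the sketch
below (a hypothetical example, not part of this commit, assuming the
usebox.net/lang module layout implied by the imports above):

package main

import (
    "fmt"
    "os"
    "strings"

    "usebox.net/lang/tokenizer"
)

func main() {
    // tokenize an illustrative one-line input
    tzr := tokenizer.NewTokenizer("example", strings.NewReader("var a = 1 + 2;"))
    ts, err := tzr.Scan()
    if err != nil {
        fmt.Fprintln(os.Stderr, err)
        os.Exit(1)
    }
    // print one token per line, ending with Eof
    for _, tok := range ts {
        fmt.Println(tok)
    }
}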