From 8bb321f8b032dfaeffbe3d1b8dfeb215c12d3642 Mon Sep 17 00:00:00 2001
From: "Juan J. Martinez"
Date: Mon, 18 Jul 2022 07:45:58 +0100
Subject: First public release

---
 tokenizer/tokenizer.go      | 356 ++++++++++++++++++++++++++++++++++++++++++++
 tokenizer/tokenizer_test.go | 310 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 666 insertions(+)
 create mode 100644 tokenizer/tokenizer.go
 create mode 100644 tokenizer/tokenizer_test.go

diff --git a/tokenizer/tokenizer.go b/tokenizer/tokenizer.go
new file mode 100644
index 0000000..78616c6
--- /dev/null
+++ b/tokenizer/tokenizer.go
@@ -0,0 +1,356 @@
+package tokenizer
+
+import (
+	"bufio"
+	goerr "errors"
+	"fmt"
+	"io"
+	"strconv"
+	"strings"
+	"unicode"
+
+	"usebox.net/lang/errors"
+	"usebox.net/lang/tokens"
+)
+
+type rTokenIdMap map[rune]tokens.TokenId
+type sTokenIdMap map[string]tokens.TokenId
+
+var (
+	keywords = sTokenIdMap{
+		"true":     tokens.True,
+		"false":    tokens.False,
+		"var":      tokens.Var,
+		"const":    tokens.Const,
+		"def":      tokens.Def,
+		"return":   tokens.Return,
+		"number":   tokens.TNumber,
+		"bool":     tokens.TBool,
+		"string":   tokens.TString,
+		"func":     tokens.TFunc,
+		"if":       tokens.If,
+		"else":     tokens.Else,
+		"for":      tokens.For,
+		"in":       tokens.In,
+		"continue": tokens.Continue,
+		"break":    tokens.Break,
+	}
+
+	doubleChar = map[rune]struct {
+		second rTokenIdMap
+	}{
+		'|': {rTokenIdMap{'|': tokens.Or}},
+		'&': {rTokenIdMap{'&': tokens.And}},
+		'=': {rTokenIdMap{'=': tokens.Eq}},
+		'!': {rTokenIdMap{'=': tokens.Ne, '?': tokens.TagE}},
+		'>': {rTokenIdMap{'=': tokens.Ge, '>': tokens.BitShr}},
+		'<': {rTokenIdMap{'=': tokens.Le, '<': tokens.BitShl}},
+	}
+
+	singleChar = rTokenIdMap{
+		'(': tokens.LParen,
+		')': tokens.RParen,
+		'+': tokens.Add,
+		'-': tokens.Sub,
+		'*': tokens.Mul,
+		'%': tokens.Mod,
+		'/': tokens.Div,
+		'>': tokens.Gt,
+		'<': tokens.Lt,
+		'!': tokens.Not,
+		'~': tokens.Neg,
+		'|': tokens.BitOr,
+		'&': tokens.BitAnd,
+		'^': tokens.BitXor,
+		'=': tokens.Assign,
+		'.': tokens.Dot,
+		';': tokens.Semicolon,
+		',': tokens.Comma,
+		'{': tokens.LBrace,
+		'}': tokens.RBrace,
+		'[': tokens.LBracket,
+		']': tokens.RBracket,
+		'?': tokens.TestE,
+	}
+)
+
+type Tokenizer struct {
+	input bufio.Reader
+
+	loc tokens.Location
+}
+
+// NewTokenizer returns a Tokenizer that reads from input, reporting
+// locations against filename starting at line 1, column 1.
+func NewTokenizer(filename string, input io.Reader) *Tokenizer {
+	return &Tokenizer{input: *bufio.NewReader(input), loc: tokens.Location{File: filename, Line: 1, Column: 1}}
+}
+
+func (t *Tokenizer) peek() (rune, error) {
+	r, _, err := t.input.ReadRune()
+	if err != nil {
+		// nothing was read, so there is nothing to unread
+		return rune(0), err
+	}
+	if err := t.input.UnreadRune(); err != nil {
+		return rune(0), err
+	}
+	return r, nil
+}
+
+func (t *Tokenizer) read() (rune, error) {
+	r, _, err := t.input.ReadRune()
+	if err != nil {
+		return rune(0), err
+	}
+	return r, nil
+}
+
+func (t *Tokenizer) unread() error {
+	return t.input.UnreadRune()
+}
+
+func (t *Tokenizer) skipWhitespace() (rune, error) {
+	var r rune
+	var err error
+loop:
+	for {
+		r, err = t.read()
+		if err != nil {
+			break
+		}
+		switch {
+		case !unicode.IsSpace(r):
+			break loop
+		case r == '\n':
+			t.loc.Eol()
+		default:
+			t.loc.Inc()
+		}
+	}
+	return r, err
+}
+
+func (t *Tokenizer) extract(filter func(rune) bool) (string, error) {
+	var buf strings.Builder
+	for {
+		r, err := t.peek()
+		// a read error other than EOF aborts the extraction
+		if err != nil && err != io.EOF {
+			return "", err
+		}
+		// EOF or end of extraction
+		if err == io.EOF || !filter(r) {
+			t.loc.Add(buf.Len())
+			return buf.String(), nil
+		}
+		t.read()
+		if _, err = buf.WriteRune(r); err != nil {
+			return "", err
+		}
+	}
+}
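+
+// extractEscaped consumes runes until the unescaped end rune is found,
+// decoding the escape sequences \n, \t, \\, \<end> and \xNN (two hex
+// digits) along the way.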
+func (t *Tokenizer) extractEscaped(end rune) (string, error) {
+	var buf strings.Builder
+	for {
+		r, err := t.read()
+		t.loc.Inc()
+		if r == end {
+			return buf.String(), nil
+		} else if err == io.EOF {
+			return "", fmt.Errorf("EOF found before closing '%c'", end)
+		} else if err != nil {
+			return "", err
+		} else {
+			if r == '\\' {
+				r, err = t.read()
+				if err == io.EOF {
+					return "", goerr.New("EOF found before completing escape sequence")
+				} else if err != nil {
+					return "", err
+				}
+				switch r {
+				case 'n':
+					r = '\n'
+				case 't':
+					r = '\t'
+				case '\\', end:
+				case 'x':
+					count := 0
+					value, err := t.extract(func(r rune) bool {
+						defer func() {
+							count++
+						}()
+						return unicode.In(r, unicode.ASCII_Hex_Digit) && count != 2
+					})
+					if err != nil {
+						return "", err
+					}
+					if len(value) != 2 {
+						return "", goerr.New("invalid escape sequence")
+					}
+					// base 16, bitSize 8: accepts the full \x00-\xff range
+					nval, err := strconv.ParseUint(value, 16, 8)
+					if err != nil {
+						return "", goerr.New("invalid escape sequence")
+					}
+					r = rune(byte(nval))
+				default:
+					return "", goerr.New("invalid escape sequence")
+				}
+				t.loc.Inc()
+			}
+		}
+		if _, err = buf.WriteRune(r); err != nil {
+			return "", err
+		}
+	}
+}
+
+func (t *Tokenizer) token() (tokens.Token, error) {
+	var tok tokens.Token
+	var r rune
+	var err error
+	var loc tokens.Location
+
+	for {
+		r, err = t.skipWhitespace()
+		loc = t.loc
+		if err == io.EOF {
+			return tokens.Token{Id: tokens.Eof, Loc: loc}, nil
+		} else if err != nil {
+			return tok, err
+		}
+
+		// comments
+		if r == '/' {
+			if p, err := t.peek(); err == nil && p == '/' {
+				for {
+					r, err = t.read()
+					if err != nil {
+						// could be EOF
+						break
+					}
+					if r == '\n' {
+						t.loc.Eol()
+						break
+					}
+				}
+				// check for whitespace
+				continue
+			}
+		}
+		// not a comment, skipped whitespace,
+		// now process this token
+		break
+	}
+
+	// identifier or keywords
+	if unicode.IsLetter(r) || r == '_' {
+		t.unread()
+		value, err := t.extract(func(r rune) bool {
+			return unicode.IsLetter(r) || unicode.IsDigit(r) || r == '_'
+		})
+		if err != nil {
+			return tok, errors.NewErrorWrap(errors.SyntaxError, t.loc, err, err.Error())
+		}
+		// match keywords
+		if tokenId, ok := keywords[value]; ok {
+			return tokens.Token{Id: tokenId, Loc: loc, Value: value}, nil
+		}
+		// otherwise it is an identifier
+		return tokens.Token{Id: tokens.Ident, Loc: loc, Value: value}, nil
+	}
+
+	// character literal
+	if r == '\'' {
+		t.loc.Inc()
+		value, err := t.extractEscaped('\'')
+		if err != nil {
+			return tok, errors.NewErrorWrap(errors.SyntaxError, t.loc, err, err.Error())
+		}
+		if len(value) != 1 {
+			return tok, errors.NewError(errors.SyntaxError, t.loc, "invalid character literal")
+		}
+		return tokens.Token{Id: tokens.Char, Loc: loc, Value: value}, nil
+	}
+
+	// numeric literal
+	if unicode.IsDigit(r) {
+		t.unread()
+		pos := 0
+		hex := false
+		bin := false
+		value, err := t.extract(func(r rune) bool {
+			if pos == 1 && r == 'x' {
+				hex = true
+				pos++
+				return true
+			}
+			if pos == 1 && r == 'b' {
+				bin = true
+				pos++
+				return true
+			}
+			if hex && unicode.In(r, unicode.ASCII_Hex_Digit) {
+				pos++
+				return true
+			}
+			if bin && r != '0' && r != '1' {
+				return false
+			}
+			pos++
+			return unicode.IsDigit(r)
+		})
+		if (hex || bin) && len(value) == 2 {
+			err = fmt.Errorf("invalid numeric format '%s'", value)
+		}
+		if err != nil {
+			return tok, errors.NewErrorWrap(errors.SyntaxError, t.loc, err, err.Error())
+		}
+		return tokens.Token{Id: tokens.Number, Loc: loc, Value: value}, nil
+	}
+
+	// string literal
+	if r == '"' {
+		t.loc.Inc()
+		value, err := t.extractEscaped('"')
+		if err != nil {
+			return tok, errors.NewErrorWrap(errors.SyntaxError, t.loc, err, err.Error())
+		}
+		return tokens.Token{Id: tokens.String, Loc: loc, Value: value}, nil
+	}
+
+	// two character symbols
+	if d, ok := doubleChar[r]; ok {
+		if p, err := t.peek(); err == nil {
+			if second, ok := d.second[p]; ok {
+				t.read()
+				t.loc.Add(2)
+				// FIXME: value
+				return tokens.Token{Id: second, Loc: loc, Value: string(r) + string(p)}, nil
+			}
+		}
+	}
+
+	// single character symbols
+	if tokenId, ok := singleChar[r]; ok {
+		t.loc.Inc()
+		return tokens.Token{Id: tokenId, Loc: loc, Value: string(r)}, nil
+	}
+
+	// invalid token
+	return tok, errors.NewError(errors.SyntaxError, t.loc, "invalid character:", fmt.Sprintf("'%c'", r))
+}
+
+// Scan tokenizes the whole input, returning all tokens ending with Eof,
+// or the first error found.
+func (t *Tokenizer) Scan() ([]tokens.Token, error) {
+	var ts []tokens.Token
+
+	for {
+		token, err := t.token()
+		if err != nil {
+			return nil, err
+		}
+		ts = append(ts, token)
+		if token.Id == tokens.Eof {
+			return ts, nil
+		}
+	}
+}
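For context, this is how the new API is driven end to end: construct a
Tokenizer from any io.Reader, then Scan until Eof. A minimal sketch (the
usebox.net/lang module path, the tokenizer package and the Token fields are
from this patch; the file name, the input program and the main wrapper are
illustrative):

package main

import (
	"fmt"
	"os"
	"strings"

	"usebox.net/lang/tokenizer"
)

func main() {
	// Any io.Reader works as input; "-" is the placeholder file name
	// also used by the tests below.
	tzr := tokenizer.NewTokenizer("-", strings.NewReader("var a = 0x10 >> 2;"))
	ts, err := tzr.Scan()
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	for _, tok := range ts {
		fmt.Printf("%v\n", tok) // Id, Loc and Value of each token, Eof last
	}
}
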
diff --git a/tokenizer/tokenizer_test.go b/tokenizer/tokenizer_test.go
new file mode 100644
index 0000000..47afebb
--- /dev/null
+++ b/tokenizer/tokenizer_test.go
@@ -0,0 +1,310 @@
+package tokenizer
+
+import (
+	"strings"
+	"testing"
+
+	"usebox.net/lang/tokens"
+)
+
+func TestSkipWhitespace(t *testing.T) {
+	tzr := NewTokenizer("-", strings.NewReader(" \t\r\n"))
+	tok, err := tzr.token()
+	if err != nil {
+		t.Errorf("unexpected error %s", err)
+	}
+	if tok.Id != tokens.Eof {
+		t.Errorf("Eof expected, got %s", tok)
+	}
+	if tok.Loc.Line != 2 {
+		t.Errorf("line == 2 expected, got %s", tok.Loc)
+	}
+	if tok.Loc.Column != 1 {
+		t.Errorf("column == 1 expected, got %s", tok.Loc)
+	}
+}
+
+func TestComments(t *testing.T) {
+	for _, tt := range []struct {
+		name  string
+		input string
+		line  int
+		col   int
+	}{
+		{"single comment", "// a comment\n", 2, 1},
+		{"ignore tokens", "// 1 + 2 is ignored\n", 2, 1},
+		{"skip whitespace", " // a comment\n", 2, 1},
+		{"comment to eof", "\n// comment to eof", 2, 1},
+		{"multiple comments", "// comment\n// another comment", 2, 1},
+		{"whitespace before comment", "\t// comment with whitespace\n\t// comment\n", 3, 1},
+		{"unicode", "// こんにちは\n", 2, 1},
+	} {
+		t.Run(tt.name, func(t *testing.T) {
+			tzr := NewTokenizer("-", strings.NewReader(tt.input))
+			tok, err := tzr.token()
+			if err != nil {
+				t.Errorf("unexpected error %s", err)
+			}
+			if tok.Id != tokens.Eof {
+				t.Errorf("Eof expected, got %s", tok)
+			}
+			if tok.Loc.Line != tt.line {
+				t.Errorf("line == %d expected, got %s", tt.line, tok.Loc)
+			}
+			if tok.Loc.Column != tt.col {
+				t.Errorf("column == %d expected, got %s", tt.col, tok.Loc)
+			}
+		})
+	}
+}
+
+func TestIdent(t *testing.T) {
+	for _, tt := range []string{
+		"ident",
+		"MyIdent",
+		"ident2",
+		"名前",
+		"__add",
+	} {
+		t.Run(tt, func(t *testing.T) {
+			tzr := NewTokenizer("-", strings.NewReader(tt))
+			tok, err := tzr.token()
+			if err != nil {
+				t.Errorf("unexpected error %s", err)
+			}
+			if tok.Id != tokens.Ident {
+				t.Errorf("Ident expected, got %s", tok)
+			}
+			if tok.Value != tt {
+				t.Errorf("value == %s expected, got %s", tt, tok.Value)
+			}
+		})
+	}
+}
+
+func TestMultipleCalls(t *testing.T) {
+	tzr := NewTokenizer("-", strings.NewReader("// comment\nident // with a comment\nanother\n"))
+	for _, tt := range []string{
+		"ident", "another",
+	} {
+		tok, err := tzr.token()
+		if err != nil {
+			t.Errorf("unexpected error %s", err)
+		}
+		if tok.Id != tokens.Ident {
+			t.Errorf("Ident expected, got %s", tok)
+		}
+		if tok.Value != tt {
+			t.Errorf("value == %s expected, got %s", tt, tok.Value)
+		}
+	}
+}
+func TestKeywords(t *testing.T) {
+	tzr := NewTokenizer("-", strings.NewReader("true false var const def return number bool string func if else for in continue break"))
+	for _, tt := range []tokens.TokenId{
+		tokens.True,
+		tokens.False,
+		tokens.Var,
+		tokens.Const,
+		tokens.Def,
+		tokens.Return,
+		tokens.TNumber,
+		tokens.TBool,
+		tokens.TString,
+		tokens.TFunc,
+		tokens.If,
+		tokens.Else,
+		tokens.For,
+		tokens.In,
+		tokens.Continue,
+		tokens.Break,
+	} {
+		tok, err := tzr.token()
+		if err != nil {
+			t.Errorf("unexpected error %s", err)
+		}
+		if tok.Id != tt {
+			t.Errorf("%s expected, got %s", tt, tok)
+		}
+	}
+}
+
+func TestNumber(t *testing.T) {
+	for _, tt := range []string{
+		"1234",
+		"0x4d2",
+		"0b10011010010",
+	} {
+		tzr := NewTokenizer("-", strings.NewReader(tt))
+		tok, err := tzr.token()
+		if err != nil {
+			t.Errorf("unexpected error %s", err)
+		}
+		if tok.Id != tokens.Number {
+			t.Errorf("Number expected, got %s", tok)
+		}
+		if tok.Value != tt {
+			t.Errorf("value == %s expected, got '%s'", tt, tok.Value)
+		}
+	}
+}
+
+func TestCharacter(t *testing.T) {
+	for tt, e := range map[string]string{
+		"'a'":     "a",
+		"'0'":     "0",
+		"'\\''":   "'",
+		"' '":     " ",
+		"'\\n'":   "\n",
+		"'\\x0d'": "\r",
+	} {
+		tzr := NewTokenizer("-", strings.NewReader(tt))
+		tok, err := tzr.token()
+		if err != nil {
+			t.Errorf("unexpected error %s", err)
+		}
+		if tok.Id != tokens.Char {
+			t.Errorf("Char expected, got %s", tok)
+		}
+		if tok.Value != e {
+			t.Errorf("value == %s expected, got '%s'", e, tok.Value)
+		}
+	}
+}
+
+func TestErrorCharacter(t *testing.T) {
+	for _, tt := range []string{
+		"'12'",
+		"''",
+		"'\\'",
+		"'A",
+		"'世'",
+		"'\\x0'",
+	} {
+		tzr := NewTokenizer("-", strings.NewReader(tt))
+		_, err := tzr.token()
+		if err == nil {
+			t.Errorf("expected error, didn't happen (input: %s)", tt)
+		}
+	}
+}
+
+func TestString(t *testing.T) {
+	for tt, v := range map[string]string{
+		"\"this is a string\"":       "this is a string",
+		"\"0.1234\"":                 "0.1234",
+		"\"\\\"escaped\\\" string\"": "\"escaped\" string",
+		"\"\\n\\x0d\\t\"":            "\n\r\t",
+		"\"Multiline\\nstring\"":     "Multiline\nstring",
+	} {
+		t.Run(tt, func(t *testing.T) {
+			tzr := NewTokenizer("-", strings.NewReader(tt))
+			tok, err := tzr.token()
+			if err != nil {
+				t.Errorf("unexpected error %s", err)
+			}
+			if tok.Id != tokens.String {
+				t.Errorf("String expected, got \"%s\"", tok)
+			}
+			if tok.Value != v {
+				t.Errorf("value == %s expected, got \"%s\"", v, tok.Value)
+			}
+		})
+	}
+}
+
+func TestErrorString(t *testing.T) {
+	tzr := NewTokenizer("-", strings.NewReader("\"string and EOF"))
+	_, err := tzr.token()
+	if err == nil {
+		t.Errorf("expected error, didn't happen")
+	}
+}
+func TestSingleChar(t *testing.T) {
+	tzr := NewTokenizer("-", strings.NewReader("{ } ( ) [ ] ; , + - * % / . = > < ! ~ | & ^ ?"))
+	for _, tt := range []tokens.TokenId{
+		tokens.LBrace,
+		tokens.RBrace,
+		tokens.LParen,
+		tokens.RParen,
+		tokens.LBracket,
+		tokens.RBracket,
+		tokens.Semicolon,
+		tokens.Comma,
+		tokens.Add,
+		tokens.Sub,
+		tokens.Mul,
+		tokens.Mod,
+		tokens.Div,
+		tokens.Dot,
+		tokens.Assign,
+		tokens.Gt,
+		tokens.Lt,
+		tokens.Not,
+		tokens.Neg,
+		tokens.BitOr,
+		tokens.BitAnd,
+		tokens.BitXor,
+		tokens.TestE,
+	} {
+		tok, err := tzr.token()
+		if err != nil {
+			t.Errorf("unexpected error %s", err)
+		}
+		if tok.Id != tt {
+			t.Errorf("%s expected, got %s", tt, tok)
+		}
+	}
+}
+
+func TestDoubleChar(t *testing.T) {
+	tzr := NewTokenizer("-", strings.NewReader("|| && == != >= <= >> << !?"))
+	for _, tt := range []tokens.TokenId{
+		tokens.Or,
+		tokens.And,
+		tokens.Eq,
+		tokens.Ne,
+		tokens.Ge,
+		tokens.Le,
+		tokens.BitShr,
+		tokens.BitShl,
+		tokens.TagE,
+	} {
+		tok, err := tzr.token()
+		if err != nil {
+			t.Errorf("unexpected error %s", err)
+		}
+		if tok.Id != tt {
+			t.Errorf("%s expected, got %s", tt, tok)
+		}
+	}
+}
+
+func TestScan(t *testing.T) {
+	for _, tt := range []struct {
+		name    string
+		input   string
+		ntokens int
+	}{
+		{"single line", "1 + 2", 4},
+		{"multiple lines", "1 + 2\nident", 5},
+		{"line starts with whitespace", "1 + 2\n\tident", 5},
+	} {
+		t.Run(tt.name, func(t *testing.T) {
+			tzr := NewTokenizer("-", strings.NewReader(tt.input))
+			ts, err := tzr.Scan()
+			if err != nil {
+				t.Errorf("unexpected error %s", err)
+			}
+			if len(ts) != tt.ntokens {
+				t.Errorf("%d tokens expected, got %d", tt.ntokens, len(ts))
+			}
+			last := ts[len(ts)-1]
+			if last.Id != tokens.Eof {
+				t.Errorf("last token expected to be Eof, got %s", last)
+			}
+		})
+	}
+}
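
One detail worth calling out from extractEscaped above: a \xNN escape must
supply exactly two hex digits, and parsing them with base 16 and bitSize 8 is
what confines the value to a single byte. A standalone sketch of that
conversion (parseHexEscape is a hypothetical helper for illustration, not
part of the patch):

package main

import (
	"errors"
	"fmt"
	"strconv"
)

// parseHexEscape converts the two hex digits of a \xNN escape into a
// single byte-valued rune, mirroring the checks in extractEscaped.
func parseHexEscape(digits string) (rune, error) {
	if len(digits) != 2 {
		return 0, errors.New("invalid escape sequence")
	}
	// base 16, bitSize 8: accepts 00-ff and rejects anything wider
	n, err := strconv.ParseUint(digits, 16, 8)
	if err != nil {
		return 0, errors.New("invalid escape sequence")
	}
	return rune(byte(n)), nil
}

func main() {
	r, _ := parseHexEscape("0d")
	fmt.Printf("%q\n", r) // prints '\r', matching the "'\\x0d'" test case
}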