From 8bb321f8b032dfaeffbe3d1b8dfeb215c12d3642 Mon Sep 17 00:00:00 2001
From: "Juan J. Martinez"
Date: Mon, 18 Jul 2022 07:45:58 +0100
Subject: First public release

---
 tokenizer/tokenizer.go | 356 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 356 insertions(+)
 create mode 100644 tokenizer/tokenizer.go

diff --git a/tokenizer/tokenizer.go b/tokenizer/tokenizer.go
new file mode 100644
index 0000000..78616c6
--- /dev/null
+++ b/tokenizer/tokenizer.go
@@ -0,0 +1,356 @@
+package tokenizer
+
+import (
+	"bufio"
+	goerr "errors"
+	"fmt"
+	"io"
+	"strconv"
+	"strings"
+	"unicode"
+
+	"usebox.net/lang/errors"
+	"usebox.net/lang/tokens"
+)
+
+type rTokenIdMap map[rune]tokens.TokenId
+type sTokenIdMap map[string]tokens.TokenId
+
+var (
+	keywords = sTokenIdMap{
+		"true":     tokens.True,
+		"false":    tokens.False,
+		"var":      tokens.Var,
+		"const":    tokens.Const,
+		"def":      tokens.Def,
+		"return":   tokens.Return,
+		"number":   tokens.TNumber,
+		"bool":     tokens.TBool,
+		"string":   tokens.TString,
+		"func":     tokens.TFunc,
+		"if":       tokens.If,
+		"else":     tokens.Else,
+		"for":      tokens.For,
+		"in":       tokens.In,
+		"continue": tokens.Continue,
+		"break":    tokens.Break,
+	}
+
+	// first rune of a two-rune symbol -> accepted second runes
+	doubleChar = map[rune]struct {
+		second rTokenIdMap
+	}{
+		'|': {rTokenIdMap{'|': tokens.Or}},
+		'&': {rTokenIdMap{'&': tokens.And}},
+		'=': {rTokenIdMap{'=': tokens.Eq}},
+		'!': {rTokenIdMap{'=': tokens.Ne, '?': tokens.TagE}},
+		'>': {rTokenIdMap{'=': tokens.Ge, '>': tokens.BitShr}},
+		'<': {rTokenIdMap{'=': tokens.Le, '<': tokens.BitShl}},
+	}
+
+	singleChar = rTokenIdMap{
+		'(': tokens.LParen,
+		')': tokens.RParen,
+		'+': tokens.Add,
+		'-': tokens.Sub,
+		'*': tokens.Mul,
+		'%': tokens.Mod,
+		'/': tokens.Div,
+		'>': tokens.Gt,
+		'<': tokens.Lt,
+		'!': tokens.Not,
+		'~': tokens.Neg,
+		'|': tokens.BitOr,
+		'&': tokens.BitAnd,
+		'^': tokens.BitXor,
+		'=': tokens.Assign,
+		'.': tokens.Dot,
+		';': tokens.Semicolon,
+		',': tokens.Comma,
+		'{': tokens.LBrace,
+		'}': tokens.RBrace,
+		'[': tokens.LBracket,
+		']': tokens.RBracket,
+		'?': tokens.TestE,
+	}
+)
+
+type Tokenizer struct {
+	input bufio.Reader
+
+	loc tokens.Location
+}
+
+func NewTokenizer(filename string, input io.Reader) *Tokenizer {
+	return &Tokenizer{input: *bufio.NewReader(input), loc: tokens.Location{File: filename, Line: 1, Column: 1}}
+}
+
+// peek returns the next rune without consuming it.
+func (t *Tokenizer) peek() (rune, error) {
+	r, _, err := t.input.ReadRune()
+	if err != nil {
+		return rune(0), err
+	}
+	if err := t.input.UnreadRune(); err != nil {
+		return rune(0), err
+	}
+	return r, nil
+}
+
+func (t *Tokenizer) read() (rune, error) {
+	r, _, err := t.input.ReadRune()
+	if err != nil {
+		return rune(0), err
+	}
+	return r, nil
+}
+
+func (t *Tokenizer) unread() error {
+	return t.input.UnreadRune()
+}
+
+// skipWhitespace consumes whitespace updating the location, and returns
+// the first non-space rune found.
+func (t *Tokenizer) skipWhitespace() (rune, error) {
+	var r rune
+	var err error
+loop:
+	for {
+		r, err = t.read()
+		if err != nil {
+			break
+		}
+		switch {
+		case !unicode.IsSpace(r):
+			break loop
+		case r == '\n':
+			t.loc.Eol()
+		default:
+			t.loc.Inc()
+		}
+	}
+	return r, err
+}
+
+// extract consumes runes as long as filter accepts them and returns them
+// as a string; the first rejected rune (or EOF) is left unconsumed.
+func (t *Tokenizer) extract(filter func(rune) bool) (string, error) {
+	var buf strings.Builder
+	for {
+		r, err := t.peek()
+		if err != nil && err != io.EOF {
+			return "", err
+		}
+		// EOF or end of extraction
+		if err == io.EOF || !filter(r) {
+			t.loc.Add(buf.Len())
+			return buf.String(), nil
+		}
+		t.read()
+		if _, err = buf.WriteRune(r); err != nil {
+			return "", err
+		}
+	}
+}
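+
+// Example: with input "abc123;" and a filter that accepts letters and
+// digits, extract returns "abc123" and leaves the reader at ';' (the
+// terminating rune is only peeked, never consumed).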
+
+// extractEscaped consumes runes until the unescaped end rune is found,
+// decoding the escape sequences \n, \t, \\, \<end> and \xNN.
+func (t *Tokenizer) extractEscaped(end rune) (string, error) {
+	var buf strings.Builder
+
+	for {
+		r, err := t.read()
+		t.loc.Inc()
+		if r == end {
+			return buf.String(), nil
+		} else if err == io.EOF {
+			return "", fmt.Errorf("EOF found before closing '%c'", end)
+		} else if err != nil {
+			return "", err
+		} else {
+			if r == '\\' {
+				r, err = t.read()
+				if err == io.EOF {
+					return "", goerr.New("EOF found before completing escape sequence")
+				} else if err != nil {
+					return "", err
+				}
+				switch r {
+				case 'n':
+					r = '\n'
+				case 't':
+					r = '\t'
+				case '\\', end:
+					// keep the rune as-is
+				case 'x':
+					// take up to two hex digits
+					count := 0
+					value, err := t.extract(func(r rune) bool {
+						defer func() {
+							count++
+						}()
+						return unicode.In(r, unicode.ASCII_Hex_Digit) && count != 2
+					})
+					if err != nil {
+						return "", err
+					}
+					if len(value) != 2 {
+						return "", goerr.New("invalid escape sequence")
+					}
+					// parse as unsigned so values over 0x7f (e.g. \xff) are accepted
+					nval, err := strconv.ParseUint(value, 16, 8)
+					if err != nil {
+						return "", goerr.New("invalid escape sequence")
+					}
+					r = rune(byte(nval))
+				default:
+					return "", goerr.New("invalid escape sequence")
+				}
+				t.loc.Inc()
+			}
+		}
+		if _, err = buf.WriteRune(r); err != nil {
+			return "", err
+		}
+	}
+}
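+
+// Example: extractEscaped('"') on the remaining input a\x41\n" yields the
+// three-rune string "aA\n": \x41 decodes to 'A' and \n to a newline; the
+// closing '"' is consumed but not included in the result.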
+
+// token scans and returns the next token.
+func (t *Tokenizer) token() (tokens.Token, error) {
+	var tok tokens.Token
+	var r rune
+	var err error
+	var loc tokens.Location
+
+	for {
+		r, err = t.skipWhitespace()
+		loc = t.loc
+		if err == io.EOF {
+			return tokens.Token{Id: tokens.Eof, Loc: loc}, nil
+		} else if err != nil {
+			return tok, err
+		}
+
+		// comments
+		if r == '/' {
+			if p, err := t.peek(); err == nil && p == '/' {
+				// skip to the end of the line
+				for {
+					r, err = t.read()
+					if err != nil {
+						// could be EOF
+						break
+					}
+					if r == '\n' {
+						t.loc.Eol()
+						break
+					}
+				}
+				// check for whitespace again
+				continue
+			}
+		}
+		// not a comment, skipped whitespace,
+		// now process this token
+		break
+	}
+
+	// identifier or keyword
+	if unicode.IsLetter(r) || r == '_' {
+		t.unread()
+		value, err := t.extract(func(r rune) bool {
+			return unicode.IsLetter(r) || unicode.IsDigit(r) || r == '_'
+		})
+		if err != nil {
+			return tok, errors.NewErrorWrap(errors.SyntaxError, t.loc, err, err.Error())
+		}
+		// match keywords
+		if tokenId, ok := keywords[value]; ok {
+			return tokens.Token{Id: tokenId, Loc: loc, Value: value}, nil
+		}
+		// otherwise it is an identifier
+		return tokens.Token{Id: tokens.Ident, Loc: loc, Value: value}, nil
+	}
+
+	// character literal
+	if r == '\'' {
+		t.loc.Inc()
+		value, err := t.extractEscaped('\'')
+		if err != nil {
+			return tok, errors.NewErrorWrap(errors.SyntaxError, t.loc, err, err.Error())
+		}
+		if len(value) != 1 {
+			return tok, errors.NewError(errors.SyntaxError, t.loc, "invalid character literal")
+		}
+		return tokens.Token{Id: tokens.Char, Loc: loc, Value: value}, nil
+	}
+
+	// numeric literal (decimal, 0x hexadecimal or 0b binary)
+	if unicode.IsDigit(r) {
+		t.unread()
+		pos := 0
+		hex := false
+		bin := false
+		value, err := t.extract(func(r rune) bool {
+			if pos == 1 && r == 'x' {
+				hex = true
+				pos++
+				return true
+			}
+			if pos == 1 && r == 'b' {
+				bin = true
+				pos++
+				return true
+			}
+			if hex && unicode.In(r, unicode.ASCII_Hex_Digit) {
+				pos++
+				return true
+			}
+			if bin && r != '0' && r != '1' {
+				return false
+			}
+			pos++
+			return unicode.IsDigit(r)
+		})
+		// "0x" or "0b" without digits is not a valid number
+		if (hex || bin) && len(value) == 2 {
+			err = fmt.Errorf("invalid numeric format '%s'", value)
+		}
+		if err != nil {
+			return tok, errors.NewErrorWrap(errors.SyntaxError, t.loc, err, err.Error())
+		}
+		return tokens.Token{Id: tokens.Number, Loc: loc, Value: value}, nil
+	}
+
+	// string literal
+	if r == '"' {
+		t.loc.Inc()
+		value, err := t.extractEscaped('"')
+		if err != nil {
+			return tok, errors.NewErrorWrap(errors.SyntaxError, t.loc, err, err.Error())
+		}
+		return tokens.Token{Id: tokens.String, Loc: loc, Value: value}, nil
+	}
+
+	// two-character symbols
+	if d, ok := doubleChar[r]; ok {
+		if p, err := t.peek(); err == nil {
+			if second, ok := d.second[p]; ok {
+				t.read()
+				t.loc.Add(2)
+				// FIXME: value
+				return tokens.Token{Id: second, Loc: loc, Value: string(r) + string(p)}, nil
+			}
+		}
+	}
+
+	// single-character symbols
+	if tokenId, ok := singleChar[r]; ok {
+		t.loc.Inc()
+		return tokens.Token{Id: tokenId, Loc: loc, Value: string(r)}, nil
+	}
+
+	// invalid token
+	return tok, errors.NewError(errors.SyntaxError, t.loc, "invalid character:", fmt.Sprintf("'%c'", r))
+}
+
+// Scan tokenizes the whole input, returning the tokens ending with Eof.
+func (t *Tokenizer) Scan() ([]tokens.Token, error) {
+	var ts []tokens.Token
+
+	for {
+		token, err := t.token()
+		if err != nil {
+			return nil, err
+		}
+		ts = append(ts, token)
+		if token.Id == tokens.Eof {
+			return ts, nil
+		}
+	}
+}
--
cgit v1.2.3
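
A minimal driver for the tokenizer introduced by this commit might look like the
sketch below. It assumes the import path usebox.net/lang/tokenizer matching the
tree above; the source string and the "example" file name are made up for
illustration. NewTokenizer and Scan are the only entry points used, with the
signatures shown in the patch.

    package main

    import (
    	"fmt"
    	"os"
    	"strings"

    	"usebox.net/lang/tokenizer"
    )

    func main() {
    	// hypothetical input; any io.Reader works
    	src := "var answer = 0x2a; // the answer"

    	t := tokenizer.NewTokenizer("example", strings.NewReader(src))
    	toks, err := t.Scan()
    	if err != nil {
    		fmt.Fprintln(os.Stderr, err)
    		os.Exit(1)
    	}
    	// print each token, ending with the Eof token
    	for _, tok := range toks {
    		fmt.Printf("%+v\n", tok)
    	}
    }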