package tokenizer

import (
	"bufio"
	goerr "errors"
	"fmt"
	"io"
	"strconv"
	"strings"
	"unicode"
	"unicode/utf8"

	"usebox.net/micro-lang/errors"
	"usebox.net/micro-lang/tokens"
)

type rTokenIdMap map[rune]tokens.TokenId
type sTokenIdMap map[string]tokens.TokenId

var (
	keywords = sTokenIdMap{
		"true":     tokens.True,
		"false":    tokens.False,
		"var":      tokens.Var,
		"const":    tokens.Const,
		"def":      tokens.Def,
		"return":   tokens.Return,
		"number":   tokens.TNumber,
		"bool":     tokens.TBool,
		"string":   tokens.TString,
		"func":     tokens.TFunc,
		"if":       tokens.If,
		"else":     tokens.Else,
		"for":      tokens.For,
		"in":       tokens.In,
		"continue": tokens.Continue,
		"break":    tokens.Break,
	}

	// first rune of a two-character symbol -> accepted second runes
	doubleChar = map[rune]struct {
		second rTokenIdMap
	}{
		'|': {rTokenIdMap{'|': tokens.Or}},
		'&': {rTokenIdMap{'&': tokens.And}},
		'=': {rTokenIdMap{'=': tokens.Eq}},
		'!': {rTokenIdMap{'=': tokens.Ne, '?': tokens.TagE}},
		'>': {rTokenIdMap{'=': tokens.Ge, '>': tokens.BitShr}},
		'<': {rTokenIdMap{'=': tokens.Le, '<': tokens.BitShl}},
	}

	singleChar = rTokenIdMap{
		'(': tokens.LParen,
		')': tokens.RParen,
		'+': tokens.Add,
		'-': tokens.Sub,
		'*': tokens.Mul,
		'%': tokens.Mod,
		'/': tokens.Div,
		'>': tokens.Gt,
		'<': tokens.Lt,
		'!': tokens.Not,
		'~': tokens.Neg,
		'|': tokens.BitOr,
		'&': tokens.BitAnd,
		'^': tokens.BitXor,
		'=': tokens.Assign,
		'.': tokens.Dot,
		';': tokens.Semicolon,
		',': tokens.Comma,
		'{': tokens.LBrace,
		'}': tokens.RBrace,
		'[': tokens.LBracket,
		']': tokens.RBracket,
		'?': tokens.TestE,
	}
)

// Tokenizer turns an input stream into tokens, tracking the current
// location for error reporting.
type Tokenizer struct {
	input bufio.Reader
	loc   tokens.Location
}

func NewTokenizer(filename string, input io.Reader) *Tokenizer {
	return &Tokenizer{
		input: *bufio.NewReader(input),
		loc:   tokens.Location{File: filename, Line: 1, Column: 1},
	}
}

// peek returns the next rune without consuming it.
func (t *Tokenizer) peek() (rune, error) {
	r, _, err := t.input.ReadRune()
	if err != nil {
		t.input.UnreadRune()
		return rune(0), err
	}
	if err := t.input.UnreadRune(); err != nil {
		return rune(0), err
	}
	return r, nil
}

// read consumes and returns the next rune.
func (t *Tokenizer) read() (rune, error) {
	r, _, err := t.input.ReadRune()
	if err != nil {
		return rune(0), err
	}
	return r, nil
}

func (t *Tokenizer) unread() error {
	return t.input.UnreadRune()
}

// skipWhitespace consumes whitespace, updating the location, and returns
// the first non-space rune.
func (t *Tokenizer) skipWhitespace() (rune, error) {
	var r rune
	var err error
loop:
	for {
		r, err = t.read()
		if err != nil {
			break
		}
		switch {
		case !unicode.IsSpace(r):
			break loop
		case r == '\n':
			t.loc.Eol()
		default:
			t.loc.Inc()
		}
	}
	return r, err
}

// extract consumes and collects runes for as long as filter accepts them.
func (t *Tokenizer) extract(filter func(rune) bool) (string, error) {
	var buf strings.Builder
	for {
		r, err := t.peek()
		if err != nil && err != io.EOF {
			return "", err
		}
		// EOF or end of extraction
		if err == io.EOF || !filter(r) {
			// advance the column by runes, not bytes
			t.loc.Add(utf8.RuneCountInString(buf.String()))
			return buf.String(), nil
		}
		t.read()
		if _, err = buf.WriteRune(r); err != nil {
			return "", err
		}
	}
}

// extractEscaped consumes runes until the closing end rune, resolving
// backslash escape sequences (\n, \t, \\, \<end> and \xHH).
func (t *Tokenizer) extractEscaped(end rune) (string, error) {
	var buf strings.Builder
	for {
		r, err := t.read()
		t.loc.Inc()
		if r == end {
			return buf.String(), nil
		} else if err == io.EOF {
			return "", fmt.Errorf("EOF found before closing '%c'", end)
		} else if err != nil {
			return "", err
		} else if r == '\\' {
			r, err = t.read()
			if err == io.EOF {
				return "", goerr.New("EOF found before completing escape sequence")
			} else if err != nil {
				return "", err
			}
			switch r {
			case 'n':
				r = '\n'
			case 't':
				r = '\t'
			case '\\', end:
			case 'x':
				// \xHH: exactly two hex digits
				count := 0
				value, err := t.extract(func(r rune) bool {
					defer func() { count++ }()
					return unicode.In(r, unicode.ASCII_Hex_Digit) && count != 2
				})
				if err != nil {
					return "", err
				}
				if len(value) != 2 {
					return "", goerr.New("invalid escape sequence")
				}
				// unsigned parse, so \x80 to \xff are valid
				nval, err := strconv.ParseUint(value, 16, 8)
				if err != nil {
					return "", goerr.New("invalid escape sequence")
				}
				r = rune(byte(nval))
			default:
				return "", goerr.New("invalid escape sequence")
			}
			t.loc.Inc()
		}
		if _, err = buf.WriteRune(r); err != nil {
			return "", err
		}
	}
}
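// Worked example of the escape handling above (illustrative, not part of
// the original source): for the input a\tb\x21" the call
// extractEscaped('"') returns "a", a tab, "b" and "!" (0x21), consuming
// the closing quote and advancing the location once per consumed rune.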
// token scans and returns the next token in the input.
func (t *Tokenizer) token() (tokens.Token, error) {
	var tok tokens.Token
	var r rune
	var err error
	var loc tokens.Location

	for {
		r, err = t.skipWhitespace()
		loc = t.loc
		if err == io.EOF {
			return tokens.Token{Id: tokens.Eof, Loc: loc}, nil
		} else if err != nil {
			return tok, err
		}

		// comments
		if r == '/' {
			if p, err := t.peek(); err == nil && p == '/' {
				for {
					r, err = t.read()
					if err != nil {
						// could be EOF
						break
					}
					if r == '\n' {
						t.loc.Eol()
						break
					}
				}
				// check for whitespace
				continue
			}
		}

		// not a comment, skipped whitespace,
		// now process this token
		break
	}

	// identifier or keywords
	if unicode.IsLetter(r) || r == '_' {
		t.unread()
		value, err := t.extract(func(r rune) bool {
			return unicode.IsLetter(r) || unicode.IsDigit(r) || r == '_'
		})
		if err != nil {
			return tok, errors.NewErrorWrap(errors.SyntaxError, t.loc, err, err.Error())
		}

		// match keywords
		if tokenId, ok := keywords[value]; ok {
			return tokens.Token{Id: tokenId, Loc: loc, Value: value}, nil
		}

		// otherwise it is an identifier
		return tokens.Token{Id: tokens.Ident, Loc: loc, Value: value}, nil
	}

	// character literal
	if r == '\'' {
		t.loc.Inc()
		value, err := t.extractEscaped('\'')
		if err != nil {
			return tok, errors.NewErrorWrap(errors.SyntaxError, t.loc, err, err.Error())
		}
		if len(value) != 1 {
			return tok, errors.NewError(errors.SyntaxError, t.loc, "invalid character literal")
		}
		return tokens.Token{Id: tokens.Char, Loc: loc, Value: value}, nil
	}

	// numeric literal
	if unicode.IsDigit(r) {
		t.unread()
		pos := 0
		hex := false
		bin := false
		value, err := t.extract(func(r rune) bool {
			// accept a 0x / 0b style prefix in the second position
			if pos == 1 && r == 'x' {
				hex = true
				pos++
				return true
			}
			if pos == 1 && r == 'b' {
				bin = true
				pos++
				return true
			}
			if hex && unicode.In(r, unicode.ASCII_Hex_Digit) {
				pos++
				return true
			}
			if bin && r != '0' && r != '1' {
				return false
			}
			pos++
			return unicode.IsDigit(r)
		})
		// a bare "0x" or "0b" prefix is not a number
		if (hex || bin) && len(value) == 2 {
			err = fmt.Errorf("invalid numeric format '%s'", value)
		}
		if err != nil {
			return tok, errors.NewErrorWrap(errors.SyntaxError, t.loc, err, err.Error())
		}
		return tokens.Token{Id: tokens.Number, Loc: loc, Value: value}, nil
	}

	// string literal
	if r == '"' {
		t.loc.Inc()
		value, err := t.extractEscaped('"')
		if err != nil {
			return tok, errors.NewErrorWrap(errors.SyntaxError, t.loc, err, err.Error())
		}
		return tokens.Token{Id: tokens.String, Loc: loc, Value: value}, nil
	}

	// two character symbols
	if d, ok := doubleChar[r]; ok {
		if p, err := t.peek(); err == nil {
			if second, ok := d.second[p]; ok {
				t.read()
				t.loc.Add(2)
				// FIXME: value
				return tokens.Token{Id: second, Loc: loc, Value: string(r) + string(p)}, nil
			}
		}
	}

	// single character symbols
	if tokenId, ok := singleChar[r]; ok {
		t.loc.Inc()
		return tokens.Token{Id: tokenId, Loc: loc, Value: string(r)}, nil
	}

	// invalid token
	return tok, errors.NewError(errors.SyntaxError, t.loc, "invalid character:", fmt.Sprintf("'%c'", r))
}

// Scan tokenizes the whole input, returning the tokens ending with an Eof
// token, or the first error found.
func (t *Tokenizer) Scan() ([]tokens.Token, error) {
	var ts []tokens.Token
	for {
		token, err := t.token()
		if err != nil {
			return nil, err
		}
		ts = append(ts, token)
		if token.Id == tokens.Eof {
			return ts, nil
		}
	}
}
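// Usage sketch (the filename and source below are hypothetical; everything
// else is the API defined in this file):
//
//	tk := NewTokenizer("example.src", strings.NewReader("var x = 0x1f;"))
//	ts, err := tk.Scan()
//	if err != nil {
//		// a syntax error carrying the location where it was found
//	}
//	// ts: Var "var", Ident "x", Assign "=", Number "0x1f",
//	//     Semicolon ";", Eof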