mirror of
https://github.com/deuill/grawkit.git
synced 2024-09-28 08:22:46 +00:00
462 lines
9.8 KiB
Go
462 lines
9.8 KiB
Go
// Package lexer is an AWK lexer (tokenizer).
|
|
//
|
|
// The lexer turns a string of AWK source code into a stream of
|
|
// tokens for parsing.
|
|
//
|
|
// To tokenize some source, create a new lexer with NewLexer(src) and
|
|
// then call Scan() until the token type is EOF or ILLEGAL.
|
|
//
|
|
package lexer
|
|
|
|
import (
|
|
"fmt"
|
|
)
|
|
|
|
// Lexer tokenizes a byte string of AWK source code. Use NewLexer to
// actually create a lexer, and Scan() or ScanRegex() to get tokens.
type Lexer struct {
	src      []byte   // entire source being tokenized
	offset   int      // index into src one past the current char l.ch
	ch       byte     // current character (0 means end of input)
	pos      Position // position of the current character
	nextPos  Position // position of the character after l.ch
	hadSpace bool     // true if whitespace preceded the last-scanned token
	lastTok  Token    // most recently scanned token (consulted by scanRegex)
}
|
|
|
|
// Position stores the source line and column where a token starts.
// It is a small value type and is copied freely.
type Position struct {
	// Line number of the token (starts at 1).
	Line int

	// Column on the line (starts at 1). Note that this is the byte
	// offset into the line, not rune offset.
	Column int
}
|
|
|
|
// NewLexer creates a new lexer that will tokenize the given source
|
|
// code. See the module-level example for a working example.
|
|
func NewLexer(src []byte) *Lexer {
|
|
l := &Lexer{src: src}
|
|
l.nextPos.Line = 1
|
|
l.nextPos.Column = 1
|
|
l.next()
|
|
return l
|
|
}
|
|
|
|
// HadSpace returns true if the previously-scanned token had
// whitespace before it. Used by the parser because when calling a
// user-defined function the grammar doesn't allow a space between
// the function name and the left parenthesis.
func (l *Lexer) HadSpace() bool {
	return l.hadSpace
}
|
|
|
|
// Scan scans the next token and returns its position (line/column),
|
|
// token value (one of the uppercase token constants), and the
|
|
// string value of the token. For most tokens, the token value is
|
|
// empty. For NAME, NUMBER, STRING, and REGEX tokens, it's the
|
|
// token's value. For an ILLEGAL token, it's the error message.
|
|
func (l *Lexer) Scan() (Position, Token, string) {
|
|
pos, tok, val := l.scan()
|
|
l.lastTok = tok
|
|
return pos, tok, val
|
|
}
|
|
|
|
// Does the real work of scanning. Scan() wraps this to more easily
// set lastTok.
func (l *Lexer) scan() (Position, Token, string) {
	// Skip whitespace (except newline, which is a token)
	l.hadSpace = false
	for l.ch == ' ' || l.ch == '\t' || l.ch == '\r' || l.ch == '\\' {
		l.hadSpace = true
		if l.ch == '\\' {
			// Backslash line continuation: must be followed by \n,
			// optionally preceded by \r (Windows line endings).
			l.next()
			if l.ch == '\r' {
				l.next()
			}
			if l.ch != '\n' {
				return l.pos, ILLEGAL, "expected \\n after \\ line continuation"
			}
		}
		l.next()
	}
	if l.ch == '#' {
		// Skip comment till end of line
		l.next()
		for l.ch != '\n' && l.ch != 0 {
			l.next()
		}
	}
	if l.ch == 0 {
		// l.next() reached end of input
		return l.pos, EOF, ""
	}

	pos := l.pos
	tok := ILLEGAL
	val := ""

	// Consume the first character of the token up front; the cases
	// below examine l.ch as the lookahead character.
	ch := l.ch
	l.next()

	// Names: keywords and functions
	if isNameStart(ch) {
		// l.offset is one past l.ch, and ch was already consumed, so
		// the name starts at offset-2.
		start := l.offset - 2
		for isNameStart(l.ch) || isDigit(l.ch) {
			l.next()
		}
		name := string(l.src[start : l.offset-1])
		// NOTE: this tok deliberately shadows the outer tok; this
		// branch always returns from inside the block below.
		tok := KeywordToken(name)
		if tok == ILLEGAL {
			// Not a keyword, so it's a plain NAME whose val is the text.
			tok = NAME
			val = name
		}
		return pos, tok, val
	}

	// These are ordered by my guess at frequency of use. Should run
	// through a corpus of real AWK programs to determine actual
	// frequency.
	switch ch {
	case '$':
		tok = DOLLAR
	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.':
		// Avoid make/append and use l.offset directly for performance
		start := l.offset - 2
		gotDigit := false
		if ch != '.' {
			gotDigit = true
			for isDigit(l.ch) {
				l.next()
			}
			if l.ch == '.' {
				l.next()
			}
		}
		for isDigit(l.ch) {
			gotDigit = true
			l.next()
		}
		if !gotDigit {
			// A lone "." with no digit on either side.
			return l.pos, ILLEGAL, "expected digits"
		}
		if l.ch == 'e' || l.ch == 'E' {
			// Optional exponent part, eg: 1e6, 1E-2.
			l.next()
			gotSign := false
			if l.ch == '+' || l.ch == '-' {
				gotSign = true
				l.next()
			}
			gotDigit = false
			for isDigit(l.ch) {
				l.next()
				gotDigit = true
			}
			// Per awk/gawk, "1e" is allowed and parsed as "1 e" (with "e"
			// considered a variable). "1e+" is parsed as "1e + ...".
			if !gotDigit {
				if gotSign {
					l.unread() // unread the '+' or '-'
				}
				l.unread() // unread the 'e' or 'E'
			}
		}
		tok = NUMBER
		val = string(l.src[start : l.offset-1])
	case '{':
		tok = LBRACE
	case '}':
		tok = RBRACE
	case '=':
		tok = l.choice('=', ASSIGN, EQUALS)
	case '<':
		tok = l.choice('=', LESS, LTE)
	case '>':
		switch l.ch {
		case '=':
			l.next()
			tok = GTE
		case '>':
			l.next()
			tok = APPEND
		default:
			tok = GREATER
		}
	case '"', '\'':
		// Note: POSIX awk spec doesn't allow single-quoted strings,
		// but this helps without quoting, especially on Windows
		// where the shell quote character is " (double quote).
		chars := make([]byte, 0, 32) // most won't require heap allocation
		for l.ch != ch {
			c := l.ch
			if c == 0 {
				return l.pos, ILLEGAL, "didn't find end quote in string"
			}
			if c == '\r' || c == '\n' {
				return l.pos, ILLEGAL, "can't have newline in string"
			}
			if c != '\\' {
				// Normal, non-escaped character
				chars = append(chars, c)
				l.next()
				continue
			}
			// Escape sequence, skip over \ and process
			l.next()
			switch l.ch {
			case 'n':
				c = '\n'
				l.next()
			case 't':
				c = '\t'
				l.next()
			case 'r':
				c = '\r'
				l.next()
			case 'a':
				c = '\a'
				l.next()
			case 'b':
				c = '\b'
				l.next()
			case 'f':
				c = '\f'
				l.next()
			case 'v':
				c = '\v'
				l.next()
			case 'x':
				// Hex byte of one of two hex digits
				l.next()
				digit := hexDigit(l.ch)
				if digit < 0 {
					return l.pos, ILLEGAL, "1 or 2 hex digits expected"
				}
				c = byte(digit)
				l.next()
				digit = hexDigit(l.ch)
				if digit >= 0 {
					// Second hex digit is optional.
					c = c*16 + byte(digit)
					l.next()
				}
			case '0', '1', '2', '3', '4', '5', '6', '7':
				// Octal byte of 1-3 octal digits
				c = l.ch - '0'
				l.next()
				for i := 0; i < 2 && l.ch >= '0' && l.ch <= '7'; i++ {
					c = c*8 + l.ch - '0'
					l.next()
				}
			default:
				// Any other escape character is just the char
				// itself, eg: "\z" is just "z"
				c = l.ch
				l.next()
			}
			chars = append(chars, c)
		}
		// Skip the closing quote.
		l.next()
		tok = STRING
		val = string(chars)
	case '(':
		tok = LPAREN
	case ')':
		tok = RPAREN
	case ',':
		tok = COMMA
	case ';':
		tok = SEMICOLON
	case '+':
		switch l.ch {
		case '+':
			l.next()
			tok = INCR
		case '=':
			l.next()
			tok = ADD_ASSIGN
		default:
			tok = ADD
		}
	case '-':
		switch l.ch {
		case '-':
			l.next()
			tok = DECR
		case '=':
			l.next()
			tok = SUB_ASSIGN
		default:
			tok = SUB
		}
	case '*':
		switch l.ch {
		case '*':
			// "**" is treated the same as "^" (power).
			l.next()
			tok = l.choice('=', POW, POW_ASSIGN)
		case '=':
			l.next()
			tok = MUL_ASSIGN
		default:
			tok = MUL
		}
	case '/':
		tok = l.choice('=', DIV, DIV_ASSIGN)
	case '%':
		tok = l.choice('=', MOD, MOD_ASSIGN)
	case '[':
		tok = LBRACKET
	case ']':
		tok = RBRACKET
	case '\n':
		tok = NEWLINE
	case '^':
		tok = l.choice('=', POW, POW_ASSIGN)
	case '!':
		switch l.ch {
		case '=':
			l.next()
			tok = NOT_EQUALS
		case '~':
			l.next()
			tok = NOT_MATCH
		default:
			tok = NOT
		}
	case '~':
		tok = MATCH
	case '?':
		tok = QUESTION
	case ':':
		tok = COLON
	case '&':
		// "&&" is the only valid token starting with '&'.
		tok = l.choice('&', ILLEGAL, AND)
		if tok == ILLEGAL {
			return l.pos, ILLEGAL, "unexpected char after '&'"
		}
	case '|':
		tok = l.choice('|', PIPE, OR)
	default:
		tok = ILLEGAL
		val = "unexpected char"
	}
	return pos, tok, val
}
|
|
|
|
// ScanRegex parses an AWK regular expression in /slash/ syntax. The
|
|
// AWK grammar has somewhat special handling of regex tokens, so the
|
|
// parser can only call this after a DIV or DIV_ASSIGN token has just
|
|
// been scanned.
|
|
func (l *Lexer) ScanRegex() (Position, Token, string) {
|
|
pos, tok, val := l.scanRegex()
|
|
l.lastTok = tok
|
|
return pos, tok, val
|
|
}
|
|
|
|
// Does the real work of scanning a regex. ScanRegex() wraps this to
// more easily set lastTok.
func (l *Lexer) scanRegex() (Position, Token, string) {
	pos := l.pos
	chars := make([]byte, 0, 32) // most won't require heap allocation
	// The opening '/' (or '/=') was already consumed as a DIV or
	// DIV_ASSIGN token; back up the reported column to point at it.
	switch l.lastTok {
	case DIV:
		// Regex after '/' (the usual case)
		pos.Column -= 1
	case DIV_ASSIGN:
		// Regex after '/=' (happens when regex starts with '=')
		pos.Column -= 2
		chars = append(chars, '=')
	default:
		return l.pos, ILLEGAL, fmt.Sprintf("unexpected %s preceding regex", l.lastTok)
	}
	for l.ch != '/' {
		c := l.ch
		if c == 0 {
			return l.pos, ILLEGAL, "didn't find end slash in regex"
		}
		if c == '\r' || c == '\n' {
			return l.pos, ILLEGAL, "can't have newline in regex"
		}
		if c == '\\' {
			// Only `\/` is unescaped to `/`; any other escape keeps
			// its backslash so the regex engine can interpret it.
			l.next()
			if l.ch != '/' {
				chars = append(chars, '\\')
			}
			c = l.ch
		}
		chars = append(chars, c)
		l.next()
	}
	// Skip the closing '/'.
	l.next()
	return pos, REGEX, string(chars)
}
|
|
|
|
// Load the next character into l.ch (or 0 on end of input) and update
// line and column position. Maintains the invariant that l.offset is
// always one past the position of l.ch in l.src.
func (l *Lexer) next() {
	l.pos = l.nextPos
	if l.offset >= len(l.src) {
		// For last character, move offset 1 past the end as it
		// simplifies offset calculations in NAME and NUMBER
		if l.ch != 0 {
			l.ch = 0
			l.offset++
			l.nextPos.Column++
		}
		return
	}
	ch := l.src[l.offset]
	if ch == '\n' {
		l.nextPos.Line++
		l.nextPos.Column = 1
	} else if ch != '\r' {
		// '\r' doesn't advance the column, so positions come out the
		// same on files with Windows (\r\n) line endings.
		l.nextPos.Column++
	}
	l.ch = ch
	l.offset++
}
|
|
|
|
// Un-read the character just scanned (doesn't handle line boundaries).
// Relies on next()'s invariant that l.offset is one past l.ch, so
// after decrementing, the restored character is at l.offset-1.
func (l *Lexer) unread() {
	l.offset--
	l.pos.Column--
	l.nextPos.Column--
	l.ch = l.src[l.offset-1]
}
|
|
|
|
// Reports whether ch can begin an AWK name: an underscore or an
// ASCII letter (digits may appear only after the first character).
func isNameStart(ch byte) bool {
	switch {
	case ch == '_':
		return true
	case 'a' <= ch && ch <= 'z':
		return true
	case 'A' <= ch && ch <= 'Z':
		return true
	default:
		return false
	}
}
|
|
|
|
// Reports whether ch is an ASCII decimal digit.
func isDigit(ch byte) bool {
	switch ch {
	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
		return true
	}
	return false
}
|
|
|
|
// Return the hex digit 0-15 corresponding to the given ASCII byte,
// or -1 if it's not a valid hex digit.
func hexDigit(ch byte) int {
	if '0' <= ch && ch <= '9' {
		return int(ch - '0')
	}
	if 'a' <= ch && ch <= 'f' {
		return int(ch-'a') + 10
	}
	if 'A' <= ch && ch <= 'F' {
		return int(ch-'A') + 10
	}
	return -1
}
|
|
|
|
func (l *Lexer) choice(ch byte, one, two Token) Token {
|
|
if l.ch == ch {
|
|
l.next()
|
|
return two
|
|
}
|
|
return one
|
|
}
|
|
|
|
// PeekByte returns the next unscanned byte; used when parsing
// "getline lvalue" expressions. Returns 0 at end of input.
// It does not consume the byte or advance the position.
func (l *Lexer) PeekByte() byte {
	return l.ch
}
|