Peeking lexer optimizations (#263)
* Provide PeekingLexer.Checkpoint as an allocation-free Clone alternative

The state itself is still in private fields, so exposing Checkpoint as a
public field doesn't break encapsulation.

* Remove PeekingLexer.Clone and avoid an allocation when branching

Roughly 5% less CPU time and 12% fewer allocations in the Thrift
benchmark with the generated lexer.

* Optimize PeekingLexer to provide constant-time Peek

The attached benchmark is over 3x faster after the optimization.

* Add methods to create and restore PeekingLexer checkpoints
petee-d authored Dec 3, 2022
1 parent e748387 commit 5adbb7c
Showing 5 changed files with 85 additions and 60 deletions.
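
Taken together, these changes replace the old allocate-and-copy Clone() with a
cheap copy-in/copy-out of a small cursor struct. A minimal caller-side sketch of
the new API (not part of the commit; it assumes participle v2's MustSimple,
Upgrade and the methods added below):

package main

import (
    "fmt"
    "strings"

    "github.com/alecthomas/participle/v2/lexer"
)

func main() {
    def := lexer.MustSimple([]lexer.SimpleRule{
        {"Ident", `\w+`},
        {"Whitespace", `\s+`},
    })
    lex, err := def.Lex("", strings.NewReader("first second"))
    if err != nil {
        panic(err)
    }
    // Elide whitespace, as the tests in this commit do.
    plex, err := lexer.Upgrade(lex, def.Symbols()["Whitespace"])
    if err != nil {
        panic(err)
    }
    cp := plex.MakeCheckpoint()    // plain struct copy, no heap allocation
    fmt.Println(plex.Next().Value) // first
    plex.LoadCheckpoint(cp)        // rewind; only the three cursor fields move back
    fmt.Println(plex.Peek().Value) // first again
}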

context.go: 5 changes (2 additions, 3 deletions)

@@ -18,7 +18,7 @@ type contextFieldSet struct {
 
 // Context for a single parse.
 type parseContext struct {
-    *lexer.PeekingLexer
+    lexer.PeekingLexer
     depth        int
     trace        io.Writer
     deepestError error

@@ -31,7 +31,7 @@ type parseContext struct {
 
 func newParseContext(lex *lexer.PeekingLexer, lookahead int, caseInsensitive map[lexer.TokenType]bool) *parseContext {
     return &parseContext{
-        PeekingLexer:    lex,
+        PeekingLexer:    *lex,
         caseInsensitive: caseInsensitive,
         lookahead:       lookahead,
     }

@@ -78,7 +78,6 @@ func (p *parseContext) Branch() *parseContext {
     branch := &parseContext{}
     *branch = *p
     branch.apply = nil
-    branch.PeekingLexer = p.PeekingLexer.Clone()
     return branch
 }

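This is the allocation win from the second commit-message bullet: parseContext now
embeds PeekingLexer by value, so Branch's existing struct copy (*branch = *p) also
duplicates the lexer state, and the separate Clone() heap allocation disappears. A
self-contained sketch of the value-embedding pattern (hypothetical names, not
commit code):

package main

import "fmt"

// cursors stands in for PeekingLexer's mutable state.
type cursors struct{ cursor int }

// context embeds cursors by value, as parseContext now embeds PeekingLexer.
type context struct {
    cursors
    tokens []string // large shared data; a copy shares the backing array
}

func main() {
    parent := context{tokens: []string{"a", "b", "c"}}
    branch := parent  // one struct copy, no heap allocation
    branch.cursor = 2 // advancing the branch...
    fmt.Println(parent.cursor, branch.cursor) // 0 2 ...leaves the parent alone
}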

lexer/peek.go: 107 changes (57 additions, 50 deletions)
@@ -2,16 +2,23 @@ package lexer
 
 // PeekingLexer supports arbitrary lookahead as well as cloning.
 type PeekingLexer struct {
-    rawCursor RawCursor
-    cursor    int
-    eof       Token
-    tokens    []Token
-    elide     map[TokenType]bool
+    Checkpoint
+    tokens []Token
+    elide  map[TokenType]bool
 }
 
 // RawCursor index in the token stream.
 type RawCursor int
 
+// Checkpoint wraps the mutable state of the PeekingLexer.
+//
+// Copying and restoring just this state is a bit faster than copying the entire PeekingLexer.
+type Checkpoint struct {
+    rawCursor  RawCursor // The raw position of the next possibly elided token
+    nextCursor RawCursor // The raw position of the next non-elided token
+    cursor     int       // Index of the next non-elided token among other non-elided tokens
+}
+
 // Upgrade a Lexer to a PeekingLexer with arbitrary lookahead.
 //
 // "elide" is a slice of token types to elide from processing.

@@ -27,12 +34,12 @@ func Upgrade(lex Lexer, elide ...TokenType) (*PeekingLexer, error) {
         if err != nil {
             return r, err
         }
+        r.tokens = append(r.tokens, t)
         if t.EOF() {
-            r.eof = t
             break
         }
-        r.tokens = append(r.tokens, t)
     }
+    r.advanceToNonElided()
     return r, nil
 }
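
Note the reordering inside Upgrade: the EOF token is now appended to tokens before
the loop breaks, and the old eof field is gone. The slice therefore always ends
with an EOF sentinel, which is what lets Peek, RawPeek and advanceToNonElided
below index tokens unconditionally. A generic sketch of the sentinel idea
(illustration only, not commit code):

package main

import "fmt"

// firstEven returns the index of the first even value. Appending a sentinel that
// always matches lets the scan drop its bounds check, like the EOF token above.
func firstEven(values []int) int {
    values = append(values, 0) // sentinel: 0 is even, so the loop must stop
    i := 0
    for values[i]%2 != 0 {
        i++
    }
    return i
}

func main() {
    fmt.Println(firstEven([]int{1, 3, 4})) // 2
    fmt.Println(firstEven([]int{1, 3, 5})) // 3: the sentinel position, i.e. "not found"
}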

@@ -42,39 +49,48 @@ func (p *PeekingLexer) Range(rawStart, rawEnd RawCursor) []Token {
 }
 
 // Cursor position in tokens, excluding elided tokens.
-func (p *PeekingLexer) Cursor() int {
-    return p.cursor
+func (c Checkpoint) Cursor() int {
+    return c.cursor
 }
 
 // RawCursor position in tokens, including elided tokens.
-func (p *PeekingLexer) RawCursor() RawCursor {
-    return p.rawCursor
+func (c Checkpoint) RawCursor() RawCursor {
+    return c.rawCursor
 }
 
 // Next consumes and returns the next token.
 func (p *PeekingLexer) Next() *Token {
-    for int(p.rawCursor) < len(p.tokens) {
-        t := &p.tokens[p.rawCursor]
-        p.rawCursor++
-        if p.elide[t.Type] {
-            continue
-        }
-        p.cursor++
-        return t
-    }
-    return &p.eof
+    t := &p.tokens[p.nextCursor]
+    if t.EOF() {
+        return t
+    }
+    p.nextCursor++
+    p.rawCursor = p.nextCursor
+    p.cursor++
+    p.advanceToNonElided()
+    return t
 }
 
-// Peek ahead at the next token.
+// Peek ahead at the next non-elided token.
 func (p *PeekingLexer) Peek() *Token {
-    for i := int(p.rawCursor); i < len(p.tokens); i++ {
-        t := &p.tokens[i]
-        if p.elide[t.Type] {
-            continue
-        }
-        return t
-    }
-    return &p.eof
+    return &p.tokens[p.nextCursor]
+}
+
+// RawPeek peeks ahead at the next raw token.
+//
+// Unlike Peek, this will include elided tokens.
+func (p *PeekingLexer) RawPeek() *Token {
+    return &p.tokens[p.rawCursor]
+}
+
+// advanceToNonElided advances nextCursor to the closest non-elided token
+func (p *PeekingLexer) advanceToNonElided() {
+    for ; ; p.nextCursor++ {
+        t := &p.tokens[p.nextCursor]
+        if t.EOF() || !p.elide[t.Type] {
+            return
+        }
+    }
+}
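
This hunk delivers the constant-time Peek from the third commit-message bullet:
the elide-skipping loop that previously ran on every Peek now runs at most once
per Next (in advanceToNonElided), leaving Peek as a single slice index. A worked
trace of the three cursor fields, derived from the code above (token values
assumed for illustration):

// Token stream: [0] Ident "a", [1] Whitespace " " (elided), [2] Ident "b", [3] EOF
//
// after Upgrade: rawCursor=0, nextCursor=0, cursor=0    Peek() -> "a"
// after Next():  rawCursor=1, nextCursor=2, cursor=1    Peek() -> "b"
//                (advanceToNonElided skipped the elided whitespace at index 1)
// after Next():  rawCursor=3, nextCursor=3, cursor=2    Peek() -> EOF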

@@ -85,42 +101,33 @@
 // PeekAny peeks forward over elided and non-elided tokens.
 //
 // The returned RawCursor position is the location of the returned token.
 // Use FastForward to move the internal cursors forward.
 func (p *PeekingLexer) PeekAny(match func(Token) bool) (t Token, rawCursor RawCursor) {
-    tokenCount := RawCursor(len(p.tokens))
-    for i := p.rawCursor; i < tokenCount; i++ {
+    for i := p.rawCursor; ; i++ {
         t = p.tokens[i]
-        if match(t) || !p.elide[t.Type] {
+        if t.EOF() || match(t) || !p.elide[t.Type] {
             return t, i
         }
     }
-    return p.eof, tokenCount
 }
 
 // FastForward the internal cursors to this RawCursor position.
 func (p *PeekingLexer) FastForward(rawCursor RawCursor) {
-    tokenCount := RawCursor(len(p.tokens))
-    for ; p.rawCursor <= rawCursor && p.rawCursor < tokenCount; p.rawCursor++ {
-        t := p.tokens[p.rawCursor]
-        if p.elide[t.Type] {
-            continue
-        }
-        p.cursor++
+    for ; p.rawCursor <= rawCursor; p.rawCursor++ {
+        t := &p.tokens[p.rawCursor]
+        if t.EOF() {
+            break
+        }
+        if !p.elide[t.Type] {
+            p.cursor++
+        }
     }
+    p.nextCursor = p.rawCursor
+    p.advanceToNonElided()
 }
 
-// RawPeek peeks ahead at the next raw token.
-//
-// Unlike Peek, this will include elided tokens.
-func (p *PeekingLexer) RawPeek() *Token {
-    if int(p.rawCursor) < len(p.tokens) {
-        return &p.tokens[p.rawCursor]
-    }
-    return &p.eof
-}
-
-// Clone creates a clone of this PeekingLexer at its current token.
-//
-// The parent and clone are completely independent.
-func (p *PeekingLexer) Clone() *PeekingLexer {
-    clone := *p
-    return &clone
+func (p *PeekingLexer) MakeCheckpoint() Checkpoint {
+    return p.Checkpoint
+}
+
+func (p *PeekingLexer) LoadCheckpoint(checkpoint Checkpoint) {
+    p.Checkpoint = checkpoint
 }
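
PeekAny keeps its contract but now leans on the EOF sentinel, and FastForward has
to re-establish nextCursor afterwards. A hedged sketch of how the pair is used
(hypothetical helper, not commit code; assumes the participle v2 lexer import):

// tryConsumeElided consumes the next raw token if it has the given type, even
// when that type is normally elided. PeekAny returns the first token that matches
// or is non-elided; FastForward then consumes through its raw position.
func tryConsumeElided(plex *lexer.PeekingLexer, typ lexer.TokenType) bool {
    tok, raw := plex.PeekAny(func(t lexer.Token) bool { return t.Type == typ })
    if tok.Type != typ {
        return false // the next significant token is something else (possibly EOF)
    }
    plex.FastForward(raw) // cursors end up just past the consumed token
    return true
}

MakeCheckpoint and LoadCheckpoint are deliberately thin: Checkpoint is an exported
embedded field, so plex.MakeCheckpoint() and plex.LoadCheckpoint(cp) are
equivalent to reading and assigning plex.Checkpoint directly, which is what the
updated test below does.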

lexer/peek_test.go: 25 changes (22 additions, 3 deletions)

@@ -4,6 +4,7 @@ import (
     "testing"
 
     require "github.com/alecthomas/assert/v2"
+
     "github.com/alecthomas/participle/v2/lexer"
 )

@@ -32,7 +33,7 @@ func TestUpgrade(t *testing.T) {
     require.Equal(t, tokens, l.Range(0, 3))
 }
 
-func TestPeekAndNextAny(t *testing.T) {
+func TestPeekingLexer_Peek_Next_Checkpoint(t *testing.T) {
     slexdef := lexer.MustSimple([]lexer.SimpleRule{
         {"Ident", `\w+`},
         {"Whitespace", `\s+`},

@@ -48,7 +49,25 @@
         {Type: -3, Value: " ", Pos: lexer.Position{Line: 1, Column: 12, Offset: 11}},
         {Type: -2, Value: "last", Pos: lexer.Position{Line: 1, Column: 13, Offset: 12}},
     }
-    tok := plex.Next()
-    require.Equal(t, expected[0], *tok)
+    checkpoint := plex.Checkpoint
+    require.Equal(t, expected[0], *plex.Next())
+    require.Equal(t, expected[2], *plex.Peek(), "should have skipped whitespace")
+    plex.Checkpoint = checkpoint
+    require.Equal(t, expected[0], *plex.Peek(), "should have reverted to pre-Next state")
+}
+
+func BenchmarkPeekingLexer_Peek(b *testing.B) {
+    tokens := []lexer.Token{{Type: 1, Value: "x"}, {Type: 3, Value: " "}, {Type: 2, Value: "y"}}
+    l, err := lexer.Upgrade(&staticLexer{tokens: tokens}, 3)
+    require.NoError(b, err)
+    l.Next()
+    t := l.Peek()
+    b.ResetTimer()
+    for i := 0; i < b.N; i++ {
+        t = l.Peek()
+        if t.EOF() {
+            return
+        }
+    }
+    require.Equal(b, lexer.Token{Type: 2, Value: "y"}, *t)
+}
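
To reproduce the "over 3x faster" claim from the commit message, this benchmark
can be run before and after the change with the standard toolchain, e.g.:

    go test -bench=BenchmarkPeekingLexer_Peek -benchmem ./lexer

(-bench and -benchmem are stock go test flags; absolute numbers will vary by
machine.)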

nodes.go: 4 changes (2 additions, 2 deletions)

@@ -63,7 +63,7 @@ func (p *parseable) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) {
     defer ctx.printTrace(p)()
     rv := reflect.New(p.t)
     v := rv.Interface().(Parseable)
-    err = v.Parse(ctx.PeekingLexer)
+    err = v.Parse(&ctx.PeekingLexer)
     if err != nil {
         if err == NextMatch {
             return nil, nil

@@ -84,7 +84,7 @@ func (c *custom) GoString() string { return c.typ.Name() }
 
 func (c *custom) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) {
     defer ctx.printTrace(c)()
-    results := c.parseFn.Call([]reflect.Value{reflect.ValueOf(ctx.PeekingLexer)})
+    results := c.parseFn.Call([]reflect.Value{reflect.ValueOf(&ctx.PeekingLexer)})
     if err, _ := results[1].Interface().(error); err != nil {
         if err == NextMatch {
             return nil, nil

parser.go: 4 changes (2 additions, 2 deletions)

@@ -169,7 +169,7 @@ func (p *Parser[G]) ParseFromLexer(lex *lexer.PeekingLexer, options ...ParseOption) error {
         }
     }
     ctx := newParseContext(lex, p.useLookahead, caseInsensitive)
-    defer func() { *lex = *ctx.PeekingLexer }()
+    defer func() { *lex = ctx.PeekingLexer }()
     for _, option := range options {
         option(ctx)
     }

@@ -268,7 +268,7 @@ func (p *Parser[G]) parseInto(ctx *parseContext, parseNode node, rv reflect.Value) error {
 }
 
 func (p *Parser[G]) rootParseable(ctx *parseContext, parseable Parseable) error {
-    if err := parseable.Parse(ctx.PeekingLexer); err != nil {
+    if err := parseable.Parse(&ctx.PeekingLexer); err != nil {
         if err == NextMatch {
             err = &UnexpectedTokenError{Unexpected: *ctx.Peek()}
         } else {
