mirror of
https://github.com/rwxrob/bonzai
synced 2024-11-16 21:25:44 +00:00
Initial scanner/parser working
This commit is contained in:
parent
8849731a9b
commit
5b34312df0
70
parser/cur.go
Normal file
70
parser/cur.go
Normal file
@ -0,0 +1,70 @@
|
||||
// Copyright 2022 Robert S. Muhlestein.
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
package parser
|
||||
|
||||
import "fmt"
|
||||
|
||||
// Pos contains the user-land position for reporting back when there is
|
||||
// a problem with parsing. This is different from internal position
|
||||
// information used to parse from the internal data buffer. The
|
||||
// Parser.NewLine() method should always increment the Pos.Line and set
|
||||
// Pos.LineRune and Pos.LineByte.
|
||||
type Pos struct {
|
||||
Line int // lines (rows) starting at 1
|
||||
LineRune int // offset rune in line starting at 1
|
||||
LineByte int // offset byte in line starting at 1
|
||||
Rune int // offset rune pos starting with 1
|
||||
}
|
||||
|
||||
// Cur is a cursor structure that points to specific position within
|
||||
// buffered data. An internal Cur is used within the Parser. Cursors
|
||||
// are returned by methods of Parser implementations. Cursors must be
|
||||
// set to EOD and have Len set to 0 if the Parser is asked to read
|
||||
// beyond the end of the data. Manipulating the values of a Curs
|
||||
// directly is strongly discouraged (but not worth the performance hit
|
||||
// of stopping it with interface encapsulation).
|
||||
type Cur struct {
|
||||
Pos
|
||||
Rune rune // last rune decoded
|
||||
Byte int // beginning of last rune decoded
|
||||
Len int // length of last rune decoded (0-4)
|
||||
Next int // beginning of next rune to decode
|
||||
}
|
||||
|
||||
// NewLine forces the internal cursor position to increment it's
|
||||
// user-land line counter and reset LineRune and LineByte to 1. This
|
||||
// enables specific grammar parser implementations to explicitly create
|
||||
// a new line as defined by that grammar.
|
||||
func (c *Cur) NewLine() {
|
||||
c.Pos.Line++
|
||||
c.Pos.LineRune = 1
|
||||
c.Pos.LineByte = 1
|
||||
}
|
||||
|
||||
// String fulfills the fmt.Stringer interface by printing
|
||||
// the Cursor's location in a human-friendly way:
|
||||
//
|
||||
// U+1F47F '👿' 1,3-5 (3-5)
|
||||
// | | | | |
|
||||
// line | | | overall byte offset
|
||||
// line rune offset | overall rune offset
|
||||
// line byte offset
|
||||
//
|
||||
func (c *Cur) String() string {
|
||||
if c == nil {
|
||||
return "<nil>"
|
||||
}
|
||||
if c.Rune == EOD {
|
||||
return "<EOD>"
|
||||
}
|
||||
s := fmt.Sprintf(`%U %q %v,%v-%v (%v-%v)`,
|
||||
c.Rune, c.Rune,
|
||||
c.Pos.Line, c.Pos.LineRune, c.Pos.LineByte,
|
||||
c.Pos.Rune, c.Byte+1,
|
||||
)
|
||||
return s
|
||||
}
|
||||
|
||||
// Print prints the cursor itself in String form. See String.
|
||||
func (c *Cur) Print() { fmt.Println(c.String()) }
|
10
parser/cur_test.go
Normal file
10
parser/cur_test.go
Normal file
@ -0,0 +1,10 @@
|
||||
package parser_test
|
||||
|
||||
import "github.com/rwxrob/bonzai/parser"
|
||||
|
||||
func ExampleCur() {
|
||||
m := new(parser.Cur)
|
||||
m.Print()
|
||||
//Output:
|
||||
// U+0000 '\x00' 0,0-0 (0-1)
|
||||
}
|
356
parser/parser.go
Normal file
356
parser/parser.go
Normal file
@ -0,0 +1,356 @@
|
||||
package parser
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// Parser must implement a parser with all the trimmings. See New for
|
||||
// a creating a specific instance of Parser.
|
||||
type Parser interface {
|
||||
Check(ms ...any) (*Cur, error)
|
||||
Cur() *Cur
|
||||
CopyCur() *Cur
|
||||
Done() bool
|
||||
ErrorExpected(this any) error
|
||||
Expect(ms ...any) (*Cur, error)
|
||||
Init(i any)
|
||||
Jump(m *Cur)
|
||||
Move(n int)
|
||||
NewLine()
|
||||
Next()
|
||||
Look(to *Cur) string
|
||||
LookSlice(beg *Cur, end *Cur) string
|
||||
Print()
|
||||
String() string
|
||||
}
|
||||
|
||||
// EOD is a special value that is returned when the end of data is
|
||||
// reached enabling functional parser functions to look for it reliably
|
||||
// no matter what is being parsed.
|
||||
const EOD = 1<<31 - 1 // max int32
|
||||
|
||||
// see Parser interface
|
||||
type parser struct {
|
||||
in io.Reader
|
||||
buf []byte
|
||||
cur *Cur
|
||||
}
|
||||
|
||||
// New returns a newly initialized rune-centric Parser with support for
|
||||
// parsing data from io.Reader, string, and []byte types. See the Parser
|
||||
// interface and Init method.
|
||||
func New(i interface{}) *parser {
|
||||
p := new(parser)
|
||||
if err := p.Init(i); err != nil {
|
||||
return p
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Init reads all of passed parsable (io.Reader, string, []byte) into
|
||||
// memory, parses the first rune, and sets the internals of parser
|
||||
// appropriately returning an error if anything happens while attempting
|
||||
// to read and buffer the data (OOM, etc.).
|
||||
func (p *parser) Init(i interface{}) error {
|
||||
if err := p.buffer(i); err != nil {
|
||||
return err
|
||||
}
|
||||
r, ln := utf8.DecodeRune(p.buf) // scan first
|
||||
if ln == 0 {
|
||||
r = EOD
|
||||
return fmt.Errorf("parser: failed to scan first rune")
|
||||
}
|
||||
p.cur = new(Cur)
|
||||
p.cur.Rune = r
|
||||
p.cur.Len = ln
|
||||
p.cur.Next = ln
|
||||
p.cur.Pos.Line = 1
|
||||
p.cur.Pos.LineRune = 1
|
||||
p.cur.Pos.LineByte = 1
|
||||
p.cur.Pos.Rune = 1
|
||||
return nil
|
||||
}
|
||||
|
||||
// reads and buffers io.Reader, string, or []byte types
|
||||
func (p *parser) buffer(i interface{}) error {
|
||||
var err error
|
||||
switch in := i.(type) {
|
||||
case io.Reader:
|
||||
p.buf, err = io.ReadAll(in)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
case string:
|
||||
p.buf = []byte(in)
|
||||
case []byte:
|
||||
p.buf = in
|
||||
default:
|
||||
return fmt.Errorf("parser: unsupported input type: %t", i)
|
||||
}
|
||||
if len(p.buf) == 0 {
|
||||
return fmt.Errorf("parser: no input")
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
// Next parses the next rune advancing a single rune forward or sets
|
||||
// current cursor rune to EOD if at end of data. Returns p.Done() if
|
||||
// attempted after already at end of data.
|
||||
func (p *parser) Next() {
|
||||
if p.Done() {
|
||||
return
|
||||
}
|
||||
r, ln := utf8.DecodeRune(p.buf[p.cur.Next:])
|
||||
if ln != 0 {
|
||||
p.cur.Byte = p.cur.Next
|
||||
p.cur.Pos.LineByte += p.cur.Len
|
||||
} else {
|
||||
r = EOD
|
||||
}
|
||||
p.cur.Rune = r
|
||||
p.cur.Pos.Rune += 1
|
||||
p.cur.Next += ln
|
||||
p.cur.Pos.LineRune += 1
|
||||
p.cur.Len = ln
|
||||
}
|
||||
func (p *parser) Move(n int) {
|
||||
for i := 0; i < n; i++ {
|
||||
p.Next()
|
||||
}
|
||||
}
|
||||
|
||||
// Done returns true if current cursor rune is EOD and the cursor length
|
||||
// is also zero.
|
||||
func (p *parser) Done() bool {
|
||||
return p.cur.Rune == EOD && p.cur.Len == 0
|
||||
}
|
||||
|
||||
// String delegates to internal cursor String.
|
||||
func (p *parser) String() string { return p.cur.String() }
|
||||
|
||||
// Print delegates to internal cursor Print.
|
||||
func (p *parser) Print() { p.cur.Print() }
|
||||
|
||||
// Cur returns exact cursor used by parser. See CopyCur and Cur struct.
|
||||
func (p *parser) Cur() *Cur { return p.cur }
|
||||
|
||||
// CopyCur returns a copy of the current parser cursor. See Cur.
|
||||
func (p *parser) CopyCur() *Cur {
|
||||
if p.cur == nil {
|
||||
return nil
|
||||
}
|
||||
// force a copy
|
||||
cp := *p.cur
|
||||
return &cp
|
||||
}
|
||||
|
||||
// Jump replaces the internal cursor with a copy of the one passed
|
||||
// effectively repositioning the parser's current position in the
|
||||
// buffered data.
|
||||
func (p *parser) Jump(c *Cur) { nc := *c; p.cur = &nc }
|
||||
|
||||
// Look returns a string containing all the bytes from the current
|
||||
// parser cursor position up to the passed cursor position, forward or
|
||||
// backward. Neither the internal nor the passed cursor position is
|
||||
// changed.
|
||||
func (p *parser) Look(to *Cur) string {
|
||||
if to.Byte < p.cur.Byte {
|
||||
return string(p.buf[to.Byte:p.cur.Next])
|
||||
}
|
||||
return string(p.buf[p.cur.Byte:to.Next])
|
||||
}
|
||||
|
||||
// LookSlice returns a string containing all the bytes from the first
|
||||
// cursor up to the second cursor. Neither cursor position is changed.
|
||||
func (p *parser) LookSlice(beg *Cur, end *Cur) string {
|
||||
return string(p.buf[beg.Byte:end.Next])
|
||||
}
|
||||
|
||||
// Expect takes a variable list of parsable types including rune,
|
||||
// string, Class, Check, Opt, Not, Seq, One, Min, MinMax, Count. This
|
||||
// allows grammars to be represented simply and parsed easily without
|
||||
// exceptional overhead from additional function calls and indirection.
|
||||
func (p *parser) Expect(ms ...interface{}) (*Cur, error) {
|
||||
var beg, end *Cur
|
||||
beg = p.Cur()
|
||||
for _, m := range ms {
|
||||
switch v := m.(type) {
|
||||
|
||||
case rune:
|
||||
if p.cur.Rune != v {
|
||||
err := p.ErrorExpected(m)
|
||||
p.Jump(beg)
|
||||
return nil, err
|
||||
}
|
||||
end = p.CopyCur()
|
||||
p.Next()
|
||||
|
||||
case string:
|
||||
if v == "" {
|
||||
return nil, fmt.Errorf("expect: cannot parse empty string")
|
||||
}
|
||||
for _, r := range []rune(v) {
|
||||
if p.cur.Rune != r {
|
||||
err := p.ErrorExpected(r)
|
||||
p.Jump(beg)
|
||||
return nil, err
|
||||
}
|
||||
end = p.CopyCur()
|
||||
p.Next()
|
||||
}
|
||||
/*
|
||||
case Class:
|
||||
if !v.Check(p.cur.Rune) {
|
||||
err := p.ErrorExpected(v)
|
||||
p.Jump(beg)
|
||||
return nil, err
|
||||
}
|
||||
end = p.CopyCur()
|
||||
p.Next()
|
||||
|
||||
case Check:
|
||||
rv, err := v.Check(p)
|
||||
if err != nil {
|
||||
p.Jump(beg)
|
||||
return nil, err
|
||||
}
|
||||
end = rv
|
||||
p.Jump(rv)
|
||||
p.Next()
|
||||
|
||||
case is.Opt:
|
||||
m, err := p.Expect(is.MinMax{v.This, 0, 1})
|
||||
if err != nil {
|
||||
p.Jump(beg)
|
||||
return nil, err
|
||||
}
|
||||
end = m
|
||||
|
||||
case is.Not:
|
||||
if _, e := p.Check(v.This); e == nil {
|
||||
err := p.ErrorExpected(v)
|
||||
p.Jump(beg)
|
||||
return nil, err
|
||||
}
|
||||
end = p.CopyCur()
|
||||
|
||||
case is.Min:
|
||||
c := 0
|
||||
last := p.CopyCur()
|
||||
var err error
|
||||
var m *Cur
|
||||
for {
|
||||
m, err = p.Expect(v.Match)
|
||||
if err != nil {
|
||||
break
|
||||
}
|
||||
last = m
|
||||
c++
|
||||
}
|
||||
if c < v.Min {
|
||||
p.Jump(beg)
|
||||
return nil, err
|
||||
}
|
||||
end = last
|
||||
|
||||
case is.Count:
|
||||
m, err := p.Expect(is.MinMax{v.Match, v.Count, v.Count})
|
||||
if err != nil {
|
||||
p.Jump(beg)
|
||||
return nil, err
|
||||
}
|
||||
end = m
|
||||
|
||||
case is.MinMax:
|
||||
c := 0
|
||||
last := p.CopyCur()
|
||||
var err error
|
||||
var m *Cur
|
||||
for {
|
||||
m, err = p.Expect(v.Match)
|
||||
if err != nil {
|
||||
break
|
||||
}
|
||||
last = m
|
||||
c++
|
||||
}
|
||||
if c == 0 && v.Min == 0 {
|
||||
if end == nil {
|
||||
end = last
|
||||
}
|
||||
continue
|
||||
}
|
||||
if !(v.Min <= c && c <= v.Max) {
|
||||
p.Jump(last)
|
||||
return nil, err
|
||||
}
|
||||
end = last
|
||||
|
||||
case is.Seq:
|
||||
m, err := p.Expect(v...)
|
||||
if err != nil {
|
||||
p.Jump(beg)
|
||||
return nil, err
|
||||
}
|
||||
end = m
|
||||
|
||||
case is.OneOf:
|
||||
var m *Cur
|
||||
var err error
|
||||
for _, i := range v {
|
||||
last := p.CopyCur()
|
||||
m, err = p.Expect(i)
|
||||
if err == nil {
|
||||
break
|
||||
}
|
||||
p.Jump(last)
|
||||
}
|
||||
if m == nil {
|
||||
return nil, p.ErrorExpected(v)
|
||||
}
|
||||
end = m
|
||||
*/
|
||||
|
||||
default:
|
||||
return nil, fmt.Errorf("expect: unsupported argument type (%T)", m)
|
||||
}
|
||||
}
|
||||
return end, nil
|
||||
}
|
||||
|
||||
func (p *parser) ErrorExpected(this interface{}) error {
|
||||
var msg string
|
||||
but := fmt.Sprintf(` but got %v`, p)
|
||||
if p.Done() {
|
||||
runes := `runes`
|
||||
if p.cur.Pos.Rune == 1 {
|
||||
runes = `rune`
|
||||
}
|
||||
but = fmt.Sprintf(` but exceeded data length (%v %v)`, p.cur.Pos.Rune, runes)
|
||||
}
|
||||
// TODO add verbose errors for *all* types in Grammar
|
||||
switch v := this.(type) {
|
||||
case string:
|
||||
msg = fmt.Sprintf(`expected string %q`, v)
|
||||
case rune:
|
||||
msg = fmt.Sprintf(`expected rune %q`, v)
|
||||
/*
|
||||
case Class:
|
||||
msg = fmt.Sprintf(`expected class %v (%v)`, v.Ident(), v.Desc())
|
||||
default:
|
||||
msg = fmt.Sprintf(`expected %T %q`, v, v)
|
||||
*/
|
||||
}
|
||||
return errors.New(msg + but)
|
||||
}
|
||||
|
||||
// NewLine delegates to interval Curs.NewLine.
|
||||
func (p *parser) NewLine() { p.cur.NewLine() }
|
||||
|
||||
func (p *parser) Check(ms ...interface{}) (*Cur, error) {
|
||||
defer p.Jump(p.CopyCur())
|
||||
return p.Expect(ms...)
|
||||
}
|
Loading…
Reference in New Issue
Block a user