Newer
Older
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Regular expression library.
package regexp
import os "os"
export var ErrUnimplemented = os.NewError("unimplemented");
export var ErrInternal = os.NewError("internal error");
export var ErrUnmatchedLpar = os.NewError("unmatched '('");
export var ErrUnmatchedRpar = os.NewError("unmatched ')'");
export var ErrExtraneousBackslash = os.NewError("extraneous backslash");
export var ErrEmpty = os.NewError("empty subexpression or alternation");
export var ErrBadClosure = os.NewError("repeated closure (**, ++, etc.)");
export var ErrBareClosure = os.NewError("closure applies to nothing");
export var ErrBadBackslash = os.NewError("illegal backslash escape");
// An instruction executed by the NFA
type Inst interface {
Type() int; // the type of this instruction: CHAR, ANY, etc.
Next() Inst; // the instruction to execute after this one
SetNext(i Inst);
Print(ind string);
}
type RE struct {
expr string; // the original expression
ch *chan<- *RE; // reply channel when we're done
error *os.Error; // compile- or run-time error; nil if OK
start Inst;
}
const (
START // beginning of program: marker to start
= iota;
END; // end of program: success
BOT; // '^' beginning of text
EOT; // '$' end of text
CHAR; // 'a' regular character
ANY; // '.' any character
BRA; // '(' parenthesized expression
EBRA; // ')'; end of '(' parenthesized expression
ALT; // '|' alternation
NOP; // do nothing; makes it easy to link without patching
)
// --- START start of program
type Start struct {
}
func (start *Start) Type() int { return START }
func (start *Start) Next() Inst { return start.next }
func (start *Start) SetNext(i Inst) { start.next = i }
func (start *Start) Print(ind string) { print(ind, "start") }
// --- END end of program
type End struct {
}
func (end *End) Type() int { return END }
func (end *End) Next() Inst { return end.next }
func (end *End) SetNext(i Inst) { end.next = i }
func (end *End) Print(ind string) { print(ind, "end") }
// --- BOT beginning of text
type Bot struct {
}
func (bot *Bot) Type() int { return BOT }
func (bot *Bot) Next() Inst { return bot.next }
func (bot *Bot) SetNext(i Inst) { bot.next = i }
func (bot *Bot) Print(ind string) { print(ind, "bot") }
// --- EOT end of text
type Eot struct {
}
func (eot *Eot) Type() int { return EOT }
func (eot *Eot) Next() Inst { return eot.next }
func (eot *Eot) SetNext(i Inst) { eot.next = i }
func (eot *Eot) Print(ind string) { print(ind, "eot") }
// --- CHAR a regular character
type Char struct {
char int;
set bool;
}
func (char *Char) Type() int { return CHAR }
func (char *Char) Next() Inst { return char.next }
func (char *Char) SetNext(i Inst) { char.next = i }
func (char *Char) Print(ind string) { print(ind, "char ", string(char.char)) }
func NewChar(char int) *Char {
c := new(Char);
c.char = char;
return c;
}
// --- ANY any character
type Any struct {
}
func (any *Any) Type() int { return ANY }
func (any *Any) Next() Inst { return any.next }
func (any *Any) SetNext(i Inst) { any.next = i }
func (any *Any) Print(ind string) { print(ind, "any") }
// --- BRA parenthesized expression
type Bra struct {
next Inst;
n int; // subexpression number
}
func (bra *Bra) Type() int { return BRA }
func (bra *Bra) Next() Inst { return bra.next }
func (bra *Bra) SetNext(i Inst) { bra.next = i }
func (bra *Bra) Print(ind string) { print(ind , "bra"); }
// --- EBRA end of parenthesized expression
type Ebra struct {
n int; // subexpression number
}
func (ebra *Ebra) Type() int { return BRA }
func (ebra *Ebra) Next() Inst { return ebra.next }
func (ebra *Ebra) SetNext(i Inst) { ebra.next = i }
func (ebra *Ebra) Print(ind string) { print(ind , "ebra ", ebra.n); }
// --- ALT alternation
type Alt struct {
next Inst;
left Inst; // other branch
}
func (alt *Alt) Type() int { return ALT }
func (alt *Alt) Next() Inst { return alt.next }
func (alt *Alt) SetNext(i Inst) { alt.next = i }
func (alt *Alt) Print(ind string) { print(ind , "alt(", alt.left, ")"); }
// --- NOP no operation
type Nop struct {
}
func (nop *Nop) Type() int { return NOP }
func (nop *Nop) Next() Inst { return nop.next }
func (nop *Nop) SetNext(i Inst) { nop.next = i }
func (nop *Nop) Print(ind string) { print(ind, "nop") }
// report error and exit compiling/executing goroutine
func (re *RE) Error(err *os.Error) {
re.error = err;
re.ch <- re;
sys.goexit();
}
type Parser struct {
re *RE;
nbra int; // number of brackets in expression, for subexpressions
nlpar int; // number of unclosed lpars
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
pos int;
ch int;
}
const EOF = -1
func (p *Parser) c() int {
return p.ch;
}
func (p *Parser) nextc() int {
if p.pos >= len(p.re.expr) {
p.ch = EOF
} else {
c, w := sys.stringtorune(p.re.expr, p.pos); // TODO: stringotorune shoudl take a string*
p.ch = c;
p.pos += w;
}
return p.ch;
}
func NewParser(re *RE) *Parser {
parser := new(Parser);
parser.re = re;
parser.nextc(); // load p.ch
return parser;
}
/*
Grammar:
regexp:
concatenation { '|' concatenation }
concatenation:
{ closure }
closure:
term { '*' | '+' | '?' }
term:
'.'
character
characterclass
'(' regexp ')'
*/
func (p *Parser) Regexp() (start, end Inst)
var NULL Inst
type BUGinter interface{}
// same as i == NULL. TODO: remove when 6g lets me do i == NULL
func isNULL(i Inst) bool {
return sys.BUG_intereq(i.(BUGinter), NULL.(BUGinter))
}
// same as i == j. TODO: remove when 6g lets me do i == j
func isEQ(i,j Inst) bool {
return sys.BUG_intereq(i.(BUGinter), j.(BUGinter))
}
func special(c int) bool {
s := `\.+*?()|[-]`;
for i := 0; i < len(s); i++ {
if c == int(s[i]) {
return true
}
}
return false
}
func (p *Parser) Term() (start, end Inst) {
return NULL, NULL;
case '*', '+', '|':
p.re.Error(ErrBareClosure);
if p.nlpar == 0 {
p.re.Error(ErrUnmatchedRpar);
}
return NULL, NULL;
return start, start;
case '(':
p.nextc();
start, end = p.Regexp();
if p.c() != ')' {
p.re.Error(ErrUnmatchedLpar);
}
p.nextc();
p.nbra++;
bra := new(Bra);
ebra := new(Ebra);
if isNULL(start) {
if !isNULL(end) { p.re.Error(ErrInternal) }
start = ebra
case '\\':
c = p.nextc();
switch {
case c == EOF:
p.re.Error(ErrExtraneousBackslash);
case c == 'n':
c = '\n';
case special(c):
// c is as delivered
default:
p.re.Error(ErrBadBackslash);
}
fallthrough;
default:
p.nextc();
return start, start
}
panic("unreachable");
}
func (p *Parser) Closure() (start, end Inst) {
return start, end
}
switch p.c() {
case '*':
// (start,end)*:
alt := new(Alt);
end.SetNext(alt); // after end, do alt
alt.left = start; // alternate brach: return to start
start = alt; // alt becomes new (start, end)
end = alt;
case '+':
// (start,end)+:
alt := new(Alt);
end.SetNext(alt); // after end, do alt
alt.left = start; // alternate brach: return to start
end = alt; // start is unchanged; end is alt
case '?':
// (start,end)?:
alt := new(Alt);
nop := new(Nop);
alt.left = start; // alternate branch is start
alt.next = nop; // follow on to nop
end.SetNext(nop); // after end, go to nop
start = alt; // start is now alt
end = nop; // end is nop pointed to by both branches
default:
return start, end;
}
switch p.nextc() {
case '*', '+', '?':
p.re.Error(ErrBadClosure);
}
return start, end;
}
func (p *Parser) Concatenation() (start, end Inst) {
switch p.c() {
case '|', ')', EOF:
if isNULL(start) { // this is the empty string
nop := new(Nop);
return nop, nop;
}
return start, end;
}
nstart, nend := p.Closure();
switch {
case isNULL(nstart): // end of this concatenation
case isNULL(start): // this is first element of concatenation
start, end = nstart, nend;
default:
end = nend;
}
}
panic("unreachable");
}
func (p *Parser) Regexp() (start, end Inst) {
return NULL, NULL
}
for {
switch p.c() {
default:
return start, end;
case '|':
p.nextc();
nstart, nend := p.Concatenation();
// xyz|(nothing) is xyz or nop
if isNULL(nstart) {
nop := new(Nop);
nstart, nend = nop, nop;
}
alt := new(Alt);
alt.left = start;
alt.next = nstart;
nop := new(Nop);
end.SetNext(nop);
nend.SetNext(nop);
start, end = alt, nop;
}
}
panic("unreachable");
}
func UnNop(i Inst) Inst {
for i.Type() == NOP {
i = i.Next()
func (re *RE) EliminateNops(start Inst) {
for i := start; i.Type() != END; i = i.Next() { // last one is END
i.SetNext(UnNop(i.Next()));
if i.Type() == ALT {
alt := i.(*Alt);
alt.left = UnNop(alt.left);
re.EliminateNops(alt.left);
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
// use a 'done' array to know where we've already printed.
// the output is not pretty but it is serviceable.
func (re *RE) Dump(ind string, inst Inst, done *[]Inst) {
// see if we've been here, and mark it
for i := 0; i < len(done); i++ {
if isEQ(inst, done[i]) {
print(ind, inst, ": -> ", inst.Next(), "...\n");
return;
}
}
slot := len(done);
done= done[0:slot+1];
done[slot] = inst;
if isNULL(inst) {
println("NULL");
return;
}
if inst.Type() == END { print(inst, ": END\n"); return }
print(ind, inst, ": ");
inst.Print("");
print(" -> ", inst.Next(), "\n");
switch inst.Type() {
case END:
return;
case ALT:
re.Dump(ind + "\t", inst.(*Alt).left, done);
}
re.Dump(ind, inst.Next(), done);
}
func (re *RE) DumpAll() {
re.Dump("", re.start, new([]Inst, 1000)[0:0]);
}
func (re *RE) DoParse() {
parser := NewParser(re);
start := new(Start);
s, e := parser.Regexp();
if isNULL(s) {
if !isNULL(e) { re.Error(ErrInternal) }
e = start;
re.start = start;
e.SetNext(new(End));
re.EliminateNops(re.start);
println();
re.Error(ErrUnimplemented);
}
func Compiler(str string, ch *chan *RE) {
re := new(RE);
re.expr = str;
re.ch = ch;
re.DoParse();
ch <- re;
}
// Public interface has only execute functionality (not yet implemented)
export type Regexp interface {
// Execute() bool
}
// compile in separate goroutine; wait for result
export func Compile(str string) (regexp Regexp, error *os.Error) {
ch := new(chan *RE);
go Compiler(str, ch);
re := <-ch;
return re, re.error
}