Newer
Older
if min < 0 || min > 1000 || max > 1000 || max >= 0 && min > max {
// Numbers were too big, or max is present and min > max.
return nil, &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]}
if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil {
repeat = before
t = after
if p.flags&PerlX != 0 && len(t) >= 2 {
switch t[1] {
case 'A':
p.op(OpBeginText)
t = t[2:]
break BigSwitch
case 'b':
p.op(OpWordBoundary)
t = t[2:]
break BigSwitch
case 'B':
p.op(OpNoWordBoundary)
t = t[2:]
break BigSwitch
case 'C':
// any byte; not supported
return nil, &Error{ErrInvalidEscape, t[:2]}
case 'Q':
// \Q ... \E: the ... is always literals
var lit string
for lit != "" {
c, rest, err := nextRune(lit)
if err != nil {
return nil, err
}
p.literal(c)
lit = rest
}
break BigSwitch
case 'z':
p.op(OpEndText)
t = t[2:]
break BigSwitch
}
}
re := p.newRegexp(OpCharClass)
re.Flags = p.flags
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
// Look for Unicode character group like \p{Han}
if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') {
r, rest, err := p.parseUnicodeClass(t, re.Rune0[:0])
if err != nil {
return nil, err
}
if r != nil {
re.Rune = r
t = rest
p.push(re)
break BigSwitch
}
}
// Perl character class escape.
if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil {
re.Rune = r
t = rest
p.push(re)
break BigSwitch
}
// Ordinary single-character escape.
if c, t, err = p.parseEscape(t); err != nil {
return nil, err
}
p.literal(c)
}
p.concat()
if p.swapVerticalBar() {
// pop vertical bar
p.stack = p.stack[:len(p.stack)-1]
}
p.alternate()
n := len(p.stack)
if n != 1 {
return nil, &Error{ErrMissingParen, s}
}
return p.stack[0], nil
}
// parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}.
// If s is not of that form, it returns ok == false.
// If s has the right form but the values are too big, it returns min == -1, ok == true.
func (p *parser) parseRepeat(s string) (min, max int, rest string, ok bool) {
if s == "" || s[0] != '{' {
return
}
s = s[1:]
var ok1 bool
if min, s, ok1 = p.parseInt(s); !ok1 {
return
}
if s == "" {
return
}
if s[0] != ',' {
max = min
} else {
s = s[1:]
if s == "" {
return
}
if s[0] == '}' {
max = -1
} else if max, s, ok1 = p.parseInt(s); !ok1 {
} else if max < 0 {
// parseInt found too big a number
min = -1
}
}
if s == "" || s[0] != '}' {
return
}
rest = s[1:]
ok = true
return
}
// parsePerlFlags parses a Perl flag setting or non-capturing group or both,
// like (?i) or (?: or (?i:. It removes the prefix from s and updates the parse state.
// The caller must have ensured that s begins with "(?".
func (p *parser) parsePerlFlags(s string) (rest string, err error) {
t := s
// Check for named captures, first introduced in Python's regexp library.
// As usual, there are three slightly different syntaxes:
//
// (?P<name>expr) the original, introduced by Python
// (?<name>expr) the .NET alteration, adopted by Perl 5.10
// (?'name'expr) another .NET alteration, adopted by Perl 5.10
//
// Perl 5.10 gave in and implemented the Python version too,
// but they claim that the last two are the preferred forms.
// PCRE and languages based on it (specifically, PHP and Ruby)
// support all three as well. EcmaScript 4 uses only the Python form.
//
// In both the open source world (via Code Search) and the
// Google source tree, (?P<expr>name) and (?<expr>name) are the
// dominant forms of named captures and both are supported.
startsWithP := len(t) > 4 && t[2] == 'P' && t[3] == '<'
startsWithName := len(t) > 3 && t[2] == '<'
if startsWithP || startsWithName {
// position of expr start
exprStartPos := 4
if startsWithName {
exprStartPos = 3
}
// Pull out name.
end := strings.IndexRune(t, '>')
if end < 0 {
if err = checkUTF8(t); err != nil {
return "", err
}
return "", &Error{ErrInvalidNamedCapture, s}
}
capture := t[:end+1] // "(?P<name>" or "(?<name>"
name := t[exprStartPos:end] // "name"
if err = checkUTF8(name); err != nil {
return "", err
}
if !isValidCaptureName(name) {
return "", &Error{ErrInvalidNamedCapture, capture}
}
// Like ordinary capture, but named.
p.numCap++
re := p.op(opLeftParen)
re.Cap = p.numCap
re.Name = name
return t[end+1:], nil
}
// Non-capturing group. Might also twiddle Perl flags.
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
t = t[2:] // skip (?
flags := p.flags
sign := +1
sawFlag := false
Loop:
for t != "" {
if c, t, err = nextRune(t); err != nil {
return "", err
}
switch c {
default:
break Loop
// Flags.
case 'i':
flags |= FoldCase
sawFlag = true
case 'm':
flags &^= OneLine
sawFlag = true
case 's':
flags |= DotNL
sawFlag = true
case 'U':
flags |= NonGreedy
sawFlag = true
// Switch to negation.
case '-':
if sign < 0 {
break Loop
}
sign = -1
// Invert flags so that | above turn into &^ and vice versa.
// We'll invert flags again before using it below.
flags = ^flags
sawFlag = false
// End of flags, starting group or not.
case ':', ')':
if sign < 0 {
if !sawFlag {
break Loop
}
flags = ^flags
}
if c == ':' {
// Open new group
p.op(opLeftParen)
}
p.flags = flags
return t, nil
}
}
return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]}
}
// isValidCaptureName reports whether name
// is a valid capture name: [A-Za-z0-9_]+.
// PCRE limits names to 32 bytes.
// Python rejects names starting with digits.
// We don't enforce either of those.
func isValidCaptureName(name string) bool {
if name == "" {
return false
}
for _, c := range name {
if c != '_' && !isalnum(c) {
return false
}
}
return true
}
// parseInt parses a decimal integer.
func (p *parser) parseInt(s string) (n int, rest string, ok bool) {
if s == "" || s[0] < '0' || '9' < s[0] {
return
}
// Disallow leading zeros.
if len(s) >= 2 && s[0] == '0' && '0' <= s[1] && s[1] <= '9' {
return
}
for s != "" && '0' <= s[0] && s[0] <= '9' {
s = s[1:]
}
rest = s
ok = true
// Have digits, compute value.
t = t[:len(t)-len(s)]
for i := 0; i < len(t); i++ {
// Avoid overflow.
if n >= 1e8 {
n = -1
break
}
n = n*10 + int(t[i]) - '0'
}
// can this be represented as a character class?
// single-rune literal string, char class, ., and .|\n.
func isCharClass(re *Regexp) bool {
return re.Op == OpLiteral && len(re.Rune) == 1 ||
re.Op == OpCharClass ||
re.Op == OpAnyCharNotNL ||
re.Op == OpAnyChar
}
// does re match r?
switch re.Op {
case OpLiteral:
return len(re.Rune) == 1 && re.Rune[0] == r
case OpCharClass:
for i := 0; i < len(re.Rune); i += 2 {
if re.Rune[i] <= r && r <= re.Rune[i+1] {
return true
}
}
return false
case OpAnyCharNotNL:
return r != '\n'
case OpAnyChar:
return true
}
return false
}
// parseVerticalBar handles a | in the input.
func (p *parser) parseVerticalBar() {
p.concat()
// The concatenation we just parsed is on top of the stack.
// If it sits above an opVerticalBar, swap it below
// (things below an opVerticalBar become an alternation).
// Otherwise, push a new vertical bar.
if !p.swapVerticalBar() {
p.op(opVerticalBar)
}
}
// mergeCharClass makes dst = dst|src.
// The caller must ensure that dst.Op >= src.Op,
// to reduce the amount of copying.
func mergeCharClass(dst, src *Regexp) {
switch dst.Op {
case OpAnyChar:
// src doesn't add anything.
case OpAnyCharNotNL:
// src might add \n
if matchRune(src, '\n') {
dst.Op = OpAnyChar
}
case OpCharClass:
// src is simpler, so either literal or char class
if src.Op == OpLiteral {
dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
} else {
dst.Rune = appendClass(dst.Rune, src.Rune)
}
case OpLiteral:
// both literal
if src.Rune[0] == dst.Rune[0] && src.Flags == dst.Flags {
break
}
dst.Op = OpCharClass
dst.Rune = appendLiteral(dst.Rune[:0], dst.Rune[0], dst.Flags)
dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
// If the top of the stack is an element followed by an opVerticalBar
// swapVerticalBar swaps the two and returns true.
// Otherwise it returns false.
func (p *parser) swapVerticalBar() bool {
// If above and below vertical bar are literal or char class,
// can merge into a single char class.
n := len(p.stack)
if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) {
re1 := p.stack[n-1]
re3 := p.stack[n-3]
// Make re3 the more complex of the two.
if re1.Op > re3.Op {
re1, re3 = re3, re1
p.stack[n-3] = re3
}
p.reuse(re1)
p.stack = p.stack[:n-1]
return true
}
if n >= 2 {
re1 := p.stack[n-1]
re2 := p.stack[n-2]
if re2.Op == opVerticalBar {
if n >= 3 {
// Now out of reach.
// Clean opportunistically.
cleanAlt(p.stack[n-3])
}
p.stack[n-2] = re1
p.stack[n-1] = re2
return true
}
}
return false
}
// parseRightParen handles a ) in the input.
func (p *parser) parseRightParen() error {
p.concat()
if p.swapVerticalBar() {
// pop vertical bar
p.stack = p.stack[:len(p.stack)-1]
}
p.alternate()
n := len(p.stack)
if n < 2 {
return &Error{ErrUnexpectedParen, p.wholeRegexp}
}
re1 := p.stack[n-1]
re2 := p.stack[n-2]
p.stack = p.stack[:n-2]
if re2.Op != opLeftParen {
return &Error{ErrUnexpectedParen, p.wholeRegexp}
// Restore flags at time of paren.
p.flags = re2.Flags
if re2.Cap == 0 {
// Just for grouping.
p.push(re1)
} else {
re2.Op = OpCapture
re2.Sub = re2.Sub0[:1]
re2.Sub[0] = re1
p.push(re2)
}
return nil
}
// parseEscape parses an escape sequence at the beginning of s
// and returns the rune.
func (p *parser) parseEscape(s string) (r rune, rest string, err error) {
t := s[1:]
if t == "" {
return 0, "", &Error{ErrTrailingBackslash, ""}
}
c, t, err := nextRune(t)
if err != nil {
return 0, "", err
}
Switch:
switch c {
default:
if c < utf8.RuneSelf && !isalnum(c) {
// Escaped non-word characters are always themselves.
// PCRE is not quite so rigorous: it accepts things like
// \q, but we don't. We once rejected \_, but too many
// programs and people insist on using it, so allow \_.
return c, t, nil
}
// Octal escapes.
case '1', '2', '3', '4', '5', '6', '7':
// Single non-zero digit is a backreference; not supported
if t == "" || t[0] < '0' || t[0] > '7' {
break
}
fallthrough
case '0':
// Consume up to three octal digits; already have one.
r = c - '0'
for i := 1; i < 3; i++ {
if t == "" || t[0] < '0' || t[0] > '7' {
break
}
t = t[1:]
}
return r, t, nil
// Hexadecimal escapes.
case 'x':
if t == "" {
break
}
if c, t, err = nextRune(t); err != nil {
return 0, "", err
}
if c == '{' {
// Any number of digits in braces.
// Perl accepts any text at all; it ignores all text
// after the first non-hex digit. We require only hex digits,
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
// and at least one.
nhex := 0
r = 0
for {
if t == "" {
break Switch
}
if c, t, err = nextRune(t); err != nil {
return 0, "", err
}
if c == '}' {
break
}
v := unhex(c)
if v < 0 {
break Switch
}
r = r*16 + v
if r > unicode.MaxRune {
break Switch
}
}
if nhex == 0 {
break Switch
}
return r, t, nil
}
// Easy case: two hex digits.
x := unhex(c)
if c, t, err = nextRune(t); err != nil {
return 0, "", err
}
y := unhex(c)
if x < 0 || y < 0 {
break
}
return x*16 + y, t, nil
// C escapes. There is no case 'b', to avoid misparsing
// the Perl word-boundary \b as the C backspace \b
// when in POSIX mode. In Perl, /\b/ means word-boundary
// but /[\b]/ means backspace. We don't support that.
// If you want a backspace, embed a literal backspace
// character or use \x08.
case 'a':
return '\a', t, err
case 'f':
return '\f', t, err
case 'n':
return '\n', t, err
case 'r':
return '\r', t, err
case 't':
return '\t', t, err
case 'v':
return '\v', t, err
}
return 0, "", &Error{ErrInvalidEscape, s[:len(s)-len(t)]}
}
// parseClassChar parses a character class character at the beginning of s
// and returns it.
func (p *parser) parseClassChar(s, wholeClass string) (r rune, rest string, err error) {
if s == "" {
return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass}
}
// Allow regular escape sequences even though
// many need not be escaped in this context.
if s[0] == '\\' {
return p.parseEscape(s)
}
return nextRune(s)
}
type charGroup struct {
sign int
//go:generate perl make_perl_groups.pl perl_groups.go
// parsePerlClassEscape parses a leading Perl character class escape like \d
// from the beginning of s. If one is present, it appends the characters to r
// and returns the new slice r and the remainder of the string.
func (p *parser) parsePerlClassEscape(s string, r []rune) (out []rune, rest string) {
if p.flags&PerlX == 0 || len(s) < 2 || s[0] != '\\' {
return
}
g := perlGroup[s[0:2]]
if g.sign == 0 {
return
}
return p.appendGroup(r, g), s[2:]
}
// parseNamedClass parses a leading POSIX named character class like [:alnum:]
// from the beginning of s. If one is present, it appends the characters to r
// and returns the new slice r and the remainder of the string.
func (p *parser) parseNamedClass(s string, r []rune) (out []rune, rest string, err error) {
if len(s) < 2 || s[0] != '[' || s[1] != ':' {
return
}
i := strings.Index(s[2:], ":]")
if i < 0 {
return
}
i += 2
name, s := s[0:i+2], s[i+2:]
g := posixGroup[name]
if g.sign == 0 {
return nil, "", &Error{ErrInvalidCharRange, name}
}
return p.appendGroup(r, g), s, nil
}
func (p *parser) appendGroup(r []rune, g charGroup) []rune {
if p.flags&FoldCase == 0 {
if g.sign < 0 {
r = appendNegatedClass(r, g.class)
} else {
r = appendClass(r, g.class)
}
tmp := p.tmpClass[:0]
tmp = appendFoldedClass(tmp, g.class)
p.tmpClass = tmp
tmp = cleanClass(&p.tmpClass)
if g.sign < 0 {
r = appendNegatedClass(r, tmp)
} else {
r = appendClass(r, tmp)
}
var anyTable = &unicode.RangeTable{
R16: []unicode.Range16{{Lo: 0, Hi: 1<<16 - 1, Stride: 1}},
R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}},
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
var asciiTable = &unicode.RangeTable{
R16: []unicode.Range16{{Lo: 0, Hi: 0x7F, Stride: 1}},
}
var asciiFoldTable = &unicode.RangeTable{
R16: []unicode.Range16{
{Lo: 0, Hi: 0x7F, Stride: 1},
{Lo: 0x017F, Hi: 0x017F, Stride: 1}, // Old English long s (ſ), folds to S/s.
{Lo: 0x212A, Hi: 0x212A, Stride: 1}, // Kelvin K, folds to K/k.
},
}
// categoryAliases is a lazily constructed copy of unicode.CategoryAliases
// but with the keys passed through canonicalName, to support inexact matches.
var categoryAliases struct {
once sync.Once
m map[string]string
}
// initCategoryAliases initializes categoryAliases by canonicalizing unicode.CategoryAliases.
func initCategoryAliases() {
categoryAliases.m = make(map[string]string)
for name, actual := range unicode.CategoryAliases {
categoryAliases.m[canonicalName(name)] = actual
}
}
// canonicalName returns the canonical lookup string for name.
// The canonical name has a leading uppercase letter and then lowercase letters,
// and it omits all underscores, spaces, and hyphens.
// (We could have used all lowercase, but this way most package unicode
// map keys are already canonical.)
func canonicalName(name string) string {
var b []byte
first := true
for i := range len(name) {
c := name[i]
switch {
case c == '_' || c == '-' || c == ' ':
c = ' '
case first:
if 'a' <= c && c <= 'z' {
c -= 'a' - 'A'
}
first = false
default:
if 'A' <= c && c <= 'Z' {
c += 'a' - 'A'
}
}
if b == nil {
if c == name[i] && c != ' ' {
// No changes so far, avoid allocating b.
continue
}
b = make([]byte, i, len(name))
copy(b, name[:i])
}
if c == ' ' {
continue
}
b = append(b, c)
}
if b == nil {
return name
}
return string(b)
}
// unicodeTable returns the unicode.RangeTable identified by name
// and the table of additional fold-equivalent code points.
// If sign < 0, the result should be inverted.
func unicodeTable(name string) (tab, fold *unicode.RangeTable, sign int) {
name = canonicalName(name)
// Special cases: Any, Assigned, and ASCII.
// Also LC is the only non-canonical Categories key, so handle it here.
switch name {
case "Any":
return anyTable, anyTable, +1
case "Assigned":
return unicode.Cn, unicode.Cn, -1 // invert Cn (unassigned)
case "Ascii":
return asciiTable, asciiFoldTable, +1
case "Lc":
return unicode.Categories["LC"], unicode.FoldCategory["LC"], +1
if t := unicode.Categories[name]; t != nil {
return t, unicode.FoldCategory[name], +1
}
if t := unicode.Scripts[name]; t != nil {
return t, unicode.FoldScript[name], +1
}
// unicode.CategoryAliases makes liberal use of underscores in its names
// (they are defined that way by Unicode), but we want to match ignoring
// the underscores, so make our own map with canonical names.
categoryAliases.once.Do(initCategoryAliases)
if actual := categoryAliases.m[name]; actual != "" {
t := unicode.Categories[actual]
return t, unicode.FoldCategory[actual], +1
return nil, nil, 0
}
// parseUnicodeClass parses a leading Unicode character class like \p{Han}
// from the beginning of s. If one is present, it appends the characters to r
// and returns the new slice r and the remainder of the string.
func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string, err error) {
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
if p.flags&UnicodeGroups == 0 || len(s) < 2 || s[0] != '\\' || s[1] != 'p' && s[1] != 'P' {
return
}
// Committed to parse or return error.
sign := +1
if s[1] == 'P' {
sign = -1
}
t := s[2:]
c, t, err := nextRune(t)
if err != nil {
return
}
var seq, name string
if c != '{' {
// Single-letter name.
seq = s[:len(s)-len(t)]
name = seq[2:]
} else {
// Name is in braces.
end := strings.IndexRune(s, '}')
if end < 0 {
if err = checkUTF8(s); err != nil {
return
}
return nil, "", &Error{ErrInvalidCharRange, s}
}
seq, t = s[:end+1], s[end+1:]
name = s[3:end]
if err = checkUTF8(name); err != nil {
return
}
}
// Group can have leading negation too. \p{^Han} == \P{Han}, \P{^Han} == \p{Han}.
if name != "" && name[0] == '^' {
sign = -sign
name = name[1:]
}
tab, fold, tsign := unicodeTable(name)
if tab == nil {
return nil, "", &Error{ErrInvalidCharRange, seq}
}
if tsign < 0 {
sign = -sign
}
if p.flags&FoldCase == 0 || fold == nil {
if sign > 0 {
r = appendTable(r, tab)
} else {
r = appendNegatedTable(r, tab)
}
// Merge and clean tab and fold in a temporary buffer.
// This is necessary for the negative case and just tidy
// for the positive case.
tmp := p.tmpClass[:0]
tmp = appendTable(tmp, tab)
tmp = appendTable(tmp, fold)
p.tmpClass = tmp
tmp = cleanClass(&p.tmpClass)
if sign > 0 {
r = appendClass(r, tmp)
} else {
r = appendNegatedClass(r, tmp)
}
}
return r, t, nil
}
// parseClass parses a character class at the beginning of s
// and pushes it onto the parse stack.
func (p *parser) parseClass(s string) (rest string, err error) {
re := p.newRegexp(OpCharClass)
re.Flags = p.flags
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
re.Rune = re.Rune0[:0]
sign := +1
if t != "" && t[0] == '^' {
sign = -1
t = t[1:]
// If character class does not match \n, add it here,
// so that negation later will do the right thing.
if p.flags&ClassNL == 0 {
re.Rune = append(re.Rune, '\n', '\n')
}
}
class := re.Rune
first := true // ] and - are okay as first char in class
for t == "" || t[0] != ']' || first {
// POSIX: - is only okay unescaped as first or last in class.
// Perl: - is okay anywhere.
if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') {
_, size := utf8.DecodeRuneInString(t[1:])
return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]}
}
first = false
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
// Look for POSIX [:alnum:] etc.
if len(t) > 2 && t[0] == '[' && t[1] == ':' {
nclass, nt, err := p.parseNamedClass(t, class)
if err != nil {
return "", err
}
if nclass != nil {
class, t = nclass, nt
continue
}
}
// Look for Unicode character group like \p{Han}.
nclass, nt, err := p.parseUnicodeClass(t, class)
if err != nil {
return "", err
}
if nclass != nil {
class, t = nclass, nt
continue
}
// Look for Perl character class symbols (extension).
if nclass, nt := p.parsePerlClassEscape(t, class); nclass != nil {
class, t = nclass, nt
continue
}
// Single character or simple range.
rng := t
if lo, t, err = p.parseClassChar(t, s); err != nil {
return "", err
}
hi = lo
// [a-] means (a|-) so check for final ].
if len(t) >= 2 && t[0] == '-' && t[1] != ']' {
t = t[1:]
if hi, t, err = p.parseClassChar(t, s); err != nil {
return "", err
}
if hi < lo {
rng = rng[:len(rng)-len(t)]
return "", &Error{Code: ErrInvalidCharRange, Expr: rng}
}
}
if p.flags&FoldCase == 0 {
class = appendRange(class, lo, hi)
} else {
class = appendFoldedRange(class, lo, hi)
}
}
t = t[1:] // chop ]
// Use &re.Rune instead of &class to avoid allocation.
re.Rune = class
class = cleanClass(&re.Rune)
if sign < 0 {
class = negateClass(class)
}
re.Rune = class
p.push(re)
return t, nil
}
// cleanClass sorts the ranges (pairs of elements of r),
// merges them, and eliminates duplicates.
// Sort by lo increasing, hi decreasing to break ties.
sort.Sort(ranges{rp})
r := *rp
if len(r) < 2 {
return r
}
// Merge abutting, overlapping.
w := 2 // write index
for i := 2; i < len(r); i += 2 {
lo, hi := r[i], r[i+1]
if lo <= r[w-1]+1 {
// merge with previous range
if hi > r[w-1] {
r[w-1] = hi
}
continue
}
// new disjoint range
r[w] = lo
r[w+1] = hi
w += 2
}
return r[:w]
}
// inCharClass reports whether r is in the class.
// It assumes the class has been cleaned by cleanClass.
func inCharClass(r rune, class []rune) bool {
_, ok := sort.Find(len(class)/2, func(i int) int {
lo, hi := class[2*i], class[2*i+1]
if r > hi {
return +1
}
if r < lo {
return -1
}
return 0
})
return ok
}
// appendLiteral returns the result of appending the literal x to the class r.
if flags&FoldCase != 0 {
return appendFoldedRange(r, x, x)
}
return appendRange(r, x, x)
}
// appendRange returns the result of appending the range lo-hi to the class r.
// Expand last range or next to last range if it overlaps or abuts.
// Checking two ranges helps when appending case-folded
// alphabets, so that one range can be expanding A-Z and the
// other expanding a-z.
n := len(r)
for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4
if n >= i {
rlo, rhi := r[n-i], r[n-i+1]
if lo <= rhi+1 && rlo <= hi+1 {
if lo < rlo {
r[n-i] = lo
}
if hi > rhi {
r[n-i+1] = hi
}
return r