diff --git a/doc/next/6-stdlib/99-minor/regexp/syntax/70781.md b/doc/next/6-stdlib/99-minor/regexp/syntax/70781.md new file mode 100644 index 0000000000000000000000000000000000000000..63794b4671bd4489dd259c422fabfb6a261bd1d7 --- /dev/null +++ b/doc/next/6-stdlib/99-minor/regexp/syntax/70781.md @@ -0,0 +1,4 @@ +The `\p{name}` and `\P{name}` character class syntaxes now accept the names +Any, ASCII, Assigned, Cn, and LC, as well as Unicode category aliases like `\p{Letter}` for `\pL`. +Following [Unicode TR18](https://unicode.org/reports/tr18/), they also now use +case-insensitive name lookups, ignoring spaces, underscores, and hyphens. diff --git a/src/regexp/syntax/doc.go b/src/regexp/syntax/doc.go index 877f1043ddda8b4ead01122d7e9131ceb92fa98c..8a7d9992a2cfb8d8a99e7390062455f26019064c 100644 --- a/src/regexp/syntax/doc.go +++ b/src/regexp/syntax/doc.go @@ -137,6 +137,7 @@ ASCII character classes: [[:word:]] word characters (== [0-9A-Za-z_]) [[:xdigit:]] hex digit (== [0-9A-Fa-f]) -Unicode character classes are those in [unicode.Categories] and [unicode.Scripts]. +Unicode character classes are those in [unicode.Categories], +[unicode.CategoryAliases], and [unicode.Scripts]. */ package syntax diff --git a/src/regexp/syntax/parse.go b/src/regexp/syntax/parse.go index ed239dafdf34710d5be12dcfd8d401212bf6d8c3..b77a7dab8e2e6028e2a220addf227e4dcb31f716 100644 --- a/src/regexp/syntax/parse.go +++ b/src/regexp/syntax/parse.go @@ -7,6 +7,7 @@ package syntax import ( "sort" "strings" + "sync" "unicode" "unicode/utf8" ) @@ -1639,20 +1640,109 @@ var anyTable = &unicode.RangeTable{ R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}}, } +var asciiTable = &unicode.RangeTable{ + R16: []unicode.Range16{{Lo: 0, Hi: 0x7F, Stride: 1}}, +} + +var asciiFoldTable = &unicode.RangeTable{ + R16: []unicode.Range16{ + {Lo: 0, Hi: 0x7F, Stride: 1}, + {Lo: 0x017F, Hi: 0x017F, Stride: 1}, // Old English long s (ſ), folds to S/s. + {Lo: 0x212A, Hi: 0x212A, Stride: 1}, // Kelvin K, folds to K/k. + }, +} + +// categoryAliases is a lazily constructed copy of unicode.CategoryAliases +// but with the keys passed through canonicalName, to support inexact matches. +var categoryAliases struct { + once sync.Once + m map[string]string +} + +// initCategoryAliases initializes categoryAliases by canonicalizing unicode.CategoryAliases. +func initCategoryAliases() { + categoryAliases.m = make(map[string]string) + for name, actual := range unicode.CategoryAliases { + categoryAliases.m[canonicalName(name)] = actual + } +} + +// canonicalName returns the canonical lookup string for name. +// The canonical name has a leading uppercase letter and then lowercase letters, +// and it omits all underscores, spaces, and hyphens. +// (We could have used all lowercase, but this way most package unicode +// map keys are already canonical.) +func canonicalName(name string) string { + var b []byte + first := true + for i := range len(name) { + c := name[i] + switch { + case c == '_' || c == '-' || c == ' ': + c = ' ' + case first: + if 'a' <= c && c <= 'z' { + c -= 'a' - 'A' + } + first = false + default: + if 'A' <= c && c <= 'Z' { + c += 'a' - 'A' + } + } + if b == nil { + if c == name[i] && c != ' ' { + // No changes so far, avoid allocating b. + continue + } + b = make([]byte, i, len(name)) + copy(b, name[:i]) + } + if c == ' ' { + continue + } + b = append(b, c) + } + if b == nil { + return name + } + return string(b) +} + // unicodeTable returns the unicode.RangeTable identified by name // and the table of additional fold-equivalent code points. -func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) { - // Special case: "Any" means any. - if name == "Any" { - return anyTable, anyTable +// If sign < 0, the result should be inverted. +func unicodeTable(name string) (tab, fold *unicode.RangeTable, sign int) { + name = canonicalName(name) + + // Special cases: Any, Assigned, and ASCII. + // Also LC is the only non-canonical Categories key, so handle it here. + switch name { + case "Any": + return anyTable, anyTable, +1 + case "Assigned": + return unicode.Cn, unicode.Cn, -1 // invert Cn (unassigned) + case "Ascii": + return asciiTable, asciiFoldTable, +1 + case "Lc": + return unicode.Categories["LC"], unicode.FoldCategory["LC"], +1 } if t := unicode.Categories[name]; t != nil { - return t, unicode.FoldCategory[name] + return t, unicode.FoldCategory[name], +1 } if t := unicode.Scripts[name]; t != nil { - return t, unicode.FoldScript[name] + return t, unicode.FoldScript[name], +1 + } + + // unicode.CategoryAliases makes liberal use of underscores in its names + // (they are defined that way by Unicode), but we want to match ignoring + // the underscores, so make our own map with canonical names. + categoryAliases.once.Do(initCategoryAliases) + if actual := categoryAliases.m[name]; actual != "" { + t := unicode.Categories[actual] + return t, unicode.FoldCategory[actual], +1 } - return nil, nil + return nil, nil, 0 } // parseUnicodeClass parses a leading Unicode character class like \p{Han} @@ -1700,10 +1790,13 @@ func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string, name = name[1:] } - tab, fold := unicodeTable(name) + tab, fold, tsign := unicodeTable(name) if tab == nil { return nil, "", &Error{ErrInvalidCharRange, seq} } + if tsign < 0 { + sign = -sign + } if p.flags&FoldCase == 0 || fold == nil { if sign > 0 { diff --git a/src/regexp/syntax/parse_test.go b/src/regexp/syntax/parse_test.go index 0f885bd5c8149fa4b4be69a8163c0a618234bb60..9d2f698e258d198aa5a898252ee3efd76b851c8e 100644 --- a/src/regexp/syntax/parse_test.go +++ b/src/regexp/syntax/parse_test.go @@ -107,10 +107,16 @@ var parseTests = []parseTest{ {`[\P{^Braille}]`, `cc{0x2800-0x28ff}`}, {`[\pZ]`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`}, {`\p{Lu}`, mkCharClass(unicode.IsUpper)}, + {`\p{Uppercase_Letter}`, mkCharClass(unicode.IsUpper)}, + {`\p{upper case-let ter}`, mkCharClass(unicode.IsUpper)}, + {`\p{__upper case-let ter}`, mkCharClass(unicode.IsUpper)}, {`[\p{Lu}]`, mkCharClass(unicode.IsUpper)}, {`(?i)[\p{Lu}]`, mkCharClass(isUpperFold)}, {`\p{Any}`, `dot{}`}, {`\p{^Any}`, `cc{}`}, + {`(?i)\p{ascii}`, `cc{0x0-0x7f 0x17f 0x212a}`}, + {`\p{Assigned}`, mkCharClass(func(r rune) bool { return !unicode.In(r, unicode.Cn) })}, + {`\p{^Assigned}`, mkCharClass(func(r rune) bool { return unicode.In(r, unicode.Cn) })}, // Hex, octal. {`[\012-\234]\141`, `cat{cc{0xa-0x9c}lit{a}}`},