regexp/syntax: recognize category aliases like \p{Letter}

The Unicode specification defines aliases for some of the general category names. For example the category "L" has alias "Letter". The regexp package supports \p{L} but not \p{Letter}, because there was nothing in the Unicode tables that lets regexp know about Letter. Now that package unicode provides CategoryAliases (see #70780), we can use it to provide \p{Letter} as well. This is the only feature missing from making package regexp suitable for use in a JSON-API Schema implementation. (The official test suite includes usage of aliases like \p{Letter} instead of \p{L}.) For better conformity with Unicode TR18, also accept case-insensitive matches for names and ignore underscores, hyphens, and spaces; and add Any, ASCII, and Assigned. Fixes #70781. Change-Id: I50ff024d99255338fa8d92663881acb47f1e92a5 Reviewed-on: https://go-review.googlesource.com/c/go/+/641377 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Alan Donovan <adonovan@google.com>

regexp/syntax: recognize category aliases like \p{Letter}
930cf59b · Russ Cox · 28fd9fa8 · 930cf59b · 930cf59b · 930cf59b
Commit 930cf59b authored 4 months ago by Russ Cox
--- a/doc/next/6-stdlib/99-minor/regexp/syntax/70781.md
+++ b/doc/next/6-stdlib/99-minor/regexp/syntax/70781.md
+The `\p{name}` and `\P{name}` character class syntaxes now accept the names
+Any, ASCII, Assigned, Cn, and LC, as well as Unicode category aliases like `\p{Letter}` for `\pL`.
+Following [Unicode TR18](https://unicode.org/reports/tr18/), they also now use
+case-insensitive name lookups, ignoring spaces, underscores, and hyphens.
--- a/src/regexp/syntax/doc.go
+++ b/src/regexp/syntax/doc.go
@@ -137,6 +137,7 @@ ASCII character classes:
 	[[:word:]]     word characters (== [0-9A-Za-z_])
 	[[:xdigit:]]   hex digit (== [0-9A-Fa-f])
-Unicode character classes are those in [unicode.Categories] and [unicode.Scripts].
+Unicode character classes are those in [unicode.Categories],
+[unicode.CategoryAliases], and [unicode.Scripts].
 */
 package syntax
--- a/src/regexp/syntax/parse.go
+++ b/src/regexp/syntax/parse.go
@@ -7,6 +7,7 @@ package syntax
 import (
 	"sort"
 	"strings"
+	"sync"
 	"unicode"
 	"unicode/utf8"
 )
@@ -1639,20 +1640,109 @@ var anyTable = &unicode.RangeTable{
 	R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}},
 }
+var asciiTable = &unicode.RangeTable{
+	R16: []unicode.Range16{{Lo: 0, Hi: 0x7F, Stride: 1}},
+}
+var asciiFoldTable = &unicode.RangeTable{
+	R16: []unicode.Range16{
+		{Lo: 0, Hi: 0x7F, Stride: 1},
+		{Lo: 0x017F, Hi: 0x017F, Stride: 1}, // Old English long s (ſ), folds to S/s.
+		{Lo: 0x212A, Hi: 0x212A, Stride: 1}, // Kelvin K, folds to K/k.
+	},
+}
+// categoryAliases is a lazily constructed copy of unicode.CategoryAliases
+// but with the keys passed through canonicalName, to support inexact matches.
+var categoryAliases struct {
+	once sync.Once
+	m    map[string]string
+}
+// initCategoryAliases initializes categoryAliases by canonicalizing unicode.CategoryAliases.
+func initCategoryAliases() {
+	categoryAliases.m = make(map[string]string)
+	for name, actual := range unicode.CategoryAliases {
+		categoryAliases.m[canonicalName(name)] = actual
+	}
+}
+// canonicalName returns the canonical lookup string for name.
+// The canonical name has a leading uppercase letter and then lowercase letters,
+// and it omits all underscores, spaces, and hyphens.
+// (We could have used all lowercase, but this way most package unicode
+// map keys are already canonical.)
+func canonicalName(name string) string {
+	var b []byte
+	first := true
+	for i := range len(name) {
+		c := name[i]
+		switch {
+		case c == '_' || c == '-' || c == ' ':
+			c = ' '
+		case first:
+			if 'a' <= c && c <= 'z' {
+				c -= 'a' - 'A'
+			}
+			first = false
+		default:
+			if 'A' <= c && c <= 'Z' {
+				c += 'a' - 'A'
+			}
+		}
+		if b == nil {
+			if c == name[i] && c != ' ' {
+				// No changes so far, avoid allocating b.
+				continue
+			}
+			b = make([]byte, i, len(name))
+			copy(b, name[:i])
+		}
+		if c == ' ' {
+			continue
+		}
+		b = append(b, c)
+	}
+	if b == nil {
+		return name
+	}
+	return string(b)
+}
 // unicodeTable returns the unicode.RangeTable identified by name
 // and the table of additional fold-equivalent code points.
-func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) {
+// If sign < 0, the result should be inverted.
-	// Special case: "Any" means any.
+func unicodeTable(name string) (tab, fold *unicode.RangeTable, sign int) {
-	if name == "Any" {
+	name = canonicalName(name)
-		return anyTable, anyTable
+	// Special cases: Any, Assigned, and ASCII.
+	// Also LC is the only non-canonical Categories key, so handle it here.
+	switch name {
+	case "Any":
+		return anyTable, anyTable, +1
+	case "Assigned":
+		return unicode.Cn, unicode.Cn, -1 // invert Cn (unassigned)
+	case "Ascii":
+		return asciiTable, asciiFoldTable, +1
+	case "Lc":
+		return unicode.Categories["LC"], unicode.FoldCategory["LC"], +1
 	}
 	if t := unicode.Categories[name]; t != nil {
-		return t, unicode.FoldCategory[name]
+		return t, unicode.FoldCategory[name], +1
 	}
 	if t := unicode.Scripts[name]; t != nil {
-		return t, unicode.FoldScript[name]
+		return t, unicode.FoldScript[name], +1
+	}
+	// unicode.CategoryAliases makes liberal use of underscores in its names
+	// (they are defined that way by Unicode), but we want to match ignoring
+	// the underscores, so make our own map with canonical names.
+	categoryAliases.once.Do(initCategoryAliases)
+	if actual := categoryAliases.m[name]; actual != "" {
+		t := unicode.Categories[actual]
+		return t, unicode.FoldCategory[actual], +1
 	}
-	return nil, nil
+	return nil, nil, 0
 }
 // parseUnicodeClass parses a leading Unicode character class like \p{Han}
@@ -1700,10 +1790,13 @@ func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string,
 		name = name[1:]
 	}
-	tab, fold := unicodeTable(name)
+	tab, fold, tsign := unicodeTable(name)
 	if tab == nil {
 		return nil, "", &Error{ErrInvalidCharRange, seq}
 	}
+	if tsign < 0 {
+		sign = -sign
+	}
 	if p.flags&FoldCase == 0 || fold == nil {
 		if sign > 0 {

--- a/src/regexp/syntax/parse_test.go
+++ b/src/regexp/syntax/parse_test.go
@@ -107,10 +107,16 @@ var parseTests = []parseTest{
 	{`[\P{^Braille}]`, `cc{0x2800-0x28ff}`},
 	{`[\pZ]`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`},
 	{`\p{Lu}`, mkCharClass(unicode.IsUpper)},
+	{`\p{Uppercase_Letter}`, mkCharClass(unicode.IsUpper)},
+	{`\p{upper case-let ter}`, mkCharClass(unicode.IsUpper)},
+	{`\p{__upper case-let ter}`, mkCharClass(unicode.IsUpper)},
 	{`[\p{Lu}]`, mkCharClass(unicode.IsUpper)},
 	{`(?i)[\p{Lu}]`, mkCharClass(isUpperFold)},
 	{`\p{Any}`, `dot{}`},
 	{`\p{^Any}`, `cc{}`},
+	{`(?i)\p{ascii}`, `cc{0x0-0x7f 0x17f 0x212a}`},
+	{`\p{Assigned}`, mkCharClass(func(r rune) bool { return !unicode.In(r, unicode.Cn) })},
+	{`\p{^Assigned}`, mkCharClass(func(r rune) bool { return unicode.In(r, unicode.Cn) })},
 	// Hex, octal.
 	{`[\012-\234]\141`, `cat{cc{0xa-0x9c}lit{a}}`},