Implement \u notation for strings and regexes.

This change adds the \u notation for double quoted strings and regular expressions. It does not implement the \u notation for character literals. Both the \uNNNN and \u{NNNN} notations are supported. \uNNNN is implemented by emitting equivalent UTF-8; that is, "\u4000" is equivalent to "\xE4\x80\x80". Unlike CRuby, the \u{NNNN} notation allows only one character per pair of braces; I see no way to lift this restriction without remodeling the parser.

Implement \u notation for strings and regexes.
This change adds the \u notation for double quoted strings and regular expressions. It does not implement the \u notation for character literals. Both the \uNNNN and \u{NNNN} notations are supported. \uNNNN is implemented by emitting equivalent UTF-8; that is, "\u4000" is equivalent to "\xE4\x80\x80". Unlike CRuby, the \u{NNNN} notation allows only one character per pair of braces; I see no way to lift this restriction without remodeling the parser.
509cbc51 · chasonr · b8d7f1ce · 509cbc51 · 509cbc51
Commit 509cbc51 authored Mar 23, 2014 by chasonr
Hide whitespace changes
Inline Side-by-side

Showing with 157 additions and 10 deletions

src/parse.y src/parse.y +97 -10

test/t/unicode.rb test/t/unicode.rb +60 -0

No files found.
--- a/src/parse.y
+++ b/src/parse.y
@@ -40,7 +40,7 @@ static void yyerror(parser_state *p, const char *s);
 static void yywarn(parser_state *p, const char *s);
 static void yywarning(parser_state *p, const char *s);
 static void backref_error(parser_state *p, node *n);
-static void tokadd(parser_state *p, int c);
+static void tokadd(parser_state *p, int32_t c);

 #ifndef isascii
 #define isascii(c) (((c) & ~0x7f) == 0)
@@ -3465,10 +3465,44 @@ newtok(parser_state *p)
 }

 static void
-tokadd(parser_state *p, int c)
+tokadd(parser_state *p, int32_t c)
 {
-  if (p->bidx < MRB_PARSER_BUF_SIZE) {
-    p->buf[p->bidx++] = c;
+  char utf8[4];
+  unsigned len;
+
+  /* mrb_assert(-0x10FFFF <= c && c <= 0xFF); */
+  if (c >= 0) {
+    /* Single byte from source or non-Unicode escape */
+    utf8[0] = (char)c;
+    len = 1;
+  } else {
+    /* Unicode character */
+    c = -c;
+    if (c < 0x80) {
+      utf8[0] = (char)c;
+      len = 1;
+    } else if (c < 0x800) {
+      utf8[0] = (char)(0xC0 | (c >> 6));
+      utf8[1] = (char)(0x80 | (c & 0x3F));
+      len = 2;
+    } else if (c < 0x10000) {
+      utf8[0] = (char)(0xE0 |  (c >> 12)        );
+      utf8[1] = (char)(0x80 | ((c >>  6) & 0x3F));
+      utf8[2] = (char)(0x80 | ( c        & 0x3F));
+      len = 3;
+    } else {
+      utf8[0] = (char)(0xF0 |  (c >> 18)        );
+      utf8[1] = (char)(0x80 | ((c >> 12) & 0x3F));
+      utf8[2] = (char)(0x80 | ((c >>  6) & 0x3F));
+      utf8[3] = (char)(0x80 | ( c        & 0x3F));
+      len = 4;
+    }
+  }
+  if (p->bidx+len <= MRB_PARSER_BUF_SIZE) {
+    unsigned i;
+    for (i = 0; i < len; i++) {
+      p->buf[p->bidx++] = utf8[i];
+    }
  }
 }

@@ -3522,15 +3556,15 @@ scan_oct(const int *start, int len, int *retlen)
  return retval;
 }

-static int
+static int32_t
 scan_hex(const int *start, int len, int *retlen)
 {
  static const char hexdigit[] = "0123456789abcdef0123456789ABCDEF";
  const int *s = start;
-  int retval = 0;
+  int32_t retval = 0;
  char *tmp;

-  /* mrb_assert(len <= 2) */
+  /* mrb_assert(len <= 8) */
  while (len-- && *s && (tmp = (char*)strchr(hexdigit, *s))) {
    retval <<= 4;
    retval |= (tmp - hexdigit) & 15;
@@ -3541,10 +3575,11 @@ scan_hex(const int *start, int len, int *retlen)
  return retval;
 }

-static int
+/* Return negative to indicate Unicode code point */
+static int32_t
 read_escape(parser_state *p)
 {
-  int c;
+  int32_t c;

  switch (c = nextc(p)) {
  case '\\':/* Backslash */
@@ -3611,6 +3646,53 @@ read_escape(parser_state *p)
  }
  return c;

+  case 'u':     /* Unicode */
+  {
+    int buf[9];
+    int i;
+
+    /* Look for opening brace */
+    i = 0;
+    buf[0] = nextc(p);
+    if (buf[0] < 0) goto eof;
+    if (buf[0] == '{') {
+      /* \u{xxxxxxxx} form */
+      for (i=0; i<9; i++) {
+        buf[i] = nextc(p);
+        if (buf[i] < 0) goto eof;
+        if (buf[i] == '}') {
+          break;
+        } else if (!ISXDIGIT(buf[i])) {
+          yyerror(p, "Invalid escape character syntax");
+          pushback(p, buf[i]);
+          return 0;
+        }
+      }
+    } else if (ISXDIGIT(buf[0])) {
+      /* \uxxxx form */
+      for (i=1; i<4; i++) {
+        buf[i] = nextc(p);
+        if (buf[i] < 0) goto eof;
+        if (!ISXDIGIT(buf[i])) {
+          pushback(p, buf[i]);
+          break;
+        }
+      }
+    } else {
+      pushback(p, buf[0]);
+    }
+    c = scan_hex(buf, i, &i);
+    if (i == 0) {
+      yyerror(p, "Invalid escape character syntax");
+      return 0;
+    }
+    if (c < 0 || c > 0x10FFFF || (c & 0xFFFFF800) == 0xD800) {
+      yyerror(p, "Invalid Unicode code point");
+      return 0;
+    }
+  }
+  return -c;
+
  case 'b':/* backspace */
    return '\010';

@@ -3726,9 +3808,14 @@ parse_string(parser_state *p)
        }
        else {
          if (type & STR_FUNC_REGEXP) {
+            if (c == 'u') {
+              pushback(p, c);
+              tokadd(p, read_escape(p));
+            } else {
            tokadd(p, '\\');
            if (c >= 0)
              tokadd(p, c);
+            }
          } else {
            pushback(p, c);
            tokadd(p, read_escape(p));
@@ -3932,7 +4019,7 @@ arg_ambiguous(parser_state *p)
 static int
 parser_yylex(parser_state *p)
 {
-  int c;
+  int32_t c;
  int space_seen = 0;
  int cmd_state;
  enum mrb_lex_state_enum last_state;

--- a/test/t/unicode.rb
+++ b/test/t/unicode.rb
+# Test of the \u notation
+
+assert('bare \u notation test') do
+  # Minimum and maximum one byte characters
+  assert_equal("\u0000", "\x00")
+  assert_equal("\u007F", "\x7F")
+
+  # Minimum and maximum two byte characters
+  assert_equal("\u0080", "\xC2\x80")
+  assert_equal("\u07FF", "\xDF\xBF")
+
+  # Minimum and maximum three byte characters
+  assert_equal("\u0800", "\xE0\xA0\x80")
+  assert_equal("\uFFFF", "\xEF\xBF\xBF")
+
+  # Four byte characters require the braced \u{...} notation
+end
+
+assert('braced \u notation test') do
+  # Minimum and maximum one byte characters
+  assert_equal("\u{0000}", "\x00")
+  assert_equal("\u{007F}", "\x7F")
+
+  # Minimum and maximum two byte characters
+  assert_equal("\u{0080}", "\xC2\x80")
+  assert_equal("\u{07FF}", "\xDF\xBF")
+
+  # Minimum and maximum three byte characters
+  assert_equal("\u{0800}", "\xE0\xA0\x80")
+  assert_equal("\u{FFFF}", "\xEF\xBF\xBF")
+
+  # Minimum and maximum four byte characters
+  assert_equal("\u{10000}",  "\xF0\x90\x80\x80")
+  assert_equal("\u{10FFFF}", "\xF4\x8F\xBF\xBF")
+end
+
+# Test regular expressions only if implemented
+begin
+  Regexp
+  have_regexp = true
+rescue NameError
+  have_regexp = false
+end
+if have_regexp then
+  assert('Testing in regular expressions') do
+    # The regular expression uses the unbraced notation where the string uses
+    # the braced notation, and vice versa, so these tests will fail if the \u
+    # modification is not applied
+
+    # Test of unbraced \u notation in a regular expression
+    assert_false(/\u0300/ =~ "\u{02FF}")
+    assert_true( /\u0300/ =~ "\u{0300}")
+    assert_false(/\u0300/ =~ "\u{0301}")
+
+    # Test of braced \u notation in a regular expression
+    assert_false(/\u{0300}/ =~ "\u02FF")
+    assert_true( /\u{0300}/ =~ "\u0300")
+    assert_false(/\u{0300}/ =~ "\u0301")
+  end
+end