folly: unicode: support 4 byte unicode code points

Summary: Support 4-byte UTF-8 strings. https://en.wikipedia.org/wiki/UTF-8 | Number of bytes | Bits for code point | First code point | Last code point | Byte 1 | Byte 2 | Byte 3 | Byte 4 | 1 | 7 | U+0000 | U+007F | 0xxxxxxx | 2 | 11 | U+0080 | U+07FF | 110xxxxx | 10xxxxxx | 3 | 16 | U+0800 | U+FFFF | 1110xxxx | 10xxxxxx | 10xxxxxx | 4 | 21 | U+10000 | U+10FFFF | 11110xxx | 10xxxxxx | 10xxxxxx | 10xxxxxx `utf8ToCodePoint` now correctly returns the code point for 4-byte UTF-8 strings. The JSON standard (http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-404.pdf) says: > Any code point may be represented as a hexadecimal escape sequence. The meaning of such a hexadecimal number is determined by ISO/IEC 10646. If the code point is in the Basic Multilingual Plane (U+0000 through U+FFFF), then it may be represented as a six-character sequence: a reverse solidus, followed by the lowercase letter u, followed by four hexadecimal digits that encode the code point. Hexadecimal digits can be digits (U+0030 through U+0039) or the hexadecimal letters A through F in uppercase (U+0041 through U+0046) or lowercase (U+0061 through U+0066). This was the behavior implemented for 2-3-byte UTF-8 strings, which all fit in the U+0000 through U+FFFF range (see the table above). > To escape a code point that is not in the Basic Multilingual Plane, the character may be represented as a twelve-character sequence, encoding the UTF-16 surrogate pair corresponding to the code point. So for example, a string containing only the G clef character (U+1D11E) may be represented as "\uD834\uDD1E". However, whether a processor of JSON texts interprets such a surrogate pair as a single code point or as an explicit surrogate pair is a semantic decision that is determined by the specific processor. When `encode_non_ascii == true` we now also support encoding 4-byte UTF-8 strings as a UTF-16 surrogate pair (two `\uXXXX` escapes). Reviewed By: ot Differential Revision: D9357479 fbshipit-source-id: 5c47bee4e71d5888b8264d8d3524d2084769adb0

folly: unicode: support 4 byte unicode code points
Summary: Support 4-byte UTF-8 strings. https://en.wikipedia.org/wiki/UTF-8 | Number of bytes | Bits for code point | First code point | Last code point | Byte 1 | Byte 2 | Byte 3 | Byte 4 | 1 | 7 | U+0000 | U+007F | 0xxxxxxx | 2 | 11 | U+0080 | U+07FF | 110xxxxx | 10xxxxxx | 3 | 16 | U+0800 | U+FFFF | 1110xxxx | 10xxxxxx | 10xxxxxx | 4 | 21 | U+10000 | U+10FFFF | 11110xxx | 10xxxxxx | 10xxxxxx | 10xxxxxx `utf8ToCodePoint` now correctly returns the code point for 4-byte UTF-8 strings. The JSON standard (http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-404.pdf) says: > Any code point may be represented as a hexadecimal escape sequence. The meaning of such a hexadecimal number is determined by ISO/IEC 10646. If the code point is in the Basic Multilingual Plane (U+0000 through U+FFFF), then it may be represented as a six-character sequence: a reverse solidus, followed by the lowercase letter u, followed by four hexadecimal digits that encode the code point. Hexadecimal digits can be digits (U+0030 through U+0039) or the hexadecimal letters A through F in uppercase (U+0041 through U+0046) or lowercase (U+0061 through U+0066). This was the behavior implemented for 2-3-byte UTF-8 strings, which all fit in the U+0000 through U+FFFF range (see the table above). > To escape a code point that is not in the Basic Multilingual Plane, the character may be represented as a twelve-character sequence, encoding the UTF-16 surrogate pair corresponding to the code point. So for example, a string containing only the G clef character (U+1D11E) may be represented as "\uD834\uDD1E". However, whether a processor of JSON texts interprets such a surrogate pair as a single code point or as an explicit surrogate pair is a semantic decision that is determined by the specific processor. When `encode_non_ascii == true` we now also support encoding 4-byte UTF-8 strings as a UTF-16 surrogate pair (two `\uXXXX` escapes). Reviewed By: ot Differential Revision: D9357479 fbshipit-source-id: 5c47bee4e71d5888b8264d8d3524d2084769adb0
07a29570 · Lucian Grijincu · Facebook Github Bot · 8fdcdd4c · 07a29570 · 07a29570
Commit 07a29570 authored Aug 20, 2018 by Lucian Grijincu Committed by Facebook Github Bot Aug 20, 2018
5 changed files
--- a/folly/Unicode.cpp
+++ b/folly/Unicode.cpp
@@ -101,7 +101,7 @@ char32_t utf8ToCodePoint(
  fst <<= 1;
-  for (unsigned int i = 1; i != 3 && p + i < e; ++i) {
+  for (unsigned int i = 1; i != 4 && p + i < e; ++i) {
    const unsigned char tmp = p[i];
    if ((tmp & 0xC0) != 0x80) {

--- a/folly/json.cpp
+++ b/folly/json.cpp
@@ -718,13 +718,31 @@ void escapeString(
    if (opts.encode_non_ascii && (*p & 0x80)) {
      // note that this if condition captures utf8 chars
      // with value > 127, so size > 1 byte
-      char32_t v = utf8ToCodePoint(p, e, opts.skip_invalid_utf8);
+      // NOTE: char32_t / char16_t are both unsigned.
+      char32_t cp = utf8ToCodePoint(p, e, opts.skip_invalid_utf8);
+      auto writeHex = [&](char16_t v) {
        char buf[] = "\\u\0\0\0\0";
-      buf[2] = hexDigit(uint8_t(v >> 12));
+        buf[2] = hexDigit((v >> 12) & 0x0f);
        buf[3] = hexDigit((v >> 8) & 0x0f);
        buf[4] = hexDigit((v >> 4) & 0x0f);
        buf[5] = hexDigit(v & 0x0f);
        out.append(buf, 6);
+      };
+      // From the ECMA-404 The JSON Data Interchange Syntax 2nd Edition Dec 2017
+      if (cp < 0x10000u) {
+        // If the code point is in the Basic Multilingual Plane (U+0000 through
+        // U+FFFF), then it may be represented as a six-character sequence:
+        // a reverse solidus, followed by the lowercase letter u, followed by
+        // four hexadecimal digits that encode the code point.
+        writeHex(static_cast<char16_t>(cp));
+      } else {
+        // To escape a code point that is not in the Basic Multilingual Plane,
+        // the character may be represented as a twelve-character sequence,
+        // encoding the UTF-16 surrogate pair corresponding to the code point.
+        writeHex(static_cast<char16_t>(
+            0xd800u + (((cp - 0x10000u) >> 10) & 0x3ffu)));
+        writeHex(static_cast<char16_t>(0xdc00u + ((cp - 0x10000u) & 0x3ffu)));
+      }
    } else if (*p == '\\' || *p == '\"') {
      char buf[] = "\\\0";
      buf[1] = char(*p++);

--- a/folly/json.h
+++ b/folly/json.h
@@ -86,7 +86,9 @@ struct serialization_opts {
  // try to be minimally "pretty".
  bool pretty_formatting;
-  // If true, non-ASCII utf8 characters would be encoded as \uXXXX.
+  // If true, non-ASCII utf8 characters would be encoded as \uXXXX:
+  // - if the code point is in [U+0000..U+FFFF] => encode as a single \uXXXX
+  // - if the code point is > U+FFFF => encode as a UTF-16 surrogate pair.
  bool encode_non_ascii;
  // Check that strings are valid utf8

--- a/folly/test/JsonTest.cpp
+++ b/folly/test/JsonTest.cpp
@@ -299,8 +299,13 @@ TEST(Json, JsonNonAsciiEncoding) {
  EXPECT_ANY_THROW(folly::json::serialize("\xc0\x80", opts));
  EXPECT_ANY_THROW(folly::json::serialize("\xe0\x80\x80", opts));
-  // Longer than 3 byte encodings
+  // Allow 4 byte encodings, escape using a UTF-16 surrogate pair.
-  EXPECT_ANY_THROW(folly::json::serialize("\xf4\x8f\xbf\xbf", opts));
+  // "\xf0\x9f\x8d\x80" is Unicode Character 'FOUR LEAF CLOVER' (U+1F340)
+  // >>> json.dumps({"a": u"\U0001F340"})
+  // '{"a": "\\ud83c\\udf40"}'
+  EXPECT_EQ(
+      folly::json::serialize("\xf0\x9f\x8d\x80", opts), R"("\ud83c\udf40")");
+  // A surrogate pair written as two 3-byte sequences is still rejected
  EXPECT_ANY_THROW(folly::json::serialize("\xed\xaf\xbf\xed\xbf\xbf", opts));
 }

--- a/folly/test/UnicodeTest.cpp
+++ b/folly/test/UnicodeTest.cpp
+/*
+ * Copyright 2018-present Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <folly/Unicode.h>
+#include <initializer_list>
+#include <stdexcept>
+#include <folly/Range.h>
+#include <folly/portability/GTest.h>
+using folly::utf8ToCodePoint;
+// Runs utf8ToCodePoint over the bytes in `data`, once with skipOnError off and
+// once with it on, and expects `expected` to come back both times. The raw
+// bytes are streamed into the failure message.
+void testValid(std::initializer_list<unsigned char> data, char32_t expected) {
+  const folly::StringPiece rawBytes(
+      reinterpret_cast<const char*>(data.begin()),
+      reinterpret_cast<const char*>(data.end()));
+  for (const bool skipOnError : {false, true}) {
+    // Fresh pointers per mode: the decoder receives them as local lvalues.
+    const unsigned char* cursor = data.begin();
+    const unsigned char* limit = data.end();
+    EXPECT_EQ(utf8ToCodePoint(cursor, limit, skipOnError), expected)
+        << rawBytes;
+  }
+}
+// Expects utf8ToCodePoint to reject the bytes in `data`: with skipOnError off
+// the call must throw std::runtime_error, with it on the call must return
+// 0xfffd (U+FFFD, the Unicode replacement character). The raw bytes are
+// streamed into the failure message.
+void testInvalid(std::initializer_list<unsigned char> data) {
+  const folly::StringPiece rawBytes(
+      reinterpret_cast<const char*>(data.begin()),
+      reinterpret_cast<const char*>(data.end()));
+  // Each invocation decodes from the start of `data` with fresh pointers.
+  const auto decode = [&data](bool skipOnError) {
+    const unsigned char* cursor = data.begin();
+    const unsigned char* limit = data.end();
+    return utf8ToCodePoint(cursor, limit, skipOnError);
+  };
+  EXPECT_THROW(decode(/* skipOnError */ false), std::runtime_error)
+      << rawBytes;
+  EXPECT_EQ(decode(/* skipOnError */ true), 0xfffd) << rawBytes;
+}
+TEST(InvalidUtf8ToCodePoint, rfc3629Overlong) {
+  // Quoted from RFC 3629, https://tools.ietf.org/html/rfc3629 :
+  // Implementations of the decoding algorithm above MUST protect against
+  // decoding invalid sequences.  For instance, a naive implementation may
+  // decode the overlong UTF-8 sequence C0 80 into the character U+0000 [...]
+  // Decoding invalid sequences may have security consequences or cause other
+  // problems.
+  testInvalid({0xC0, 0x80}); // the overlong form of U+0000 quoted above
+}
+TEST(InvalidUtf8ToCodePoint, rfc3629SurrogatePair) {
+  // Quoted from RFC 3629, https://tools.ietf.org/html/rfc3629 :
+  // Implementations of the decoding algorithm above MUST protect against
+  // decoding invalid sequences.  For instance, a naive implementation may
+  // decode [...] the surrogate pair ED A1 8C ED BE B4 into U+233B4.
+  // Decoding invalid sequences may have security consequences or cause other
+  // problems.
+  testInvalid({0xED, 0xA1, 0x8C, 0xED, 0xBE, 0xB4}); // U+D84C U+DFB4, 3 bytes each
+}
+TEST(InvalidUtf8ToCodePoint, MarkusKuhnSingleUTF16Surrogates) {
+  // Section 5.1 of https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
+  // 5.1.1  U+D800 = ed a0 80
+  // 5.1.2  U+DB7F = ed ad bf
+  // 5.1.3  U+DB80 = ed ae 80
+  // 5.1.4  U+DBFF = ed af bf
+  // 5.1.5  U+DC00 = ed b0 80
+  // 5.1.6  U+DF80 = ed be 80
+  // 5.1.7  U+DFFF = ed bf bf
+  testInvalid({0xed, 0xa0, 0x80}); // 5.1.1: lone high surrogate (first one)
+  testInvalid({0xed, 0xad, 0xbf}); // 5.1.2: lone high surrogate
+  testInvalid({0xed, 0xae, 0x80}); // 5.1.3: lone high surrogate
+  testInvalid({0xed, 0xaf, 0xbf}); // 5.1.4: lone high surrogate (last one)
+  testInvalid({0xed, 0xb0, 0x80}); // 5.1.5: lone low surrogate (first one)
+  testInvalid({0xed, 0xbe, 0x80}); // 5.1.6: lone low surrogate
+  testInvalid({0xed, 0xbf, 0xbf}); // 5.1.7: lone low surrogate (last one)
+}
+TEST(InvalidUtf8ToCodePoint, MarkusKuhnPairedUTF16Surrogates) {
+  // Section 5.2 of https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
+  // 5.2.1  U+D800 U+DC00 = ed a0 80 ed b0 80
+  // 5.2.2  U+D800 U+DFFF = ed a0 80 ed bf bf
+  // 5.2.3  U+DB7F U+DC00 = ed ad bf ed b0 80
+  // 5.2.4  U+DB7F U+DFFF = ed ad bf ed bf bf
+  // 5.2.5  U+DB80 U+DC00 = ed ae 80 ed b0 80
+  // 5.2.6  U+DB80 U+DFFF = ed ae 80 ed bf bf
+  // 5.2.7  U+DBFF U+DC00 = ed af bf ed b0 80
+  // 5.2.8  U+DBFF U+DFFF = ed af bf ed bf bf
+  testInvalid({0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80}); // 5.2.1
+  testInvalid({0xed, 0xa0, 0x80, 0xed, 0xbf, 0xbf}); // 5.2.2
+  testInvalid({0xed, 0xad, 0xbf, 0xed, 0xb0, 0x80}); // 5.2.3
+  testInvalid({0xed, 0xad, 0xbf, 0xed, 0xbf, 0xbf}); // 5.2.4
+  testInvalid({0xed, 0xae, 0x80, 0xed, 0xb0, 0x80}); // 5.2.5
+  testInvalid({0xed, 0xae, 0x80, 0xed, 0xbf, 0xbf}); // 5.2.6
+  testInvalid({0xed, 0xaf, 0xbf, 0xed, 0xb0, 0x80}); // 5.2.7
+  testInvalid({0xed, 0xaf, 0xbf, 0xed, 0xbf, 0xbf}); // 5.2.8
+}
+TEST(ValidUtf8ToCodePoint, FourCloverLeaf) {
+  testValid({0xF0, 0x9F, 0x8D, 0x80}, 0x1F340); // 4-byte u8"\U0001F340"
+}
+TEST(InvalidUtf8ToCodePoint, FourCloverLeafAsSurrogates) {
+  testInvalid({0xd8, 0x3c, 0xdf, 0x40}); // U+1F340 as UTF-16BE bytes, not UTF-8
+}
+TEST(ValidUtf8ToCodePoint, LastCodePoint) {
+  testValid({0xF4, 0x8F, 0xBF, 0xBF}, 0x10FFFF); // 4-byte u8"\U0010FFFF"
+}