folly: utf8ToCodePoint: enforce max valid code point is U+10FFFF - return...

folly: utf8ToCodePoint: enforce max valid code point is U+10FFFF - return U+FFFD / throw for well-formed UTF-8 encoded values that are larger than the max code point Summary: UTF-8 can encode large numbers, but Unicode code points are only defined up to `U+10FFFF`. For example: - the 4B UTF-8 encoding `"\xF6\x8D\x9B\xBC"` (bits: `11110110 10001101 10011011 10111100`) is a valid UTF-8 encoding - but the encoded value is `U+18D6FC`, which is larger than `U+10FFFF`. With `opts.skip_invalid_utf8 = true;` `json::serialize` should have returned `"\ufffd"` since the input is invalid, but due to a bug in `utf8ToCodePoint` it returned the incorrect `"\"\xF6\x8D\x9B\xBC\""`. Update `utf8ToCodePoint` to also reject 4-byte UTF-8 encoded values larger than the max Unicode code point (`U+10FFFF`). Reviewed By: luciang Differential Revision: D25275722 fbshipit-source-id: e7daeea834a0c5323923a5451a2565ceff5e4734

folly: utf8ToCodePoint: enforce max valid code point is U+10FFFF - return...
folly: utf8ToCodePoint: enforce max valid code point is U+10FFFF - return U+FFFD / throw for well-formed UTF-8 encoded values that are larger than the max code point Summary: UTF-8 can encode large numbers, but Unicode code points are only defined up to `U+10FFFF`. For example: - the 4B UTF-8 encoding `"\xF6\x8D\x9B\xBC"` (bits: `11110110 10001101 10011011 10111100`) is a valid UTF-8 encoding - but the encoded value is `U+18D6FC`, which is larger than `U+10FFFF`. With `opts.skip_invalid_utf8 = true;` `json::serialize` should have returned `"\ufffd"` since the input is invalid, but due to a bug in `utf8ToCodePoint` it returned the incorrect `"\"\xF6\x8D\x9B\xBC\""`. Update `utf8ToCodePoint` to also reject 4-byte UTF-8 encoded values larger than the max Unicode code point (`U+10FFFF`). Reviewed By: luciang Differential Revision: D25275722 fbshipit-source-id: e7daeea834a0c5323923a5451a2565ceff5e4734
ae03ef83 · Xiayi Sun · Facebook GitHub Bot · 67f20f29 · ae03ef83 · ae03ef83
Commit ae03ef83 authored Dec 04, 2020 by Xiayi Sun Committed by Facebook GitHub Bot Dec 04, 2020
Hide whitespace changes
Inline Side-by-side

Showing with 55 additions and 12 deletions

folly/Unicode.cpp folly/Unicode.cpp +43 -12

folly/test/JsonTest.cpp folly/test/JsonTest.cpp +8 -0

folly/test/UnicodeTest.cpp folly/test/UnicodeTest.cpp +4 -0

No files found.
--- a/folly/Unicode.cpp
+++ b/folly/Unicode.cpp
@@ -53,15 +53,30 @@ char32_t utf8ToCodePoint(
    const unsigned char*& p,
    const unsigned char* const e,
    bool skipOnError) {
-  /* The following encodings are valid, except for the 5 and 6 byte
-   * combinations:
-   * 0xxxxxxx
-   * 110xxxxx 10xxxxxx
-   * 1110xxxx 10xxxxxx 10xxxxxx
-   * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-   * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
-   * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
-   */
+  // clang-format off
+  /** UTF encodings
+  *  | # of B | First CP |  Last CP  | Bit Pattern
+  *  |   1    |   0x0000 |   0x007F  | 0xxxxxxx
+  *  |   2    |   0x0080 |   0x07FF  | 110xxxxx 10xxxxxx
+  *  |   3    |   0x0800 |   0xFFFF  | 1110xxxx 10xxxxxx 10xxxxxx
+  *  |   4    |  0x10000 | 0x10FFFF  | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+  *  |   5    |       -  |        -  | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+  *  |   6    |       -  |        -  | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+  *
+  *
+  * NOTE:
+  * - the 4B encoding can encode values up to 0x1FFFFF,
+  *   but Unicode defines 0x10FFFF to be the largest code point
+  * - the 5B & 6B encodings will all encode values larger than 0x1FFFFF
+  *   (so larger than the largest code point value 0x10FFFF) so they form invalid
+  *   unicode code points
+  *
+  * On invalid input (invalid encoding or code points larger than 0x10FFFF):
+  * - When skipOnError is true, this function will skip the first byte and return
+  *   U'\ufffd'. Potential optimization: skip the whole invalid range.
+  * - When skipOnError is false, throws.
+  */
+  // clang-format on

  const auto skip = [&] {
    ++p;
@@ -77,7 +92,7 @@ char32_t utf8ToCodePoint(

  unsigned char fst = *p;
  if (!(fst & 0x80)) {
-    // trivial case
+    // trivial case, 1 byte encoding
    return *p++;
  }

@@ -91,6 +106,7 @@ char32_t utf8ToCodePoint(
  // upper control bits are masked out later
  uint32_t d = fst;

+  // multi-byte encoded values must start with bits 0b11. 0xC0 is 0b11000000
  if ((fst & 0xC0) != 0xC0) {
    if (skipOnError) {
      return skip();
@@ -104,6 +120,7 @@ char32_t utf8ToCodePoint(
  for (unsigned int i = 1; i != 4 && p + i < e; ++i) {
    const unsigned char tmp = p[i];

+    // from the second byte on, format should be 10xxxxxx
    if ((tmp & 0xC0) != 0x80) {
      if (skipOnError) {
        return skip();
@@ -112,11 +129,15 @@ char32_t utf8ToCodePoint(
          "folly::utf8ToCodePoint i=", i, " tmp=", (uint32_t)tmp));
    }

+    // gradually fill a 32 bit integer d with non control bits in tmp
+    // 0x3F is 0b00111111 which clears out the first 2 control bits
    d = (d << 6) | (tmp & 0x3F);
    fst <<= 1;

    if (!(fst & 0x80)) {
-      d &= bitMask[i];
+      // We know the length of encoding now, since we encounter the first "0" in
+      // fst (the first byte). This branch processes the last byte of encoding.
+      d &= bitMask[i]; // d is now the code point

      // overlong, could have been encoded with i bytes
      if ((d & ~bitMask[i - 1]) == 0) {
@@ -129,7 +150,7 @@ char32_t utf8ToCodePoint(

      // check for surrogates only needed for 3 bytes
      if (i == 2) {
-        if ((d >= 0xD800 && d <= 0xDFFF) || d > 0x10FFFF) {
+        if (d >= 0xD800 && d <= 0xDFFF) {
          if (skipOnError) {
            return skip();
          }
@@ -138,6 +159,16 @@ char32_t utf8ToCodePoint(
        }
      }

+      // While UTF-8 encoding can encode arbitrary numbers, 0x10FFFF is the
+      // largest defined Unicode code point.
+      // Only >=4 bytes can UTF-8 encode such values, so i=3 here.
+      if (d > 0x10FFFF) {
+        if (skipOnError) {
+          return skip();
+        }
+        throw std::runtime_error(
+            "folly::utf8ToCodePoint encoding exceeds max unicode code point");
+      }
      p += i + 1;
      return d;
    }

--- a/folly/test/JsonTest.cpp
+++ b/folly/test/JsonTest.cpp
@@ -573,6 +573,8 @@ TEST(Json, UTF8Validation) {
  // test validate_utf8 with invalid utf8
  EXPECT_ANY_THROW(folly::json::serialize("a\xe0\xa0\x80z\xc0\x80", opts));
  EXPECT_ANY_THROW(folly::json::serialize("a\xe0\xa0\x80z\xe0\x80\x80", opts));
+  // not a valid unicode because it is larger than the max 0x10FFFF code-point
+  EXPECT_ANY_THROW(folly::json::serialize("\xF6\x8D\x9B\xBC", opts));

  opts.skip_invalid_utf8 = true;
  EXPECT_EQ(
@@ -584,6 +586,12 @@ TEST(Json, UTF8Validation) {
  EXPECT_EQ(
      folly::json::serialize("z\xc0\x80z\xe0\xa0\x80", opts),
      u8"\"z\ufffd\ufffdz\xe0\xa0\x80\"");
+  EXPECT_EQ(
+      folly::json::serialize("\xF6\x8D\x9B\xBC", opts),
+      u8"\"\ufffd\ufffd\ufffd\ufffd\"");
+  EXPECT_EQ(
+      folly::json::serialize("invalid\xF6\x8D\x9B\xBCinbetween", opts),
+      u8"\"invalid\ufffd\ufffd\ufffd\ufffdinbetween\"");

  opts.encode_non_ascii = true;
  EXPECT_EQ(

--- a/folly/test/UnicodeTest.cpp
+++ b/folly/test/UnicodeTest.cpp
@@ -106,6 +106,10 @@ void testInvalid(std::initializer_list<unsigned char> data) {
  }
 }

+TEST(InvalidUtf8ToCodePoint, UnicodeOutOfRangeTest) {
+  testInvalid({0xF4, 0x90, 0x80, 0x80}); // 4-byte sequence whose payload is 0x110000, i.e. U+10FFFF + 1
+}
+
 TEST(InvalidUtf8ToCodePoint, rfc3629Overlong) {
  // https://tools.ietf.org/html/rfc3629
  // Implementations of the decoding algorithm above MUST protect against