Extract utf-16 surrogate-pair helpers

Summary: [Folly] Extract utf-16 surrogate-pair helpers from `folly/json.cpp` into `folly/Unicode.h`. Reviewed By: luciang Differential Revision: D21634400 fbshipit-source-id: 64d15e79fe19cce5f5c9c38837d4663209b5f888

Extract utf-16 surrogate-pair helpers
Summary: [Folly] Extract utf-16 surrogate-pair helpers from `folly/json.cpp` into `folly/Unicode.h`. Reviewed By: luciang Differential Revision: D21634400 fbshipit-source-id: 64d15e79fe19cce5f5c9c38837d4663209b5f888
49fd581d · Yedidya Feldblum · Facebook GitHub Bot · 44ff9b50 · 49fd581d · 49fd581d
Commit 49fd581d authored May 23, 2020 by Yedidya Feldblum Committed by Facebook GitHub Bot May 23, 2020
Hide whitespace changes
Inline Side-by-side

Showing with 110 additions and 11 deletions

folly/Unicode.h folly/Unicode.h +52 -0

folly/json.cpp folly/json.cpp +11 -11

folly/test/UnicodeTest.cpp folly/test/UnicodeTest.cpp +47 -0

No files found.
--- a/folly/Unicode.h
+++ b/folly/Unicode.h
@@ -18,10 +18,62 @@

 #pragma once

+#include <cstdint>
+#include <stdexcept>
 #include <string>

+#include <folly/lang/Exception.h>
+
 namespace folly {

+class FOLLY_EXPORT unicode_error : public std::runtime_error {
+ public: //  public base: catchable as std::runtime_error / std::exception
+  using std::runtime_error::runtime_error; //  inherit the message ctors
+};
+
+//  Unicode code points are split into 17 planes.
+//
+//  The Basic Multilingual Plane covers code points in [0-0xFFFF] but reserves
+//  two invalid ranges:
+//  - High surrogates: [0xD800-0xDBFF].
+//  - Low surrogates: [0xDC00-0xDFFF].
+//
+//  UTF-16 code units are 2 bytes wide and are represented here with char16_t.
+//  Unicode code points are represented in UTF-16 as either 1 or 2 code units:
+//  - Valid BMP code points [0x0000-0xD7FF] + [0xE000-0xFFFF] are encoded
+//    directly as 1 code unit.
+//  - Code points larger than BMP (>0xFFFF) are encoded as 2 code units, with
+//    values respectively in the high surrogates and low surrogates ranges.
+//
+//  JSON text permits the inclusion of Unicode escape sequences within quoted
+//  strings:
+//  - Valid BMP code points are encoded as \uXXXX, where XXXX are the base-16
+//    digits of the code point.
+//  - Code points larger than BMP are encoded as \uHHHH\uLLLL, where HHHH and
+//    LLLL are respectively the base-16 digits of the high and low surrogates of
+//    the UTF-16 encoding of the code point.
+
+inline bool utf16_code_unit_is_bmp(char16_t const c) {
+  return (c & 0xf800) != 0xd800; //  top 5 bits 11011 <=> [0xd800, 0xdfff]
+}
+inline bool utf16_code_unit_is_high_surrogate(char16_t const c) {
+  return (c & 0xfc00) == 0xd800; //  top 6 bits 110110 <=> [0xd800, 0xdbff]
+}
+inline bool utf16_code_unit_is_low_surrogate(char16_t const c) {
+  return (c & 0xfc00) == 0xdc00; //  top 6 bits 110111 <=> [0xdc00, 0xdfff]
+}
+inline char32_t unicode_code_point_from_utf16_surrogate_pair(
+    char16_t const high,
+    char16_t const low) {
+  if (!utf16_code_unit_is_high_surrogate(high)) {
+    throw_exception<unicode_error>("invalid high surrogate");
+  } else if (!utf16_code_unit_is_low_surrogate(low)) {
+    throw_exception<unicode_error>("invalid low surrogate");
+  }
+  char32_t const payload = (char32_t(high & 0x3ff) << 10) | (low & 0x3ff);
+  return 0x10000 + payload; //  10 bits from each unit, offset past the BMP
+}
+
 //////////////////////////////////////////////////////////////////////

 /*

--- a/folly/json.cpp
+++ b/folly/json.cpp
@@ -558,24 +558,24 @@ std::string decodeUnicodeEscape(Input& in) {
    return ret;
  };

-  /*
-   * If the value encoded is in the surrogate pair range, we need to
-   * make sure there is another escape that we can use also.
-   */
-  uint32_t codePoint = readHex();
-  if (codePoint >= 0xd800 && codePoint <= 0xdbff) {
+  //  If the value encoded is in the surrogate pair range, we need to make
+  //  sure there is another escape that we can use also.
+  //
+  //  See the explanation in folly/Unicode.h.
+  uint16_t prefix = readHex();
+  char32_t codePoint = prefix;
+  if (utf16_code_unit_is_high_surrogate(prefix)) {
    if (!in.consume("\\u")) {
      in.error(
          "expected another unicode escape for second half of "
          "surrogate pair");
    }
-    uint16_t second = readHex();
-    if (second >= 0xdc00 && second <= 0xdfff) {
-      codePoint = 0x10000 + ((codePoint & 0x3ff) << 10) + (second & 0x3ff);
-    } else {
+    uint16_t suffix = readHex();
+    if (!utf16_code_unit_is_low_surrogate(suffix)) {
      in.error("second character in surrogate pair is invalid");
    }
-  } else if (codePoint >= 0xdc00 && codePoint <= 0xdfff) {
+    codePoint = unicode_code_point_from_utf16_surrogate_pair(prefix, suffix);
+  } else if (!utf16_code_unit_is_bmp(prefix)) {
    in.error("invalid unicode code point (in range [0xdc00,0xdfff])");
  }


--- a/folly/test/UnicodeTest.cpp
+++ b/folly/test/UnicodeTest.cpp
@@ -24,6 +24,53 @@

 using folly::utf8ToCodePoint;

+struct UnicodeTest : testing::Test {}; //  empty fixture used by the TEST_Fs
+
+TEST_F(UnicodeTest, utf16_code_unit_is_bmp) { //  true outside [0xd800,0xdfff]
+  EXPECT_TRUE(folly::utf16_code_unit_is_bmp(0x0000));
+  EXPECT_TRUE(folly::utf16_code_unit_is_bmp(0x0041));
+  EXPECT_TRUE(folly::utf16_code_unit_is_bmp(0xd7ff)); //  last unit below range
+  EXPECT_FALSE(folly::utf16_code_unit_is_bmp(0xd800)); //  first high surrogate
+  EXPECT_FALSE(folly::utf16_code_unit_is_bmp(0xdbff)); //  last high surrogate
+  EXPECT_FALSE(folly::utf16_code_unit_is_bmp(0xdc00)); //  first low surrogate
+  EXPECT_FALSE(folly::utf16_code_unit_is_bmp(0xdfff)); //  last low surrogate
+  EXPECT_TRUE(folly::utf16_code_unit_is_bmp(0xe000)); //  first unit above range
+  EXPECT_TRUE(folly::utf16_code_unit_is_bmp(0xffff));
+}
+
+TEST_F(UnicodeTest, utf16_code_unit_is_high_surrogate) { //  [0xd800,0xdbff]
+  EXPECT_FALSE(folly::utf16_code_unit_is_high_surrogate(0x0000));
+  EXPECT_FALSE(folly::utf16_code_unit_is_high_surrogate(0x0041));
+  EXPECT_FALSE(folly::utf16_code_unit_is_high_surrogate(0xd7ff)); //  just below
+  EXPECT_TRUE(folly::utf16_code_unit_is_high_surrogate(0xd800)); //  lower bound
+  EXPECT_TRUE(folly::utf16_code_unit_is_high_surrogate(0xdbff)); //  upper bound
+  EXPECT_FALSE(folly::utf16_code_unit_is_high_surrogate(0xdc00)); //  just above
+  EXPECT_FALSE(folly::utf16_code_unit_is_high_surrogate(0xdfff));
+  EXPECT_FALSE(folly::utf16_code_unit_is_high_surrogate(0xe000));
+  EXPECT_FALSE(folly::utf16_code_unit_is_high_surrogate(0xffff));
+}
+
+TEST_F(UnicodeTest, utf16_code_unit_is_low_surrogate) { //  [0xdc00,0xdfff]
+  EXPECT_FALSE(folly::utf16_code_unit_is_low_surrogate(0x0000));
+  EXPECT_FALSE(folly::utf16_code_unit_is_low_surrogate(0x0041));
+  EXPECT_FALSE(folly::utf16_code_unit_is_low_surrogate(0xd7ff));
+  EXPECT_FALSE(folly::utf16_code_unit_is_low_surrogate(0xd800));
+  EXPECT_FALSE(folly::utf16_code_unit_is_low_surrogate(0xdbff)); //  just below
+  EXPECT_TRUE(folly::utf16_code_unit_is_low_surrogate(0xdc00)); //  lower bound
+  EXPECT_TRUE(folly::utf16_code_unit_is_low_surrogate(0xdfff)); //  upper bound
+  EXPECT_FALSE(folly::utf16_code_unit_is_low_surrogate(0xe000)); //  just above
+  EXPECT_FALSE(folly::utf16_code_unit_is_low_surrogate(0xffff));
+}
+
+TEST_F(UnicodeTest, codePointCombineUtf16SurrogatePair) {
+  EXPECT_THROW( //  0x0041 is not a high surrogate, so the first check throws
+      folly::unicode_code_point_from_utf16_surrogate_pair(0x0041, 0x0041),
+      folly::unicode_error);
+  EXPECT_EQ( //  0x10000 + (0x033 << 10) + 0x033
+      0x1CC33,
+      folly::unicode_code_point_from_utf16_surrogate_pair(0xd833, 0xdc33));
+} //  NOTE(review): the "invalid low surrogate" throw is not exercised here
+
 void testValid(std::initializer_list<unsigned char> data, char32_t expected) {
  {
    const unsigned char* p = data.begin();