Commit 49fd581d authored by Yedidya Feldblum's avatar Yedidya Feldblum Committed by Facebook GitHub Bot

Extract utf-16 surrogate-pair helpers

Summary: [Folly] Extract utf-16 surrogate-pair helpers from `folly/json.cpp` into `folly/Unicode.h`.

Reviewed By: luciang

Differential Revision: D21634400

fbshipit-source-id: 64d15e79fe19cce5f5c9c38837d4663209b5f888
parent 44ff9b50
...@@ -18,10 +18,62 @@ ...@@ -18,10 +18,62 @@
#pragma once #pragma once
#include <cstdint>
#include <stdexcept>
#include <string> #include <string>
#include <folly/lang/Exception.h>
namespace folly { namespace folly {
class FOLLY_EXPORT unicode_error : std::runtime_error {
public:
using std::runtime_error::runtime_error;
};
// Unicode code points are split into 17 planes.
//
// The Basic Multilingual Plane covers code points in [0-0xFFFF] but reserves
// two invalid ranges:
// - High surrogates: [0xD800-0xDBFF].
// - Low surrogates: [0xDC00-0xDFFF].
//
// UTF-16 code units are 2 bytes wide and are represented here with char16_t.
// Unicode code points are represented in UTF-16 using either 1 or 2 code units:
// - Valid BMP code points [0x0000-0xD7FF] + [0xE000-0xFFFF] are encoded
// directly as 1 code unit.
// - Code points larger than BMP (>0xFFFF) are encoded as 2 code units, with
// values respectively in the high surrogates and low surrogates ranges.
//
// JSON text permits the inclusion of Unicode escape sequences within quoted
// strings:
// - Valid BMP code points are encoded as \uXXXX, where XXXX are the base-16
// digits of the code point.
// - Code points larger than BMP are encoded as \uHHHH\uLLLL, where HHHH and
// LLLL are respectively the base-16 digits of the high and low surrogates of
// the UTF-16 encoding of the code point.
// Returns true when the UTF-16 code unit `c` is not a surrogate, i.e. when it
// lies outside [0xd800, 0xdfff] and therefore stands for a BMP code point on
// its own.
inline bool utf16_code_unit_is_bmp(char16_t const c) {
  bool const is_surrogate = 0xd800 <= c && c <= 0xdfff;
  return !is_surrogate;
}
// Returns true when the UTF-16 code unit `c` is a high (leading) surrogate,
// i.e. lies in [0xd800, 0xdbff].
inline bool utf16_code_unit_is_high_surrogate(char16_t const c) {
  constexpr char16_t first_high = 0xd800;
  constexpr char16_t last_high = 0xdbff;
  return first_high <= c && c <= last_high;
}
// Returns true when the UTF-16 code unit `c` is a low (trailing) surrogate,
// i.e. lies in [0xdc00, 0xdfff].
inline bool utf16_code_unit_is_low_surrogate(char16_t const c) {
  constexpr char16_t first_low = 0xdc00;
  constexpr char16_t last_low = 0xdfff;
  return first_low <= c && c <= last_low;
}
// Combines a UTF-16 surrogate pair into the single code point it encodes.
//
// `high` must be a high surrogate and `low` a low surrogate; otherwise a
// unicode_error is raised, with `high` validated first.
inline char32_t unicode_code_point_from_utf16_surrogate_pair(
    char16_t const high,
    char16_t const low) {
  if (!utf16_code_unit_is_high_surrogate(high)) {
    throw_exception<unicode_error>("invalid high surrogate");
  }
  if (!utf16_code_unit_is_low_surrogate(low)) {
    throw_exception<unicode_error>("invalid low surrogate");
  }
  // Each surrogate contributes its low 10 bits: the high surrogate supplies
  // the upper half and the low surrogate the lower half of a 20-bit offset
  // that is added to 0x10000. The two halves occupy disjoint bits, so OR-ing
  // them is the same as adding them.
  char32_t const upper_half = (static_cast<char32_t>(high) & 0x3ff) << 10;
  char32_t const lower_half = static_cast<char32_t>(low) & 0x3ff;
  return 0x10000 + (upper_half | lower_half);
}
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
/* /*
......
...@@ -558,24 +558,24 @@ std::string decodeUnicodeEscape(Input& in) { ...@@ -558,24 +558,24 @@ std::string decodeUnicodeEscape(Input& in) {
return ret; return ret;
}; };
/* // If the value encoded is in the surrogate pair range, we need to make
* If the value encoded is in the surrogate pair range, we need to // sure there is another escape that we can use also.
* make sure there is another escape that we can use also. //
*/ // See the explanation in folly/Unicode.h.
uint32_t codePoint = readHex(); uint16_t prefix = readHex();
if (codePoint >= 0xd800 && codePoint <= 0xdbff) { char32_t codePoint = prefix;
if (utf16_code_unit_is_high_surrogate(prefix)) {
if (!in.consume("\\u")) { if (!in.consume("\\u")) {
in.error( in.error(
"expected another unicode escape for second half of " "expected another unicode escape for second half of "
"surrogate pair"); "surrogate pair");
} }
uint16_t second = readHex(); uint16_t suffix = readHex();
if (second >= 0xdc00 && second <= 0xdfff) { if (!utf16_code_unit_is_low_surrogate(suffix)) {
codePoint = 0x10000 + ((codePoint & 0x3ff) << 10) + (second & 0x3ff);
} else {
in.error("second character in surrogate pair is invalid"); in.error("second character in surrogate pair is invalid");
} }
} else if (codePoint >= 0xdc00 && codePoint <= 0xdfff) { codePoint = unicode_code_point_from_utf16_surrogate_pair(prefix, suffix);
} else if (!utf16_code_unit_is_bmp(prefix)) {
in.error("invalid unicode code point (in range [0xdc00,0xdfff])"); in.error("invalid unicode code point (in range [0xdc00,0xdfff])");
} }
......
...@@ -24,6 +24,53 @@ ...@@ -24,6 +24,53 @@
using folly::utf8ToCodePoint; using folly::utf8ToCodePoint;
// Empty fixture used by the TEST_F cases below; it carries no shared state.
struct UnicodeTest : testing::Test {};
// Checks utf16_code_unit_is_bmp at the edges of the 16-bit range and on both
// sides of each surrogate sub-range boundary.
TEST_F(UnicodeTest, utf16_code_unit_is_bmp) {
  using folly::utf16_code_unit_is_bmp;

  // Below the surrogate range.
  EXPECT_TRUE(utf16_code_unit_is_bmp(0x0000));
  EXPECT_TRUE(utf16_code_unit_is_bmp(0x0041));
  EXPECT_TRUE(utf16_code_unit_is_bmp(0xd7ff));

  // Above the surrogate range.
  EXPECT_TRUE(utf16_code_unit_is_bmp(0xe000));
  EXPECT_TRUE(utf16_code_unit_is_bmp(0xffff));

  // High surrogates.
  EXPECT_FALSE(utf16_code_unit_is_bmp(0xd800));
  EXPECT_FALSE(utf16_code_unit_is_bmp(0xdbff));

  // Low surrogates.
  EXPECT_FALSE(utf16_code_unit_is_bmp(0xdc00));
  EXPECT_FALSE(utf16_code_unit_is_bmp(0xdfff));
}
// Checks utf16_code_unit_is_high_surrogate: only [0xd800, 0xdbff] is accepted.
TEST_F(UnicodeTest, utf16_code_unit_is_high_surrogate) {
  using folly::utf16_code_unit_is_high_surrogate;

  // Both ends of the high-surrogate range.
  EXPECT_TRUE(utf16_code_unit_is_high_surrogate(0xd800));
  EXPECT_TRUE(utf16_code_unit_is_high_surrogate(0xdbff));

  // Below the surrogate range.
  EXPECT_FALSE(utf16_code_unit_is_high_surrogate(0x0000));
  EXPECT_FALSE(utf16_code_unit_is_high_surrogate(0x0041));
  EXPECT_FALSE(utf16_code_unit_is_high_surrogate(0xd7ff));

  // Low surrogates.
  EXPECT_FALSE(utf16_code_unit_is_high_surrogate(0xdc00));
  EXPECT_FALSE(utf16_code_unit_is_high_surrogate(0xdfff));

  // Above the surrogate range.
  EXPECT_FALSE(utf16_code_unit_is_high_surrogate(0xe000));
  EXPECT_FALSE(utf16_code_unit_is_high_surrogate(0xffff));
}
// Checks utf16_code_unit_is_low_surrogate: only [0xdc00, 0xdfff] is accepted.
TEST_F(UnicodeTest, utf16_code_unit_is_low_surrogate) {
  using folly::utf16_code_unit_is_low_surrogate;

  // Both ends of the low-surrogate range.
  EXPECT_TRUE(utf16_code_unit_is_low_surrogate(0xdc00));
  EXPECT_TRUE(utf16_code_unit_is_low_surrogate(0xdfff));

  // Below the surrogate range.
  EXPECT_FALSE(utf16_code_unit_is_low_surrogate(0x0000));
  EXPECT_FALSE(utf16_code_unit_is_low_surrogate(0x0041));
  EXPECT_FALSE(utf16_code_unit_is_low_surrogate(0xd7ff));

  // High surrogates.
  EXPECT_FALSE(utf16_code_unit_is_low_surrogate(0xd800));
  EXPECT_FALSE(utf16_code_unit_is_low_surrogate(0xdbff));

  // Above the surrogate range.
  EXPECT_FALSE(utf16_code_unit_is_low_surrogate(0xe000));
  EXPECT_FALSE(utf16_code_unit_is_low_surrogate(0xffff));
}
// Exercises unicode_code_point_from_utf16_surrogate_pair: both rejection paths
// (bad high unit, bad low unit) and the decoded value for a mid-range pair as
// well as the smallest and largest pairs.
TEST_F(UnicodeTest, codePointCombineUtf16SurrogatePair) {
  // Neither unit is a surrogate.
  EXPECT_THROW(
      folly::unicode_code_point_from_utf16_surrogate_pair(0x0041, 0x0041),
      folly::unicode_error);
  // A low surrogate in the high position.
  EXPECT_THROW(
      folly::unicode_code_point_from_utf16_surrogate_pair(0xdc33, 0xdc33),
      folly::unicode_error);
  // Valid high surrogate followed by a non-surrogate.
  EXPECT_THROW(
      folly::unicode_code_point_from_utf16_surrogate_pair(0xd833, 0x0041),
      folly::unicode_error);
  // Valid high surrogate followed by another high surrogate.
  EXPECT_THROW(
      folly::unicode_code_point_from_utf16_surrogate_pair(0xd833, 0xd833),
      folly::unicode_error);

  EXPECT_EQ(
      0x1CC33,
      folly::unicode_code_point_from_utf16_surrogate_pair(0xd833, 0xdc33));
  // Smallest pair decodes to the first code point beyond the BMP.
  EXPECT_EQ(
      0x10000,
      folly::unicode_code_point_from_utf16_surrogate_pair(0xd800, 0xdc00));
  // Largest pair decodes to the last Unicode code point.
  EXPECT_EQ(
      0x10FFFF,
      folly::unicode_code_point_from_utf16_surrogate_pair(0xdbff, 0xdfff));
}
void testValid(std::initializer_list<unsigned char> data, char32_t expected) { void testValid(std::initializer_list<unsigned char> data, char32_t expected) {
{ {
const unsigned char* p = data.begin(); const unsigned char* p = data.begin();
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment