Commit 49fd581d authored by Yedidya Feldblum's avatar Yedidya Feldblum Committed by Facebook GitHub Bot

Extract utf-16 surrogate-pair helpers

Summary: [Folly] Extract utf-16 surrogate-pair helpers from `folly/json.cpp` into `folly/Unicode.h`.

Reviewed By: luciang

Differential Revision: D21634400

fbshipit-source-id: 64d15e79fe19cce5f5c9c38837d4663209b5f888
parent 44ff9b50
......@@ -18,10 +18,62 @@
#pragma once
#include <cstdint>
#include <stdexcept>
#include <string>
#include <folly/lang/Exception.h>
namespace folly {
class FOLLY_EXPORT unicode_error : std::runtime_error {
public:
using std::runtime_error::runtime_error;
};
// Unicode code points are split into 17 planes.
//
// The Basic Multilingual Plane covers code points in [0-0xFFFF] but reserves
// two invalid ranges:
// - High surrogates: [0xD800-0xDBFF].
// - Low surrogates: [0xDC00-0xDFFF].
//
// UTF-16 code units are 2 bytes wide and are represented here with char16_t.
// Unicode code points are represented in UTF-16 across either 1-2 code units:
// - Valid BMP code points [0x0000-0xD7FF] + [0xE000-0xFFFF] are encoded
// directly as 1 code unit.
// - Code points larger than BMP (>0xFFFF) are encoded as 2 code units, with
// values respectively in the high surrogates and low surrogates ranges.
//
// JSON text permits the inclusion of Unicode escape sequences within quoted
// strings:
// - Valid BMP code points are encoded as \uXXXX, where XXXX are the base-16
// digits of the code point.
// - Code points larger than BMP are encoded as \uHHHH\uLLLL, where HHHH and
// LLLL are respectively the base-16 digits of the high and low surrogates of
// the UTF-16 encoding of the code point.
/// Tells whether a UTF-16 code unit encodes a BMP code point on its own.
///
/// @param c the UTF-16 code unit to classify.
/// @return true when `c` lies outside the surrogate block [0xD800-0xDFFF];
///     false when it is a high or low surrogate.
inline bool utf16_code_unit_is_bmp(char16_t const c) {
  constexpr char16_t first_surrogate = 0xd800;
  constexpr char16_t last_surrogate = 0xdfff;
  bool const in_surrogate_block = first_surrogate <= c && c <= last_surrogate;
  return !in_surrogate_block;
}
/// Tells whether a UTF-16 code unit is a high (leading) surrogate.
///
/// @param c the UTF-16 code unit to classify.
/// @return true when `c` lies in [0xD800-0xDBFF]; false otherwise.
inline bool utf16_code_unit_is_high_surrogate(char16_t const c) {
  constexpr char16_t first_high = 0xd800;
  constexpr char16_t last_high = 0xdbff;
  return first_high <= c && c <= last_high;
}
/// Tells whether a UTF-16 code unit is a low (trailing) surrogate.
///
/// @param c the UTF-16 code unit to classify.
/// @return true when `c` lies in [0xDC00-0xDFFF]; false otherwise.
inline bool utf16_code_unit_is_low_surrogate(char16_t const c) {
  constexpr char16_t first_low = 0xdc00;
  constexpr char16_t last_low = 0xdfff;
  return first_low <= c && c <= last_low;
}
/// Combines a UTF-16 surrogate pair into the code point it encodes.
///
/// @param high the leading code unit; must satisfy
///     utf16_code_unit_is_high_surrogate().
/// @param low the trailing code unit; must satisfy
///     utf16_code_unit_is_low_surrogate().
/// @return 0x10000 plus the 20-bit value formed from the low ten bits of
///     `high` (upper half) and the low ten bits of `low` (lower half).
/// @throws unicode_error when either argument is outside its surrogate range;
///     `high` is validated before `low`.
inline char32_t unicode_code_point_from_utf16_surrogate_pair(
    char16_t const high,
    char16_t const low) {
  if (!utf16_code_unit_is_high_surrogate(high)) {
    throw_exception<unicode_error>("invalid high surrogate");
  }
  if (!utf16_code_unit_is_low_surrogate(low)) {
    throw_exception<unicode_error>("invalid low surrogate");
  }

  // Each surrogate carries ten payload bits in its least-significant bits.
  constexpr char32_t payload_mask = 0x3ff;
  constexpr char32_t supplementary_base = 0x10000;
  char32_t const upper_bits = (char32_t(high) & payload_mask) << 10;
  char32_t const lower_bits = char32_t(low) & payload_mask;
  return supplementary_base + upper_bits + lower_bits;
}
//////////////////////////////////////////////////////////////////////
/*
......
......@@ -558,24 +558,24 @@ std::string decodeUnicodeEscape(Input& in) {
return ret;
};
/*
* If the value encoded is in the surrogate pair range, we need to
* make sure there is another escape that we can use also.
*/
uint32_t codePoint = readHex();
if (codePoint >= 0xd800 && codePoint <= 0xdbff) {
// If the value encoded is in the surrogate pair range, we need to make
// sure there is another escape that we can use also.
//
// See the explanation in folly/Unicode.h.
uint16_t prefix = readHex();
char32_t codePoint = prefix;
if (utf16_code_unit_is_high_surrogate(prefix)) {
if (!in.consume("\\u")) {
in.error(
"expected another unicode escape for second half of "
"surrogate pair");
}
uint16_t second = readHex();
if (second >= 0xdc00 && second <= 0xdfff) {
codePoint = 0x10000 + ((codePoint & 0x3ff) << 10) + (second & 0x3ff);
} else {
uint16_t suffix = readHex();
if (!utf16_code_unit_is_low_surrogate(suffix)) {
in.error("second character in surrogate pair is invalid");
}
} else if (codePoint >= 0xdc00 && codePoint <= 0xdfff) {
codePoint = unicode_code_point_from_utf16_surrogate_pair(prefix, suffix);
} else if (!utf16_code_unit_is_bmp(prefix)) {
in.error("invalid unicode code point (in range [0xdc00,0xdfff])");
}
......
......@@ -24,6 +24,53 @@
using folly::utf8ToCodePoint;
// Fixture for the TEST_F cases in this file; it adds no state or setup of its
// own on top of testing::Test.
class UnicodeTest : public testing::Test {};
// Probes utf16_code_unit_is_bmp on both sides of each surrogate-range boundary
// and at the extremes of the 16-bit range.
TEST_F(UnicodeTest, utf16_code_unit_is_bmp) {
  using folly::utf16_code_unit_is_bmp;

  // Code units outside the surrogate block are BMP code points.
  EXPECT_TRUE(utf16_code_unit_is_bmp(0x0000));
  EXPECT_TRUE(utf16_code_unit_is_bmp(0x0041));
  EXPECT_TRUE(utf16_code_unit_is_bmp(0xd7ff));
  EXPECT_TRUE(utf16_code_unit_is_bmp(0xe000));
  EXPECT_TRUE(utf16_code_unit_is_bmp(0xffff));

  // First and last value of the high-surrogate and low-surrogate ranges.
  EXPECT_FALSE(utf16_code_unit_is_bmp(0xd800));
  EXPECT_FALSE(utf16_code_unit_is_bmp(0xdbff));
  EXPECT_FALSE(utf16_code_unit_is_bmp(0xdc00));
  EXPECT_FALSE(utf16_code_unit_is_bmp(0xdfff));
}
// Probes utf16_code_unit_is_high_surrogate at the edges of [0xD800-0xDBFF] and
// at values on either side of that range.
TEST_F(UnicodeTest, utf16_code_unit_is_high_surrogate) {
  using folly::utf16_code_unit_is_high_surrogate;

  // First and last value of the high-surrogate range.
  EXPECT_TRUE(utf16_code_unit_is_high_surrogate(0xd800));
  EXPECT_TRUE(utf16_code_unit_is_high_surrogate(0xdbff));

  // Values below the range.
  EXPECT_FALSE(utf16_code_unit_is_high_surrogate(0x0000));
  EXPECT_FALSE(utf16_code_unit_is_high_surrogate(0x0041));
  EXPECT_FALSE(utf16_code_unit_is_high_surrogate(0xd7ff));

  // Values above the range, including the low surrogates.
  EXPECT_FALSE(utf16_code_unit_is_high_surrogate(0xdc00));
  EXPECT_FALSE(utf16_code_unit_is_high_surrogate(0xdfff));
  EXPECT_FALSE(utf16_code_unit_is_high_surrogate(0xe000));
  EXPECT_FALSE(utf16_code_unit_is_high_surrogate(0xffff));
}
// Probes utf16_code_unit_is_low_surrogate at the edges of [0xDC00-0xDFFF] and
// at values on either side of that range.
TEST_F(UnicodeTest, utf16_code_unit_is_low_surrogate) {
  using folly::utf16_code_unit_is_low_surrogate;

  // First and last value of the low-surrogate range.
  EXPECT_TRUE(utf16_code_unit_is_low_surrogate(0xdc00));
  EXPECT_TRUE(utf16_code_unit_is_low_surrogate(0xdfff));

  // Values below the range, including the high surrogates.
  EXPECT_FALSE(utf16_code_unit_is_low_surrogate(0x0000));
  EXPECT_FALSE(utf16_code_unit_is_low_surrogate(0x0041));
  EXPECT_FALSE(utf16_code_unit_is_low_surrogate(0xd7ff));
  EXPECT_FALSE(utf16_code_unit_is_low_surrogate(0xd800));
  EXPECT_FALSE(utf16_code_unit_is_low_surrogate(0xdbff));

  // Values above the range.
  EXPECT_FALSE(utf16_code_unit_is_low_surrogate(0xe000));
  EXPECT_FALSE(utf16_code_unit_is_low_surrogate(0xffff));
}
// Exercises unicode_code_point_from_utf16_surrogate_pair: both rejection paths
// (bad high unit, bad low unit) plus decoding at an interior pair and at the
// smallest and largest encodable pairs.
TEST_F(UnicodeTest, codePointCombineUtf16SurrogatePair) {
  // Neither unit is a surrogate.
  EXPECT_THROW(
      folly::unicode_code_point_from_utf16_surrogate_pair(0x0041, 0x0041),
      folly::unicode_error);
  // Valid low unit, but the high unit is not a high surrogate.
  EXPECT_THROW(
      folly::unicode_code_point_from_utf16_surrogate_pair(0x0041, 0xdc33),
      folly::unicode_error);
  // Valid high unit, but the low unit is not a low surrogate.
  EXPECT_THROW(
      folly::unicode_code_point_from_utf16_surrogate_pair(0xd833, 0x0041),
      folly::unicode_error);
  // Surrogates supplied in the wrong order.
  EXPECT_THROW(
      folly::unicode_code_point_from_utf16_surrogate_pair(0xdc33, 0xd833),
      folly::unicode_error);

  // Interior pair.
  EXPECT_EQ(
      0x1CC33,
      folly::unicode_code_point_from_utf16_surrogate_pair(0xd833, 0xdc33));
  // Smallest pair decodes to the first code point beyond the BMP.
  EXPECT_EQ(
      0x10000,
      folly::unicode_code_point_from_utf16_surrogate_pair(0xd800, 0xdc00));
  // Largest pair decodes to the last Unicode code point.
  EXPECT_EQ(
      0x10FFFF,
      folly::unicode_code_point_from_utf16_surrogate_pair(0xdbff, 0xdfff));
}
void testValid(std::initializer_list<unsigned char> data, char32_t expected) {
{
const unsigned char* p = data.begin();
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment