Commit ae03ef83 authored by Xiayi Sun, committed by Facebook GitHub Bot

folly: utf8ToCodePoint: enforce max valid code point is U+10FFFF - return...

folly: utf8ToCodePoint: enforce max valid code point is U+10FFFF - return U+FFFD / throw for well-formed UTF-8 encoded values that are larger than the max code point

Summary:
UTF-8 can encode large numbers, but Unicode code points are only defined up to `U+10FFFF`.

For example:
- the 4B UTF-8 encoding `"\xF6\x8D\x9B\xBC"` (bits: `11110110 10001101 10011011 10111100`) is a valid UTF-8 encoding
- but the encoded value is `U+18D6FC`, which is larger than `U+10FFFF`

With `opts.skip_invalid_utf8 = true;` `json::serialize` should have returned `"\ufffd"` since the input is invalid, but due to a bug in `utf8ToCodePoint` it returned the incorrect `"\"\xF6\x8D\x9B\xBC\""`.

Update `utf8ToCodePoint` to also reject 4 byte UTF-8 encoded values larger than the max Unicode code point (`U+10FFFF`).

Reviewed By: luciang

Differential Revision: D25275722

fbshipit-source-id: e7daeea834a0c5323923a5451a2565ceff5e4734
parent 67f20f29
...@@ -53,15 +53,30 @@ char32_t utf8ToCodePoint( ...@@ -53,15 +53,30 @@ char32_t utf8ToCodePoint(
const unsigned char*& p, const unsigned char*& p,
const unsigned char* const e, const unsigned char* const e,
bool skipOnError) { bool skipOnError) {
/* The following encodings are valid, except for the 5 and 6 byte // clang-format off
* combinations: /** UTF encodings
* 0xxxxxxx * | # of B | First CP | Last CP | Bit Pattern
* 110xxxxx 10xxxxxx * | 1 | 0x0000 | 0x007F | 0xxxxxxx
* 1110xxxx 10xxxxxx 10xxxxxx * | 2 | 0x0080 | 0x07FF | 110xxxxx 10xxxxxx
* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx * | 3 | 0x0800 | 0xFFFF | 1110xxxx 10xxxxxx 10xxxxxx
* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx * | 4 | 0x10000 | 0x10FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx * | 5 | - | - | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
*/ * | 6 | - | - | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
*
*
* NOTE:
* - the 4B encoding can encode values up to 0x1FFFFF,
* but Unicode defines 0x10FFFF to be the largest code point
* - the 5B & 6B encodings will all encode values larger than 0x1FFFFF
* (so larger than the largest code point value 0x10FFFF) so they form invalid
* unicode code points
*
* On invalid input (invalid encoding or code points larger than 0x10FFFF):
* - When skipOnError is true, this function will skip the first byte and return
* U'\ufffd'. Potential optimization: skip the whole invalid range.
* - When skipOnError is false, throws.
*/
// clang-format on
const auto skip = [&] { const auto skip = [&] {
++p; ++p;
...@@ -77,7 +92,7 @@ char32_t utf8ToCodePoint( ...@@ -77,7 +92,7 @@ char32_t utf8ToCodePoint(
unsigned char fst = *p; unsigned char fst = *p;
if (!(fst & 0x80)) { if (!(fst & 0x80)) {
// trivial case // trivial case, 1 byte encoding
return *p++; return *p++;
} }
...@@ -91,6 +106,7 @@ char32_t utf8ToCodePoint( ...@@ -91,6 +106,7 @@ char32_t utf8ToCodePoint(
// upper control bits are masked out later // upper control bits are masked out later
uint32_t d = fst; uint32_t d = fst;
// multi-byte encoded values must start with bits 0b11. 0xC0 is 0b11000000
if ((fst & 0xC0) != 0xC0) { if ((fst & 0xC0) != 0xC0) {
if (skipOnError) { if (skipOnError) {
return skip(); return skip();
...@@ -104,6 +120,7 @@ char32_t utf8ToCodePoint( ...@@ -104,6 +120,7 @@ char32_t utf8ToCodePoint(
for (unsigned int i = 1; i != 4 && p + i < e; ++i) { for (unsigned int i = 1; i != 4 && p + i < e; ++i) {
const unsigned char tmp = p[i]; const unsigned char tmp = p[i];
// from the second byte on, format should be 10xxxxxx
if ((tmp & 0xC0) != 0x80) { if ((tmp & 0xC0) != 0x80) {
if (skipOnError) { if (skipOnError) {
return skip(); return skip();
...@@ -112,11 +129,15 @@ char32_t utf8ToCodePoint( ...@@ -112,11 +129,15 @@ char32_t utf8ToCodePoint(
"folly::utf8ToCodePoint i=", i, " tmp=", (uint32_t)tmp)); "folly::utf8ToCodePoint i=", i, " tmp=", (uint32_t)tmp));
} }
// gradually fill a 32 bit integer d with non control bits in tmp
// 0x3F is 0b00111111 which clears out the first 2 control bits
d = (d << 6) | (tmp & 0x3F); d = (d << 6) | (tmp & 0x3F);
fst <<= 1; fst <<= 1;
if (!(fst & 0x80)) { if (!(fst & 0x80)) {
d &= bitMask[i]; // We know the length of encoding now, since we encounter the first "0" in
// fst (the first byte). This branch processes the last byte of encoding.
d &= bitMask[i]; // d is now the code point
// overlong, could have been encoded with i bytes // overlong, could have been encoded with i bytes
if ((d & ~bitMask[i - 1]) == 0) { if ((d & ~bitMask[i - 1]) == 0) {
...@@ -129,7 +150,7 @@ char32_t utf8ToCodePoint( ...@@ -129,7 +150,7 @@ char32_t utf8ToCodePoint(
// check for surrogates only needed for 3 bytes // check for surrogates only needed for 3 bytes
if (i == 2) { if (i == 2) {
if ((d >= 0xD800 && d <= 0xDFFF) || d > 0x10FFFF) { if (d >= 0xD800 && d <= 0xDFFF) {
if (skipOnError) { if (skipOnError) {
return skip(); return skip();
} }
...@@ -138,6 +159,16 @@ char32_t utf8ToCodePoint( ...@@ -138,6 +159,16 @@ char32_t utf8ToCodePoint(
} }
} }
// While UTF-8 encoding can encode arbitrary numbers, 0x10FFFF is the
// largest defined Unicode code point.
// Only >=4 bytes can UTF-8 encode such values, so i=3 here.
if (d > 0x10FFFF) {
if (skipOnError) {
return skip();
}
throw std::runtime_error(
"folly::utf8ToCodePoint encoding exceeds max unicode code point");
}
p += i + 1; p += i + 1;
return d; return d;
} }
......
...@@ -573,6 +573,8 @@ TEST(Json, UTF8Validation) { ...@@ -573,6 +573,8 @@ TEST(Json, UTF8Validation) {
// test validate_utf8 with invalid utf8 // test validate_utf8 with invalid utf8
EXPECT_ANY_THROW(folly::json::serialize("a\xe0\xa0\x80z\xc0\x80", opts)); EXPECT_ANY_THROW(folly::json::serialize("a\xe0\xa0\x80z\xc0\x80", opts));
EXPECT_ANY_THROW(folly::json::serialize("a\xe0\xa0\x80z\xe0\x80\x80", opts)); EXPECT_ANY_THROW(folly::json::serialize("a\xe0\xa0\x80z\xe0\x80\x80", opts));
// not a valid unicode because it is larger than the max 0x10FFFF code-point
EXPECT_ANY_THROW(folly::json::serialize("\xF6\x8D\x9B\xBC", opts));
opts.skip_invalid_utf8 = true; opts.skip_invalid_utf8 = true;
EXPECT_EQ( EXPECT_EQ(
...@@ -584,6 +586,12 @@ TEST(Json, UTF8Validation) { ...@@ -584,6 +586,12 @@ TEST(Json, UTF8Validation) {
EXPECT_EQ( EXPECT_EQ(
folly::json::serialize("z\xc0\x80z\xe0\xa0\x80", opts), folly::json::serialize("z\xc0\x80z\xe0\xa0\x80", opts),
u8"\"z\ufffd\ufffdz\xe0\xa0\x80\""); u8"\"z\ufffd\ufffdz\xe0\xa0\x80\"");
EXPECT_EQ(
folly::json::serialize("\xF6\x8D\x9B\xBC", opts),
u8"\"\ufffd\ufffd\ufffd\ufffd\"");
EXPECT_EQ(
folly::json::serialize("invalid\xF6\x8D\x9B\xBCinbetween", opts),
u8"\"invalid\ufffd\ufffd\ufffd\ufffdinbetween\"");
opts.encode_non_ascii = true; opts.encode_non_ascii = true;
EXPECT_EQ( EXPECT_EQ(
......
...@@ -106,6 +106,10 @@ void testInvalid(std::initializer_list<unsigned char> data) { ...@@ -106,6 +106,10 @@ void testInvalid(std::initializer_list<unsigned char> data) {
} }
} }
// Feeds a structurally well-formed 4-byte UTF-8 sequence to testInvalid.
// The payload bits of F4 90 80 80 are 100 010000 000000 000000, i.e. 0x110000,
// which is exactly one past the largest Unicode code point (0x10FFFF).
// NOTE(review): testInvalid's body is not visible here; presumably it checks
// that utf8ToCodePoint rejects the input in both skipOnError modes - confirm.
TEST(InvalidUtf8ToCodePoint, UnicodeOutOfRangeTest) {
testInvalid({0xF4, 0x90, 0x80, 0x80}); // u8"\U0010FFFF" + 1
}
TEST(InvalidUtf8ToCodePoint, rfc3629Overlong) { TEST(InvalidUtf8ToCodePoint, rfc3629Overlong) {
// https://tools.ietf.org/html/rfc3629 // https://tools.ietf.org/html/rfc3629
// Implementations of the decoding algorithm above MUST protect against // Implementations of the decoding algorithm above MUST protect against
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment