Commit 07a29570 authored by Lucian Grijincu, committed by Facebook Github Bot

folly: unicode: support 4 byte unicode code points

Summary:
Support 4-byte UTF-8 strings.

https://en.wikipedia.org/wiki/UTF-8
| Number of bytes | Bits for code point | First code point | Last code point |   Byte 1 |   Byte 2 |   Byte 3 |   Byte 4
|               1 |                   7 |           U+0000 |          U+007F | 0xxxxxxx
|               2 |                  11 |           U+0080 |          U+07FF | 110xxxxx | 10xxxxxx
|               3 |                  16 |           U+0800 |          U+FFFF | 1110xxxx | 10xxxxxx | 10xxxxxx
|               4 |                  21 |          U+10000 |        U+10FFFF | 11110xxx | 10xxxxxx | 10xxxxxx | 10xxxxxx

`utf8ToCodePoint` now correctly returns the code point for 4-byte UTF-8 strings.

The JSON standard (http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-404.pdf) says:

> Any code point may be represented as a hexadecimal escape sequence. The meaning of such a hexadecimal
number is determined by ISO/IEC 10646. If the code point is in the Basic Multilingual Plane (U+0000 through
U+FFFF), then it may be represented as a six-character sequence: a reverse solidus, followed by the
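lowercase letter u, followed by four hexadecimal digits that encode the code point. Hexadecimal digits can be
digits (U+0030 through U+0039) or the hexadecimal letters A through F in uppercase (U+0041 through U+0046)
or lowercase (U+0061 through U+0066).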

This was the behavior implemented for 2-3-byte UTF-8 strings which all fit in the U+0000 through U+FFFF range (see above diagram).

> To escape a code point that is not in the Basic Multilingual Plane, the character may be represented as a
twelve-character sequence, encoding the UTF-16 surrogate pair corresponding to the code point. So for
example, a string containing only the G clef character (U+1D11E) may be represented as "\uD834\uDD1E".
However, whether a processor of JSON texts interprets such a surrogate pair as a single code point or as an
explicit surrogate pair is a semantic decision that is determined by the specific processor.

When `encode_non_ascii == true` we now also support encoding 4-byte UTF-8 sequences as a single UTF-16 surrogate pair, i.e. two consecutive `\uXXXX` escapes (a high surrogate followed by a low surrogate).

Reviewed By: ot

Differential Revision: D9357479

fbshipit-source-id: 5c47bee4e71d5888b8264d8d3524d2084769adb0
parent 8fdcdd4c
......@@ -101,7 +101,7 @@ char32_t utf8ToCodePoint(
fst <<= 1;
for (unsigned int i = 1; i != 3 && p + i < e; ++i) {
for (unsigned int i = 1; i != 4 && p + i < e; ++i) {
const unsigned char tmp = p[i];
if ((tmp & 0xC0) != 0x80) {
......
......@@ -718,13 +718,31 @@ void escapeString(
if (opts.encode_non_ascii && (*p & 0x80)) {
// note that this if condition captures utf8 chars
// with value > 127, so size > 1 byte
char32_t v = utf8ToCodePoint(p, e, opts.skip_invalid_utf8);
char buf[] = "\\u\0\0\0\0";
buf[2] = hexDigit(uint8_t(v >> 12));
buf[3] = hexDigit((v >> 8) & 0x0f);
buf[4] = hexDigit((v >> 4) & 0x0f);
buf[5] = hexDigit(v & 0x0f);
out.append(buf, 6);
// NOTE: char32_t / char16_t are both unsigned.
char32_t cp = utf8ToCodePoint(p, e, opts.skip_invalid_utf8);
auto writeHex = [&](char16_t v) {
char buf[] = "\\u\0\0\0\0";
buf[2] = hexDigit((v >> 12) & 0x0f);
buf[3] = hexDigit((v >> 8) & 0x0f);
buf[4] = hexDigit((v >> 4) & 0x0f);
buf[5] = hexDigit(v & 0x0f);
out.append(buf, 6);
};
// From the ECMA-404 The JSON Data Interchange Syntax 2nd Edition Dec 2017
if (cp < 0x10000u) {
// If the code point is in the Basic Multilingual Plane (U+0000 through
// U+FFFF), then it may be represented as a six-character sequence:
// a reverse solidus, followed by the lowercase letter u, followed by
// four hexadecimal digits that encode the code point.
writeHex(static_cast<char16_t>(cp));
} else {
// To escape a code point that is not in the Basic Multilingual Plane,
// the character may be represented as a twelve-character sequence,
// encoding the UTF-16 surrogate pair corresponding to the code point.
writeHex(static_cast<char16_t>(
0xd800u + (((cp - 0x10000u) >> 10) & 0x3ffu)));
writeHex(static_cast<char16_t>(0xdc00u + ((cp - 0x10000u) & 0x3ffu)));
}
} else if (*p == '\\' || *p == '\"') {
char buf[] = "\\\0";
buf[1] = char(*p++);
......
......@@ -86,7 +86,9 @@ struct serialization_opts {
// try to be minimally "pretty".
bool pretty_formatting;
// If true, non-ASCII utf8 characters would be encoded as \uXXXX.
// If true, non-ASCII utf8 characters would be encoded as \uXXXX:
// - if the code point is in [U+0000..U+FFFF] => encode as a single \uXXXX
// - if the code point is > U+FFFF => encode as 2 UTF-16 surrogate pairs.
bool encode_non_ascii;
// Check that strings are valid utf8
......
......@@ -299,8 +299,13 @@ TEST(Json, JsonNonAsciiEncoding) {
EXPECT_ANY_THROW(folly::json::serialize("\xc0\x80", opts));
EXPECT_ANY_THROW(folly::json::serialize("\xe0\x80\x80", opts));
// Longer than 3 byte encodings
EXPECT_ANY_THROW(folly::json::serialize("\xf4\x8f\xbf\xbf", opts));
// Allow 4 byte encodings, escape using 2 UTF-16 surrogate pairs.
// "\xf0\x9f\x8d\x80" is Unicode Character 'FOUR LEAF CLOVER' (U+1F340)
// >>> json.dumps({"a": u"\U0001F340"})
// '{"a": "\\ud83c\\udf40"}'
EXPECT_EQ(
folly::json::serialize("\xf0\x9f\x8d\x80", opts), R"("\ud83c\udf40")");
// Longer than 4 byte encodings
EXPECT_ANY_THROW(folly::json::serialize("\xed\xaf\xbf\xed\xbf\xbf", opts));
}
......
/*
* Copyright 2018-present Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <folly/Unicode.h>
#include <initializer_list>
#include <stdexcept>
#include <folly/Range.h>
#include <folly/portability/GTest.h>
using folly::utf8ToCodePoint;
void testValid(std::initializer_list<unsigned char> data, char32_t expected) {
{
const unsigned char* p = data.begin();
const unsigned char* e = data.end();
EXPECT_EQ(utf8ToCodePoint(p, e, /* skipOnError */ false), expected)
<< folly::StringPiece(
(const char*)data.begin(), (const char*)data.end());
}
{
const unsigned char* p = data.begin();
const unsigned char* e = data.end();
EXPECT_EQ(utf8ToCodePoint(p, e, /* skipOnError */ true), expected)
<< folly::StringPiece(
(const char*)data.begin(), (const char*)data.end());
}
}
void testInvalid(std::initializer_list<unsigned char> data) {
{
const unsigned char* p = data.begin();
const unsigned char* e = data.end();
EXPECT_THROW(
utf8ToCodePoint(p, e, /* skipOnError */ false), std::runtime_error)
<< folly::StringPiece(
(const char*)data.begin(), (const char*)data.end());
}
{
const unsigned char* p = data.begin();
const unsigned char* e = data.end();
EXPECT_EQ(utf8ToCodePoint(p, e, /* skipOnError */ true), 0xfffd)
<< folly::StringPiece(
(const char*)data.begin(), (const char*)data.end());
}
}
// Overlong encoding: C0 80 is a two-byte form (110xxxxx 10xxxxxx) whose
// payload bits are all zero, i.e. U+0000, which already has a one-byte
// encoding. The decoder must reject it rather than return U+0000.
TEST(InvalidUtf8ToCodePoint, rfc3629Overlong) {
// https://tools.ietf.org/html/rfc3629
// Implementations of the decoding algorithm above MUST protect against
// decoding invalid sequences. For instance, a naive implementation may
// decode the overlong UTF-8 sequence C0 80 into the character U+0000 [...]
// Decoding invalid sequences may have security consequences or cause other
// problems.
// Expect a throw in strict mode and 0xfffd when skipOnError is set.
testInvalid({0xC0, 0x80});
}
// A UTF-16 surrogate pair written as two 3-byte UTF-8 sequences must be
// rejected instead of being combined into a supplementary code point.
TEST(InvalidUtf8ToCodePoint, rfc3629SurrogatePair) {
// https://tools.ietf.org/html/rfc3629
// Implementations of the decoding algorithm above MUST protect against
// decoding invalid sequences. For instance, a naive implementation may
// decode [...] the surrogate pair ED A1 8C ED BE B4 into U+233B4.
// Decoding invalid sequences may have security consequences or cause other
// problems.
// NOTE(review): testInvalid makes a single utf8ToCodePoint call per mode, so
// presumably only the leading ED A1 8C (the bit pattern of U+D84C) is
// examined here and the trailing ED BE B4 is never reached -- confirm.
testInvalid({0xED, 0xA1, 0x8C, 0xED, 0xBE, 0xB4});
}
// Lone UTF-16 surrogate code points (boundary values from U+D800 to U+DFFF)
// written as 3-byte UTF-8 sequences must each be rejected.
TEST(InvalidUtf8ToCodePoint, MarkusKuhnSingleUTF16Surrogates) {
// https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
// 5.1.1 U+D800 = ed a0 80
// 5.1.2 U+DB7F = ed ad bf
// 5.1.3 U+DB80 = ed ae 80
// 5.1.4 U+DBFF = ed af bf
// 5.1.5 U+DC00 = ed b0 80
// 5.1.6 U+DF80 = ed be 80
// 5.1.7 U+DFFF = ed bf bf
// One call per case above, in the same order (5.1.1 through 5.1.7).
testInvalid({0xed, 0xa0, 0x80});
testInvalid({0xed, 0xad, 0xbf});
testInvalid({0xed, 0xae, 0x80});
testInvalid({0xed, 0xaf, 0xbf});
testInvalid({0xed, 0xb0, 0x80});
testInvalid({0xed, 0xbe, 0x80});
testInvalid({0xed, 0xbf, 0xbf});
}
// High/low UTF-16 surrogate pairs, each half written as a 3-byte UTF-8
// sequence, must be rejected.
TEST(InvalidUtf8ToCodePoint, MarkusKuhnPairedUTF16Surrogates) {
// https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
// 5.2.1 U+D800 U+DC00 = ed a0 80 ed b0 80
// 5.2.2 U+D800 U+DFFF = ed a0 80 ed bf bf
// 5.2.3 U+DB7F U+DC00 = ed ad bf ed b0 80
// 5.2.4 U+DB7F U+DFFF = ed ad bf ed bf bf
// 5.2.5 U+DB80 U+DC00 = ed ae 80 ed b0 80
// 5.2.6 U+DB80 U+DFFF = ed ae 80 ed bf bf
// 5.2.7 U+DBFF U+DC00 = ed af bf ed b0 80
// 5.2.8 U+DBFF U+DFFF = ed af bf ed bf bf
// One call per case above, in the same order (5.2.1 through 5.2.8).
// NOTE(review): testInvalid makes a single utf8ToCodePoint call per mode, so
// presumably only the first (high-surrogate) triple of each input is
// examined -- confirm.
testInvalid({0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80});
testInvalid({0xed, 0xa0, 0x80, 0xed, 0xbf, 0xbf});
testInvalid({0xed, 0xad, 0xbf, 0xed, 0xb0, 0x80});
testInvalid({0xed, 0xad, 0xbf, 0xed, 0xbf, 0xbf});
testInvalid({0xed, 0xae, 0x80, 0xed, 0xb0, 0x80});
testInvalid({0xed, 0xae, 0x80, 0xed, 0xbf, 0xbf});
testInvalid({0xed, 0xaf, 0xbf, 0xed, 0xb0, 0x80});
testInvalid({0xed, 0xaf, 0xbf, 0xed, 0xbf, 0xbf});
}
// A 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) must decode to its
// code point: F0 9F 8D 80 is U+1F340, FOUR LEAF CLOVER.
TEST(ValidUtf8ToCodePoint, FourCloverLeaf) {
testValid({0xF0, 0x9F, 0x8D, 0x80}, 0x1F340); // u8"\U0001F340";
}
// The UTF-16 surrogate pair for U+1F340 (code units D83C DF40) dumped as raw
// bytes is not valid UTF-8 and must be rejected. 0xD8 has the shape of a
// 2-byte lead (110xxxxx), but 0x3C is not a 10xxxxxx continuation byte.
TEST(InvalidUtf8ToCodePoint, FourCloverLeafAsSurrogates) {
testInvalid({0xd8, 0x3c, 0xdf, 0x40}); // raw bytes of UTF-16 D83C DF40, not UTF-8
}
// Upper boundary of the 4-byte range: F4 8F BF BF must decode to U+10FFFF,
// the last code point representable in UTF-8.
TEST(ValidUtf8ToCodePoint, LastCodePoint) {
testValid({0xF4, 0x8F, 0xBF, 0xBF}, 0x10FFFF); // u8"\U0010FFFF";
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment