Commit e18efdc4 authored by Hari Manikarnika's avatar Hari Manikarnika Committed by Jordan DeLong

make folly::toJson retain non-ascii chars if encode_non_ascii is disabled

Summary:
folly::toJson as demonstrated by the test cases was wrongly encoding utf8 strings.
specifically, a utf8 char made up of x bytes was encodeded into x unicode chars.

for example, the char:
\u2665
which is made of 3 bytes:
\xe2\x99\xa5
was encoded correctly when using encode_non_ascii = true:
"\u2665"
but when encode_non_ascii = false, the json value was wrongly set as:
"\u00e2\u0099\u00a5"

because we use an signed char that wrongly detects non-readable chars with
ascii value > 127 as control chars with ascii value < 32 (\t, \n, etc.)

Test Plan: run the test

Reviewed By: delong.j@fb.com

FB internal diff: D612782
parent 6e230a3f
...@@ -30,7 +30,7 @@ namespace folly { ...@@ -30,7 +30,7 @@ namespace folly {
namespace json { namespace json {
namespace { namespace {
char32_t decodeUtf8(const char*& p, const char* const e) { char32_t decodeUtf8(const unsigned char*& p, const unsigned char* const e) {
/* The following encodings are valid, except for the 5 and 6 byte /* The following encodings are valid, except for the 5 and 6 byte
* combinations: * combinations:
* 0xxxxxxx * 0xxxxxxx
...@@ -115,9 +115,9 @@ void escapeString(StringPiece input, ...@@ -115,9 +115,9 @@ void escapeString(StringPiece input,
out.reserve(out.size() + input.size() + 2); out.reserve(out.size() + input.size() + 2);
out.push_back('\"'); out.push_back('\"');
const char* p = input.begin(); auto* p = reinterpret_cast<const unsigned char*>(input.begin());
const char* q = input.begin(); auto* q = reinterpret_cast<const unsigned char*>(input.begin());
const char* const e = input.end(); auto* e = reinterpret_cast<const unsigned char*>(input.end());
while (p < e) { while (p < e) {
// Since non-ascii encoding inherently does utf8 validation // Since non-ascii encoding inherently does utf8 validation
...@@ -139,6 +139,8 @@ void escapeString(StringPiece input, ...@@ -139,6 +139,8 @@ void escapeString(StringPiece input,
} }
if (opts.encode_non_ascii && (*p & 0x80)) { if (opts.encode_non_ascii && (*p & 0x80)) {
// note that this if condition captures utf8 chars
// with value > 127, so size > 1 byte
char32_t v = decodeUtf8(p, e); char32_t v = decodeUtf8(p, e);
out.append("\\u"); out.append("\\u");
out.push_back(hexDigit(v >> 12)); out.push_back(hexDigit(v >> 12));
...@@ -156,8 +158,8 @@ void escapeString(StringPiece input, ...@@ -156,8 +158,8 @@ void escapeString(StringPiece input,
case '\r': out.append("\\r"); p++; break; case '\r': out.append("\\r"); p++; break;
case '\t': out.append("\\t"); p++; break; case '\t': out.append("\\t"); p++; break;
default: default:
// note that this if condition captures both control characters // note that this if condition captures non readable chars
// and extended ascii characters // with value < 32, so size = 1 byte (e.g control chars).
out.append("\\u00"); out.append("\\u00");
out.push_back(hexDigit((*p & 0xf0) >> 4)); out.push_back(hexDigit((*p & 0xf0) >> 4));
out.push_back(hexDigit(*p & 0xf)); out.push_back(hexDigit(*p & 0xf));
......
...@@ -246,20 +246,70 @@ TEST(Json, JsonNonAsciiEncoding) { ...@@ -246,20 +246,70 @@ TEST(Json, JsonNonAsciiEncoding) {
EXPECT_ANY_THROW(folly::json::serialize("\xed\xaf\xbf\xed\xbf\xbf", opts)); EXPECT_ANY_THROW(folly::json::serialize("\xed\xaf\xbf\xed\xbf\xbf", opts));
} }
TEST(Json, UTF8Retention) {
// test retention with valid utf8 strings
folly::fbstring input = "\u2665";
folly::fbstring jsonInput = folly::toJson(input);
folly::fbstring output = folly::parseJson(jsonInput).asString();
folly::fbstring jsonOutput = folly::toJson(output);
LOG(INFO) << "input: " << input
<<" => json: " << jsonInput;
LOG(INFO) << "output: " << output
<<" => json: " << jsonOutput;
EXPECT_EQ(input, output);
EXPECT_EQ(jsonInput, jsonOutput);
// test retention with invalid utf8 - note that non-ascii chars are retained
// as is, and no unicode encoding is attempted so no exception is thrown.
EXPECT_EQ(
folly::toJson("a\xe0\xa0\x80z\xc0\x80"),
"\"a\xe0\xa0\x80z\xc0\x80\""
);
}
TEST(Json, UTF8EncodeNonAsciiRetention) {
folly::json::serialization_opts opts;
opts.encode_non_ascii = true;
// test encode_non_ascii valid utf8 strings
folly::fbstring input = "\u2665";
folly::fbstring jsonInput = folly::json::serialize(input, opts);
folly::fbstring output = folly::parseJson(jsonInput).asString();
folly::fbstring jsonOutput = folly::json::serialize(output, opts);
LOG(INFO) << "input: " << input
<<" => json: " << jsonInput;
LOG(INFO) << "output: " << output
<<" => json: " << jsonOutput;
EXPECT_EQ(input, output);
EXPECT_EQ(jsonInput, jsonOutput);
// test encode_non_ascii with invalid utf8 - note that an attempt to encode
// non-ascii to unicode will result is a utf8 validation and throw exceptions.
EXPECT_ANY_THROW(folly::json::serialize("a\xe0\xa0\x80z\xc0\x80", opts));
EXPECT_ANY_THROW(folly::json::serialize("a\xe0\xa0\x80z\xe0\x80\x80", opts));
}
TEST(Json, UTF8Validation) { TEST(Json, UTF8Validation) {
folly::json::serialization_opts opts; folly::json::serialization_opts opts;
opts.validate_utf8 = true; opts.validate_utf8 = true;
// valid utf8 strings // test validate_utf8 valid utf8 strings - note that we only validate the
EXPECT_EQ(folly::json::serialize("a\xc2\x80z", opts), R"("a\u00c2\u0080z")"); // for utf8 but don't encode non-ascii to unicode so they are retained as is.
EXPECT_EQ(folly::json::serialize("a\xc2\x80z", opts), "\"a\xc2\x80z\"");
EXPECT_EQ( EXPECT_EQ(
folly::json::serialize("a\xe0\xa0\x80z", opts), folly::json::serialize("a\xe0\xa0\x80z", opts),
R"("a\u00e0\u00a0\u0080z")"); "\"a\xe0\xa0\x80z\"");
EXPECT_EQ( EXPECT_EQ(
folly::json::serialize("a\xe0\xa0\x80m\xc2\x80z", opts), folly::json::serialize("a\xe0\xa0\x80m\xc2\x80z", opts),
R"("a\u00e0\u00a0\u0080m\u00c2\u0080z")"); "\"a\xe0\xa0\x80m\xc2\x80z\"");
// test with invalid utf8 // test validate_utf8 with invalid utf8
EXPECT_ANY_THROW(folly::json::serialize("a\xe0\xa0\x80z\xc0\x80", opts)); EXPECT_ANY_THROW(folly::json::serialize("a\xe0\xa0\x80z\xc0\x80", opts));
EXPECT_ANY_THROW(folly::json::serialize("a\xe0\xa0\x80z\xe0\x80\x80", opts)); EXPECT_ANY_THROW(folly::json::serialize("a\xe0\xa0\x80z\xe0\x80\x80", opts));
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment