Commit a91e75db authored by Mark McDuff, committed by Jordan DeLong

folly::json: allow skipping invalid UTF8

Summary:
folly::json::serialize by default doesn't check for valid UTF8, and as a result can generate invalid JSON.  There is an option to check for valid UTF8, which throws on an error.

This diff introduces a new option, `skip_invalid_utf8`, which replaces invalid chars with U+FFFD instead of throwing. http://en.wikipedia.org/wiki/Specials_(Unicode_block) suggests that this is the correct replacement.

@override-unit-failures

Test Plan: g-unittest

Reviewed By: delong.j@fb.com

FB internal diff: D1102923
parent 484392b0
......@@ -30,7 +30,10 @@ namespace folly {
namespace json {
namespace {
char32_t decodeUtf8(const unsigned char*& p, const unsigned char* const e) {
char32_t decodeUtf8(
const unsigned char*& p,
const unsigned char* const e,
bool skipOnError) {
/* The following encodings are valid, except for the 5 and 6 byte
* combinations:
* 0xxxxxxx
......@@ -41,7 +44,10 @@ char32_t decodeUtf8(const unsigned char*& p, const unsigned char* const e) {
* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
*/
auto skip = [&] { ++p; return U'\ufffd'; };
if (p >= e) {
if (skipOnError) return skip();
throw std::runtime_error("folly::decodeUtf8 empty/invalid string");
}
......@@ -62,8 +68,8 @@ char32_t decodeUtf8(const unsigned char*& p, const unsigned char* const e) {
uint32_t d = fst;
if ((fst & 0xC0) != 0xC0) {
throw std::runtime_error(
to<std::string>("folly::decodeUtf8 i=0 d=", d));
if (skipOnError) return skip();
throw std::runtime_error(to<std::string>("folly::decodeUtf8 i=0 d=", d));
}
fst <<= 1;
......@@ -72,6 +78,7 @@ char32_t decodeUtf8(const unsigned char*& p, const unsigned char* const e) {
unsigned char tmp = p[i];
if ((tmp & 0xC0) != 0x80) {
if (skipOnError) return skip();
throw std::runtime_error(
to<std::string>("folly::decodeUtf8 i=", i, " tmp=", (uint32_t)tmp));
}
......@@ -84,6 +91,7 @@ char32_t decodeUtf8(const unsigned char*& p, const unsigned char* const e) {
// overlong, could have been encoded with i bytes
if ((d & ~bitMask[i - 1]) == 0) {
if (skipOnError) return skip();
throw std::runtime_error(
to<std::string>("folly::decodeUtf8 i=", i, " d=", d));
}
......@@ -91,6 +99,7 @@ char32_t decodeUtf8(const unsigned char*& p, const unsigned char* const e) {
// check for surrogates only needed for 3 bytes
if (i == 2) {
if ((d >= 0xD800 && d <= 0xDFFF) || d > 0x10FFFF) {
if (skipOnError) return skip();
throw std::runtime_error(
to<std::string>("folly::decodeUtf8 i=", i, " d=", d));
}
......@@ -101,6 +110,7 @@ char32_t decodeUtf8(const unsigned char*& p, const unsigned char* const e) {
}
}
if (skipOnError) return skip();
throw std::runtime_error("folly::decodeUtf8 encoding length maxed out");
}
......@@ -642,7 +652,8 @@ void escapeString(StringPiece input,
while (p < e) {
// Since non-ascii encoding inherently does utf8 validation
// we explicitly validate utf8 only if non-ascii encoding is disabled.
if (opts.validate_utf8 && !opts.encode_non_ascii) {
if ((opts.validate_utf8 || opts.skip_invalid_utf8)
&& !opts.encode_non_ascii) {
// to achieve better spatial and temporal coherence
// we do utf8 validation progressively along with the
// string-escaping instead of two separate passes
......@@ -654,13 +665,18 @@ void escapeString(StringPiece input,
if (q == p) {
// calling utf8_decode has the side effect of
// checking that utf8 encodings are valid
decodeUtf8(q, e);
char32_t v = decodeUtf8(q, e, opts.skip_invalid_utf8);
if (opts.skip_invalid_utf8 && v == U'\ufffd') {
out.append("\ufffd");
p = q;
continue;
}
}
}
if (opts.encode_non_ascii && (*p & 0x80)) {
// note that this if condition captures utf8 chars
// with value > 127, so size > 1 byte
char32_t v = decodeUtf8(p, e);
char32_t v = decodeUtf8(p, e, opts.skip_invalid_utf8);
out.append("\\u");
out.push_back(hexDigit(v >> 12));
out.push_back(hexDigit((v >> 8) & 0x0f));
......
......@@ -60,6 +60,7 @@ namespace json {
, validate_utf8(false)
, allow_trailing_comma(false)
, sort_keys(false)
, skip_invalid_utf8(false)
{}
// If true, keys in an object can be non-strings. (In strict
......@@ -89,6 +90,9 @@ namespace json {
// Sort keys of all objects before printing out (potentially slow)
bool sort_keys;
// Replace invalid utf8 characters with U+FFFD and continue
bool skip_invalid_utf8;
};
/*
......
......@@ -302,6 +302,23 @@ TEST(Json, UTF8Validation) {
// test validate_utf8 with invalid utf8
EXPECT_ANY_THROW(folly::json::serialize("a\xe0\xa0\x80z\xc0\x80", opts));
EXPECT_ANY_THROW(folly::json::serialize("a\xe0\xa0\x80z\xe0\x80\x80", opts));
opts.skip_invalid_utf8 = true;
EXPECT_EQ(folly::json::serialize("a\xe0\xa0\x80z\xc0\x80", opts),
"\"a\xe0\xa0\x80z\ufffd\ufffd\"");
EXPECT_EQ(folly::json::serialize("a\xe0\xa0\x80z\xc0\x80\x80", opts),
"\"a\xe0\xa0\x80z\ufffd\ufffd\ufffd\"");
EXPECT_EQ(folly::json::serialize("z\xc0\x80z\xe0\xa0\x80", opts),
"\"z\ufffd\ufffdz\xe0\xa0\x80\"");
opts.encode_non_ascii = true;
EXPECT_EQ(folly::json::serialize("a\xe0\xa0\x80z\xc0\x80", opts),
"\"a\\u0800z\\ufffd\\ufffd\"");
EXPECT_EQ(folly::json::serialize("a\xe0\xa0\x80z\xc0\x80\x80", opts),
"\"a\\u0800z\\ufffd\\ufffd\\ufffd\"");
EXPECT_EQ(folly::json::serialize("z\xc0\x80z\xe0\xa0\x80", opts),
"\"z\\ufffd\\ufffdz\\u0800\"");
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment