Commit afe40e06 authored by Melissa Winstanley's avatar Melissa Winstanley Committed by Facebook Github Bot

Add serialization option to escape specific ASCII characters

Summary: In some cases, it may be necessary to unicode-escape regular ASCII characters in JSON serialization (example: for JSON sent to browsers that may be interpreted as HTML, "<" should be escaped). Allow additional escape characters to be specified via a bitmap in the serializer options.

Reviewed By: yfeldblum, luciang

Differential Revision: D8980189

fbshipit-source-id: 000c5279ab0f37a3ee4b2eb38f20afa49dcc5a27
parent bc16b096
...@@ -600,6 +600,16 @@ dynamic parseValue(Input& in) { ...@@ -600,6 +600,16 @@ dynamic parseValue(Input& in) {
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
// Builds the 128-bit (two-word) bitmap consumed by
// serialization_opts::extra_ascii_to_escape_bitmap. Bit (c % 64) of word
// (c / 64) is set for every character c of `chars` that lies in the range
// [0x20, 0x80); every other byte of `chars` is ignored.
std::array<uint64_t, 2> buildExtraAsciiToEscapeBitmap(StringPiece chars) {
  std::array<uint64_t, 2> bitmap{{0, 0}};
  for (const char ch : chars) {
    // Work on the unsigned byte value so the range check reads the same
    // whether plain `char` is signed or unsigned on the target.
    const auto code = static_cast<unsigned char>(ch);
    if (code < 0x20 || code >= 0x80) {
      continue;
    }
    bitmap[code / 64] |= uint64_t(1) << (code % 64);
  }
  return bitmap;
}
std::string serialize(dynamic const& dyn, serialization_opts const& opts) { std::string serialize(dynamic const& dyn, serialization_opts const& opts) {
std::string ret; std::string ret;
unsigned indentLevel = 0; unsigned indentLevel = 0;
...@@ -611,8 +621,8 @@ std::string serialize(dynamic const& dyn, serialization_opts const& opts) { ...@@ -611,8 +621,8 @@ std::string serialize(dynamic const& dyn, serialization_opts const& opts) {
// Fast path to determine the longest prefix that can be left // Fast path to determine the longest prefix that can be left
// unescaped in a string of sizeof(T) bytes packed in an integer of // unescaped in a string of sizeof(T) bytes packed in an integer of
// type T. // type T.
template <class T> template <bool EnableExtraAsciiEscapes, class T>
size_t firstEscapableInWord(T s) { size_t firstEscapableInWord(T s, const serialization_opts& opts) {
static_assert(std::is_unsigned<T>::value, "Unsigned integer required"); static_assert(std::is_unsigned<T>::value, "Unsigned integer required");
static constexpr T kOnes = ~T() / 255; // 0x...0101 static constexpr T kOnes = ~T() / 255; // 0x...0101
static constexpr T kMsbs = kOnes * 0x80; // 0x...8080 static constexpr T kMsbs = kOnes * 0x80; // 0x...8080
...@@ -635,6 +645,25 @@ size_t firstEscapableInWord(T s) { ...@@ -635,6 +645,25 @@ size_t firstEscapableInWord(T s) {
auto isLow = isLess(s, 0x20); // <= 0x1f auto isLow = isLess(s, 0x20); // <= 0x1f
auto needsEscape = isHigh | isLow | isChar('\\') | isChar('"'); auto needsEscape = isHigh | isLow | isChar('\\') | isChar('"');
if /* constexpr */ (EnableExtraAsciiEscapes) {
// Deal with optional bitmap for unicode escapes. Escapes can optionally be
// set for ascii characters 32 - 127, so the inner loop may run up to 96
// times. However, for the case where 0 or a handful of bits are set,
// looping will be minimal through use of findFirstSet.
for (size_t i = 0; i < opts.extra_ascii_to_escape_bitmap.size(); ++i) {
const auto offset = i * 64;
// Clear first 32 characters if this is the first index, since those are
// always escaped.
auto bitmap = opts.extra_ascii_to_escape_bitmap[i] &
(i == 0 ? uint64_t(-1) << 32 : ~0UL);
while (bitmap) {
auto bit = folly::findFirstSet(bitmap);
needsEscape |= isChar(offset + bit - 1);
bitmap &= bitmap - 1;
}
}
}
if (!needsEscape) { if (!needsEscape) {
return sizeof(T); return sizeof(T);
} }
...@@ -647,7 +676,8 @@ size_t firstEscapableInWord(T s) { ...@@ -647,7 +676,8 @@ size_t firstEscapableInWord(T s) {
} }
// Escape a string so that it is legal to print it in JSON text. // Escape a string so that it is legal to print it in JSON text.
void escapeString( template <bool EnableExtraAsciiEscapes>
void escapeStringImpl(
StringPiece input, StringPiece input,
std::string& out, std::string& out,
const serialization_opts& opts) { const serialization_opts& opts) {
...@@ -673,7 +703,7 @@ void escapeString( ...@@ -673,7 +703,7 @@ void escapeString(
} else { } else {
word = folly::partialLoadUnaligned<uint64_t>(firstEsc, avail); word = folly::partialLoadUnaligned<uint64_t>(firstEsc, avail);
} }
auto prefix = firstEscapableInWord(word); auto prefix = firstEscapableInWord<EnableExtraAsciiEscapes>(word, opts);
DCHECK_LE(prefix, avail); DCHECK_LE(prefix, avail);
firstEsc += prefix; firstEsc += prefix;
if (prefix < 8) { if (prefix < 8) {
...@@ -715,9 +745,19 @@ void escapeString( ...@@ -715,9 +745,19 @@ void escapeString(
} }
} }
} }
if (opts.encode_non_ascii && (*p & 0x80)) {
auto encodeUnicode = opts.encode_non_ascii && (*p & 0x80);
if /* constexpr */ (EnableExtraAsciiEscapes) {
encodeUnicode = encodeUnicode ||
(*p >= 0x20 && *p < 0x80 &&
(opts.extra_ascii_to_escape_bitmap[*p / 64] &
(uint64_t(1) << (*p % 64))));
}
if (encodeUnicode) {
// note that this if condition captures utf8 chars // note that this if condition captures utf8 chars
// with value > 127, so size > 1 byte // with value > 127, so size > 1 byte (or they are whitelisted for
// Unicode encoding).
// NOTE: char32_t / char16_t are both unsigned. // NOTE: char32_t / char16_t are both unsigned.
char32_t cp = utf8ToCodePoint(p, e, opts.skip_invalid_utf8); char32_t cp = utf8ToCodePoint(p, e, opts.skip_invalid_utf8);
auto writeHex = [&](char16_t v) { auto writeHex = [&](char16_t v) {
...@@ -771,6 +811,19 @@ void escapeString( ...@@ -771,6 +811,19 @@ void escapeString(
out.push_back('\"'); out.push_back('\"');
} }
// Escapes `input` into `out` according to `opts`. Selects the
// escapeStringImpl instantiation at run time: the <true> variant is used only
// when at least one bit of opts.extra_ascii_to_escape_bitmap is set, which is
// hinted to the compiler as the unlikely case.
void escapeString(
    StringPiece input,
    std::string& out,
    const serialization_opts& opts) {
  const auto& extraEscapes = opts.extra_ascii_to_escape_bitmap;
  const bool anyExtraEscape = extraEscapes[0] || extraEscapes[1];
  if (FOLLY_UNLIKELY(anyExtraEscape)) {
    escapeStringImpl<true>(input, out, opts);
    return;
  }
  escapeStringImpl<false>(input, out, opts);
}
std::string stripComments(StringPiece jsonC) { std::string stripComments(StringPiece jsonC) {
std::string result; std::string result;
enum class State { enum class State {
......
...@@ -68,7 +68,8 @@ struct serialization_opts { ...@@ -68,7 +68,8 @@ struct serialization_opts {
double_num_digits(0), // ignored when mode is SHORTEST double_num_digits(0), // ignored when mode is SHORTEST
double_fallback(false), double_fallback(false),
parse_numbers_as_strings(false), parse_numbers_as_strings(false),
recursion_limit(100) {} recursion_limit(100),
extra_ascii_to_escape_bitmap{{0, 0}} {}
// If true, keys in an object can be non-strings. (In strict // If true, keys in an object can be non-strings. (In strict
// JSON, object keys must be strings.) This is used by dynamic's // JSON, object keys must be strings.) This is used by dynamic's
...@@ -127,8 +128,22 @@ struct serialization_opts { ...@@ -127,8 +128,22 @@ struct serialization_opts {
// Recursion limit when parsing. // Recursion limit when parsing.
unsigned int recursion_limit; unsigned int recursion_limit;
// Bitmap representing ASCII characters to escape with unicode
// representations. The least significant bit of the first in the pair is
// ASCII value 0; the most significant bit of the second in the pair is ASCII
// value 127. Some specific characters in this range are always escaped
// regardless of the bitmask - namely characters less than 0x20, \, and ".
std::array<uint64_t, 2> extra_ascii_to_escape_bitmap;
}; };
/*
* Generates a bitmap with bits set for each of the ASCII characters provided
* for use in the serialization_opts extra_ascii_to_escape_bitmap option. Only
* characters in the range 0x20 - 0x7f set a bit; any other characters
* (control characters below 0x20 and bytes that are not valid ASCII) are
* ignored.
*/
std::array<uint64_t, 2> buildExtraAsciiToEscapeBitmap(StringPiece chars);
/* /*
* Main JSON serialization routine taking folly::dynamic parameters. * Main JSON serialization routine taking folly::dynamic parameters.
* For the most common use cases there are simpler functions in the * For the most common use cases there are simpler functions in the
...@@ -150,6 +165,7 @@ void escapeString( ...@@ -150,6 +165,7 @@ void escapeString(
* Strip all C99-like comments (i.e. // and / * ... * /) * Strip all C99-like comments (i.e. // and / * ... * /)
*/ */
std::string stripComments(StringPiece jsonC); std::string stripComments(StringPiece jsonC);
} // namespace json } // namespace json
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
......
...@@ -52,6 +52,15 @@ constexpr folly::StringPiece kLargeNonAsciiString = ...@@ -52,6 +52,15 @@ constexpr folly::StringPiece kLargeNonAsciiString =
"qwerty \xc2\x80 \xef\xbf\xbf poiuy" "qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"; "qwerty \xc2\x80 \xef\xbf\xbf poiuy";
// ASCII-only input for the jsonSerializeWithExtraUnicodeEscapes benchmark:
// the same HTML-like snippet repeated seven times. Each repetition contains
// the '<', '%' and '@' characters that the benchmark flags for escaping.
constexpr folly::StringPiece kLargeAsciiStringWithSpecialChars =
"<script>foo%@bar.com</script>"
"<script>foo%@bar.com</script>"
"<script>foo%@bar.com</script>"
"<script>foo%@bar.com</script>"
"<script>foo%@bar.com</script>"
"<script>foo%@bar.com</script>"
"<script>foo%@bar.com</script>";
TEST(Json, StripComments) { TEST(Json, StripComments) {
const std::string kTestDir = "folly/test/"; const std::string kTestDir = "folly/test/";
const std::string kTestFile = "json_test_data/commented.json"; const std::string kTestFile = "json_test_data/commented.json";
...@@ -112,6 +121,18 @@ BENCHMARK(jsonSerializeAsciiWithUtf8Validation, iters) { ...@@ -112,6 +121,18 @@ BENCHMARK(jsonSerializeAsciiWithUtf8Validation, iters) {
} }
} }
// Benchmarks serialization of a large ASCII string while '<', '%' and '@' are
// flagged for extra unicode escaping, i.e. with a non-empty escape bitmap.
BENCHMARK(jsonSerializeWithExtraUnicodeEscapes, iters) {
  folly::json::serialization_opts opts;
  opts.extra_ascii_to_escape_bitmap =
      folly::json::buildExtraAsciiToEscapeBitmap("<%@");
  const dynamic obj = kLargeAsciiStringWithSpecialChars;

  for (size_t iter = 0; iter != iters; ++iter) {
    folly::json::serialize(obj, opts);
  }
}
BENCHMARK(parseSmallStringWithUtf, iters) { BENCHMARK(parseSmallStringWithUtf, iters) {
for (size_t i = 0; i < iters << 4; ++i) { for (size_t i = 0; i < iters << 4; ++i) {
parseJson("\"I \\u2665 UTF-8 thjasdhkjh blah blah blah\""); parseJson("\"I \\u2665 UTF-8 thjasdhkjh blah blah blah\"");
......
...@@ -608,3 +608,86 @@ TEST(Json, RecursionLimit) { ...@@ -608,3 +608,86 @@ TEST(Json, RecursionLimit) {
opts_high_recursion_limit.recursion_limit = 10000; opts_high_recursion_limit.recursion_limit = 10000;
parseJson(in, opts_high_recursion_limit); parseJson(in, opts_high_recursion_limit);
} }
// Verifies that characters flagged through extra_ascii_to_escape_bitmap are
// emitted as \uXXXX escapes, that unflagged characters are left alone, and
// that the serialized text parses back to the original value.
TEST(Json, ExtraEscapes) {
  folly::json::serialization_opts opts;

  // Flags exactly the characters in `chars` for escaping (replacing any
  // previously configured bitmap) and serializes `doc` with those options.
  const auto serializeEscaping = [&opts](const dynamic& doc, const char* chars) {
    opts.extra_ascii_to_escape_bitmap =
        folly::json::buildExtraAsciiToEscapeBitmap(chars);
    return folly::json::serialize(doc, opts);
  };

  const dynamic asciiDoc = dynamic::object("a", "<foo@bar%baz?>");

  // '@' is 64: the lowest bit of the second bitmap word, and nothing else.
  auto json = serializeEscaping(asciiDoc, "@");
  EXPECT_EQ("{\"a\":\"<foo\\u0040bar%baz?>\"}", json);
  EXPECT_EQ(asciiDoc, folly::parseJson(json));

  // '?' is 63: the highest bit of the first bitmap word, and nothing else.
  json = serializeEscaping(asciiDoc, "?");
  EXPECT_EQ("{\"a\":\"<foo@bar%baz\\u003f>\"}", json);
  EXPECT_EQ(asciiDoc, folly::parseJson(json));

  // Several bits spread over both bitmap words.
  json = serializeEscaping(asciiDoc, "<%@?");
  EXPECT_EQ("{\"a\":\"\\u003cfoo\\u0040bar\\u0025baz\\u003f>\"}", json);
  EXPECT_EQ(asciiDoc, folly::parseJson(json));

  // Non-ASCII bytes are expected to come out unchanged when only '@' is
  // flagged.
  const dynamic nonAsciiDoc = dynamic::object("a", "a\xe0\xa0\x80z\xc0\x80");
  json = serializeEscaping(nonAsciiDoc, "@");
  EXPECT_EQ("{\"a\":\"a\xe0\xa0\x80z\xc0\x80\"}", json);
  EXPECT_EQ(nonAsciiDoc, folly::parseJson(json));
}
// Verifies the exact bitmap words produced by buildExtraAsciiToEscapeBitmap:
// character c must set bit (c % 64) of word (c / 64), independent of the order
// of the input characters and of duplicates.
TEST(Json, CharsToUnicodeEscape) {
  // Returns a 64-bit mask with only bit `n` set. The shift is performed on a
  // uint64_t instead of an `unsigned long` literal because `unsigned long` is
  // only 32 bits wide on LLP64 platforms, where `1UL << 63` is undefined.
  auto bit = [](unsigned n) { return uint64_t(1) << n; };

  // Checks both words of a generated bitmap: `zero` is the expected word for
  // characters 0 - 63, `one` the expected word for characters 64 - 127.
  auto testPair = [](std::array<uint64_t, 2> arr, uint64_t zero, uint64_t one) {
    EXPECT_EQ(zero, arr[0]);
    EXPECT_EQ(one, arr[1]);
  };

  testPair(folly::json::buildExtraAsciiToEscapeBitmap(""), 0, 0);

  // ?=63
  testPair(folly::json::buildExtraAsciiToEscapeBitmap("?"), bit(63), 0);

  // @=64
  testPair(folly::json::buildExtraAsciiToEscapeBitmap("@"), 0, bit(64 - 64));

  testPair(
      folly::json::buildExtraAsciiToEscapeBitmap("?@"),
      bit(63),
      bit(64 - 64));

  testPair(
      folly::json::buildExtraAsciiToEscapeBitmap("@?"),
      bit(63),
      bit(64 - 64));

  // duplicates
  testPair(
      folly::json::buildExtraAsciiToEscapeBitmap("@?@?"),
      bit(63),
      bit(64 - 64));

  // ?=63, @=64, $=36
  testPair(
      folly::json::buildExtraAsciiToEscapeBitmap("?@$"),
      bit(63) | bit(36),
      bit(64 - 64));

  // ?=63, @=64, $=36, !=33
  testPair(
      folly::json::buildExtraAsciiToEscapeBitmap("?@$!"),
      bit(63) | bit(36) | bit(33),
      bit(64 - 64));

  // ?=63, @=64, $=36, !=33, ]=93
  testPair(
      folly::json::buildExtraAsciiToEscapeBitmap("?@$!]"),
      bit(63) | bit(36) | bit(33),
      bit(64 - 64) | bit(93 - 64));
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment