Commit 2f0cabfb authored by Giuseppe Ottaviano's avatar Giuseppe Ottaviano Committed by Facebook Github Bot

Optimize JSON escaping of ASCII strings

Summary:
`escapeString` is very slow even when there's very little to nothing to escape. This diff adds a fast path to copy sequences of bytes that don't need escaping.
It also optimizes appending escape sequences: `string::push_back` is slow because it has to do a capacity check for every character.

Before:
```
  ============================================================================
  folly/test/JsonOtherTest.cpp                    relative  time/iter  iters/s
  ============================================================================
  jsonSerialize                                              818.55ns    1.22M
  jsonSerializeWithNonAsciiEncoding                            1.35us  738.06K
  jsonSerializeWithUtf8Validation                              1.42us  705.60K
  jsonSerializeAsciiWithUtf8Validation                         3.27us  306.06K
  parseSmallStringWithUtf                                      1.91us  522.38K
  parseNormalString                                            1.51us  660.27K
  parseBigString                                             384.44ns    2.60M
  toJson                                                     480.54ns    2.08M
  ============================================================================
```

After:
```
  ============================================================================
  folly/test/JsonOtherTest.cpp                    relative  time/iter  iters/s
  ============================================================================
  jsonSerialize                                              781.69ns    1.28M
  jsonSerializeWithNonAsciiEncoding                          847.68ns    1.18M
  jsonSerializeWithUtf8Validation                            928.68ns    1.08M
  jsonSerializeAsciiWithUtf8Validation                       199.85ns    5.00M
  parseSmallStringWithUtf                                      1.93us  518.39K
  parseNormalString                                            1.45us  689.11K
  parseBigString                                             378.66ns    2.64M
  toJson                                                     446.38ns    2.24M
  ============================================================================
```

All string escaping benchmarks are slightly faster, and ASCII-only with no escapes is 8x faster.

Reviewed By: luciang, evilmucedin, yfeldblum

Differential Revision: D4793233

fbshipit-source-id: c40d07708bd787799c8c00f9f23a417b862ee9ae
parent 8e16a2eb
......@@ -17,11 +17,13 @@
#include <folly/json.h>
#include <algorithm>
#include <cassert>
#include <functional>
#include <type_traits>
#include <boost/next_prior.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/next_prior.hpp>
#include <folly/Bits.h>
#include <folly/Portability.h>
#include <folly/Conv.h>
#include <folly/Range.h>
......@@ -344,7 +346,7 @@ std::string parseString(Input& in);
dynamic parseNumber(Input& in);
dynamic parseObject(Input& in) {
assert(*in == '{');
DCHECK_EQ(*in, '{');
++in;
dynamic ret = dynamic::object;
......@@ -388,7 +390,7 @@ dynamic parseObject(Input& in) {
}
dynamic parseArray(Input& in) {
assert(*in == '[');
DCHECK_EQ(*in, '[');
++in;
dynamic ret = dynamic::array;
......@@ -523,7 +525,7 @@ std::string decodeUnicodeEscape(Input& in) {
}
std::string parseString(Input& in) {
assert(*in == '\"');
DCHECK_EQ(*in, '\"');
++in;
std::string ret;
......@@ -607,6 +609,44 @@ std::string serialize(dynamic const& dyn, serialization_opts const& opts) {
return ret;
}
// Fast path to determine the longest prefix that can be left
// unescaped in a string of sizeof(T) bytes packed in an integer of
// type T.
template <class T>
size_t firstEscapableInWord(T s) {
static_assert(std::is_unsigned<T>::value, "Unsigned integer required");
constexpr T kOnes = ~T() / 255; // 0x...0101
constexpr T kMsbs = kOnes * 0x80; // 0x...8080
// Sets the MSB of bytes < b. Precondition: b < 128.
auto isLess = [](T w, uint8_t b) {
// A byte is < b iff subtracting b underflows, so we check that
// the MSB wasn't set before and it's set after the subtraction.
return (w - kOnes * b) & ~w & kMsbs;
};
auto isChar = [&](uint8_t c) {
// A byte is == c iff it is 0 if xored with c.
return isLess(s ^ (kOnes * c), 1);
};
// The following masks have the MSB set for each byte of the word
// that satisfies the corresponding condition.
auto isHigh = s & kMsbs; // >= 128
auto isLow = isLess(s, 0x20); // <= 0x1f
auto needsEscape = isHigh | isLow | isChar('\\') | isChar('"');
if (!needsEscape) {
return sizeof(T);
}
if (folly::kIsLittleEndian) {
return folly::findFirstSet(needsEscape) / 8 - 1;
} else {
return sizeof(T) - folly::findLastSet(needsEscape) / 8;
}
}
// Escape a string so that it is legal to print it in JSON text.
void escapeString(
StringPiece input,
......@@ -623,18 +663,48 @@ void escapeString(
auto* e = reinterpret_cast<const unsigned char*>(input.end());
while (p < e) {
// Find the longest prefix that does not need escaping, and copy
// it literally into the output string.
auto firstEsc = p;
while (firstEsc < e) {
auto avail = e - firstEsc;
uint64_t word = 0;
if (avail >= 8) {
word = folly::loadUnaligned<uint64_t>(firstEsc);
} else {
memcpy(static_cast<void*>(&word), firstEsc, avail);
}
auto prefix = firstEscapableInWord(word);
DCHECK_LE(prefix, avail);
firstEsc += prefix;
if (prefix < 8) {
break;
}
}
if (firstEsc > p) {
out.append(reinterpret_cast<const char*>(p), firstEsc - p);
p = firstEsc;
// We can't be in the middle of a multibyte sequence, so we can reset q.
q = p;
if (p == e) {
break;
}
}
// Handle the next byte that may need escaping.
// Since non-ascii encoding inherently does utf8 validation
// we explicitly validate utf8 only if non-ascii encoding is disabled.
if ((opts.validate_utf8 || opts.skip_invalid_utf8)
&& !opts.encode_non_ascii) {
// to achieve better spatial and temporal coherence
// To achieve better spatial and temporal coherence
// we do utf8 validation progressively along with the
// string-escaping instead of two separate passes
// string-escaping instead of two separate passes.
// as the encoding progresses, q will stay at or ahead of p
CHECK(q >= p);
// As the encoding progresses, q will stay at or ahead of p.
CHECK_GE(q, p);
// as p catches up with q, move q forward
// As p catches up with q, move q forward.
if (q == p) {
// calling utf8_decode has the side effect of
// checking that utf8 encodings are valid
......@@ -650,14 +720,16 @@ void escapeString(
// note that this if condition captures utf8 chars
// with value > 127, so size > 1 byte
char32_t v = utf8ToCodePoint(p, e, opts.skip_invalid_utf8);
out.append("\\u");
out.push_back(hexDigit(uint8_t(v >> 12)));
out.push_back(hexDigit((v >> 8) & 0x0f));
out.push_back(hexDigit((v >> 4) & 0x0f));
out.push_back(hexDigit(v & 0x0f));
char buf[] = "\\u\0\0\0\0";
buf[2] = hexDigit(uint8_t(v >> 12));
buf[3] = hexDigit((v >> 8) & 0x0f);
buf[4] = hexDigit((v >> 4) & 0x0f);
buf[5] = hexDigit(v & 0x0f);
out.append(buf, 6);
} else if (*p == '\\' || *p == '\"') {
out.push_back('\\');
out.push_back(char(*p++));
char buf[] = "\\\0";
buf[1] = char(*p++);
out.append(buf, 2);
} else if (*p <= 0x1f) {
switch (*p) {
case '\b': out.append("\\b"); p++; break;
......@@ -666,11 +738,12 @@ void escapeString(
case '\r': out.append("\\r"); p++; break;
case '\t': out.append("\\t"); p++; break;
default:
// note that this if condition captures non readable chars
// Note that this if condition captures non readable chars
// with value < 32, so size = 1 byte (e.g control chars).
out.append("\\u00");
out.push_back(hexDigit(uint8_t((*p & 0xf0) >> 4)));
out.push_back(hexDigit(uint8_t(*p & 0xf)));
char buf[] = "\\u00\0\0";
buf[4] = hexDigit(uint8_t((*p & 0xf0) >> 4));
buf[5] = hexDigit(uint8_t(*p & 0xf));
out.append(buf, 6);
p++;
}
} else {
......
......@@ -17,7 +17,9 @@
#include <folly/json.h>
#include <folly/Benchmark.h>
#include <folly/Conv.h>
#include <folly/FileUtil.h>
#include <folly/Range.h>
#include <folly/portability/GFlags.h>
#include <folly/portability/GTest.h>
......@@ -25,6 +27,31 @@ using folly::dynamic;
using folly::parseJson;
using folly::toJson;
constexpr folly::StringPiece kLargeAsciiString =
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk";
constexpr folly::StringPiece kLargeNonAsciiString =
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy";
TEST(Json, StripComments) {
const std::string kTestDir = "folly/test/";
const std::string kTestFile = "json_test_data/commented.json";
......@@ -44,60 +71,44 @@ TEST(Json, StripComments) {
}
BENCHMARK(jsonSerialize, iters) {
const dynamic obj = kLargeNonAsciiString;
folly::json::serialization_opts opts;
for (size_t i = 0; i < iters; ++i) {
folly::json::serialize(
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy",
opts);
folly::json::serialize(obj, opts);
}
}
BENCHMARK(jsonSerializeWithNonAsciiEncoding, iters) {
const dynamic obj = kLargeNonAsciiString;
folly::json::serialization_opts opts;
opts.encode_non_ascii = true;
for (size_t i = 0; i < iters; ++i) {
folly::json::serialize(
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy",
opts);
folly::json::serialize(obj, opts);
}
}
BENCHMARK(jsonSerializeWithUtf8Validation, iters) {
const dynamic obj = kLargeNonAsciiString;
folly::json::serialization_opts opts;
opts.validate_utf8 = true;
for (size_t i = 0; i < iters; ++i) {
folly::json::serialize(
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy",
opts);
folly::json::serialize(obj, opts);
}
}
BENCHMARK(jsonSerializeAsciiWithUtf8Validation, iters) {
const dynamic obj = kLargeAsciiString;
folly::json::serialization_opts opts;
opts.validate_utf8 = true;
for (size_t i = 0; i < iters; ++i) {
folly::json::serialize(obj, opts);
}
}
......@@ -114,20 +125,10 @@ BENCHMARK(parseNormalString, iters) {
}
BENCHMARK(parseBigString, iters) {
const auto json = folly::to<std::string>('"', kLargeAsciiString, '"');
for (size_t i = 0; i < iters; ++i) {
parseJson("\""
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"\"");
parseJson(json);
}
}
......
......@@ -190,6 +190,53 @@ TEST(Json, JsonEscape) {
R"("\b\f\n\r\u0001\t\\\"/\u000b\u0007")");
}
TEST(Json, EscapeCornerCases) {
// The escaping logic uses some bitwise operations to determine
// which bytes need escaping 8 bytes at a time. Test that this logic
// is correct regardless of positions by planting 2 characters that
// may need escaping at each possible position and checking the
// result, for varying string lengths.
folly::json::serialization_opts opts;
opts.validate_utf8 = true;
std::string s;
std::string expected;
for (bool ascii : {true, false}) {
opts.encode_non_ascii = ascii;
for (size_t len = 2; len < 32; ++len) {
for (size_t i = 0; i < len; ++i) {
for (size_t j = 0; j < len; ++j) {
if (i == j) {
continue;
}
s.clear();
expected.clear();
expected.push_back('"');
for (size_t pos = 0; pos < len; ++pos) {
if (pos == i) {
s.push_back('\\');
expected.append("\\\\");
} else if (pos == j) {
s.append("\xe2\x82\xac");
expected.append(ascii ? "\\u20ac" : "\xe2\x82\xac");
} else {
s.push_back('x');
expected.push_back('x');
}
}
expected.push_back('"');
EXPECT_EQ(folly::json::serialize(s, opts), expected) << ascii;
}
}
}
}
}
TEST(Json, JsonNonAsciiEncoding) {
folly::json::serialization_opts opts;
opts.encode_non_ascii = true;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment