make folly::toJson retain non-ascii chars if encode_non_ascii is disabled

Summary: folly::toJson as demonstrated by the test cases was wrongly encoding utf8 strings. specifically, a utf8 char made up of x bytes was encodeded into x unicode chars. for example, the char: \u2665 which is made of 3 bytes: \xe2\x99\xa5 was encoded correctly when using encode_non_ascii = true: "\u2665" but when encode_non_ascii = false, the json value was wrongly set as: "\u00e2\u0099\u00a5" because we use an signed char that wrongly detects non-readable chars with ascii value > 127 as control chars with ascii value < 32 (\t, \n, etc.) Test Plan: run the test Reviewed By: delong.j@fb.com FB internal diff: D612782

make folly::toJson retain non-ascii chars if encode_non_ascii is disabled
Summary: folly::toJson as demonstrated by the test cases was wrongly encoding utf8 strings. specifically, a utf8 char made up of x bytes was encodeded into x unicode chars. for example, the char: \u2665 which is made of 3 bytes: \xe2\x99\xa5 was encoded correctly when using encode_non_ascii = true: "\u2665" but when encode_non_ascii = false, the json value was wrongly set as: "\u00e2\u0099\u00a5" because we use an signed char that wrongly detects non-readable chars with ascii value > 127 as control chars with ascii value < 32 (\t, \n, etc.) Test Plan: run the test Reviewed By: delong.j@fb.com FB internal diff: D612782
e18efdc4 · Hari Manikarnika · Jordan DeLong · 6e230a3f · e18efdc4 · e18efdc4
Commit e18efdc4 authored Oct 26, 2012 by Hari Manikarnika Committed by Jordan DeLong Dec 16, 2012
Hide whitespace changes
Inline Side-by-side

Showing with 63 additions and 11 deletions

folly/json.cpp folly/json.cpp +8 -6

folly/test/JsonTest.cpp folly/test/JsonTest.cpp +55 -5

No files found.
--- a/folly/json.cpp
+++ b/folly/json.cpp
@@ -30,7 +30,7 @@ namespace folly {
 namespace json {
 namespace {

-char32_t decodeUtf8(const char*& p, const char* const e) {
+char32_t decodeUtf8(const unsigned char*& p, const unsigned char* const e) {
  /* The following encodings are valid, except for the 5 and 6 byte
   * combinations:
   * 0xxxxxxx
@@ -115,9 +115,9 @@ void escapeString(StringPiece input,
  out.reserve(out.size() + input.size() + 2);
  out.push_back('\"');

-  const char* p = input.begin();
-  const char* q = input.begin();
-  const char* const e = input.end();
+  auto* p = reinterpret_cast<const unsigned char*>(input.begin());
+  auto* q = reinterpret_cast<const unsigned char*>(input.begin());
+  auto* e = reinterpret_cast<const unsigned char*>(input.end());

  while (p < e) {
    // Since non-ascii encoding inherently does utf8 validation
@@ -139,6 +139,8 @@ void escapeString(StringPiece input,
    }

    if (opts.encode_non_ascii && (*p & 0x80)) {
+      // note that this if condition captures utf8 chars
+      // with value > 127, so size > 1 byte
      char32_t v = decodeUtf8(p, e);
      out.append("\\u");
      out.push_back(hexDigit(v >> 12));
@@ -156,8 +158,8 @@ void escapeString(StringPiece input,
      case '\r': out.append("\\r"); p++; break;
      case '\t': out.append("\\t"); p++; break;
      default:
-        // note that this if condition captures both control characters
-        // and extended ascii characters
+        // note that this if condition captures non readable chars
+        // with value < 32, so size = 1 byte (e.g control chars).
        out.append("\\u00");
        out.push_back(hexDigit((*p & 0xf0) >> 4));
        out.push_back(hexDigit(*p & 0xf));

--- a/folly/test/JsonTest.cpp
+++ b/folly/test/JsonTest.cpp
@@ -246,20 +246,70 @@ TEST(Json, JsonNonAsciiEncoding) {
  EXPECT_ANY_THROW(folly::json::serialize("\xed\xaf\xbf\xed\xbf\xbf", opts));
 }

+TEST(Json, UTF8Retention) {
+
+  // test retention with valid utf8 strings
+  folly::fbstring input = "\u2665";
+  folly::fbstring jsonInput = folly::toJson(input);
+  folly::fbstring output = folly::parseJson(jsonInput).asString();
+  folly::fbstring jsonOutput = folly::toJson(output);
+
+  LOG(INFO) << "input: " << input
+            <<" => json: " << jsonInput;
+  LOG(INFO) << "output: " << output
+            <<" => json: " << jsonOutput;
+
+  EXPECT_EQ(input, output);
+  EXPECT_EQ(jsonInput, jsonOutput);
+
+  // test retention with invalid utf8 - note that non-ascii chars are retained
+  // as is, and no unicode encoding is attempted so no exception is thrown.
+  EXPECT_EQ(
+    folly::toJson("a\xe0\xa0\x80z\xc0\x80"),
+    "\"a\xe0\xa0\x80z\xc0\x80\""
+  );
+}
+
+TEST(Json, UTF8EncodeNonAsciiRetention) {
+
+  folly::json::serialization_opts opts;
+  opts.encode_non_ascii = true;
+
+  // test encode_non_ascii valid utf8 strings
+  folly::fbstring input = "\u2665";
+  folly::fbstring jsonInput = folly::json::serialize(input, opts);
+  folly::fbstring output = folly::parseJson(jsonInput).asString();
+  folly::fbstring jsonOutput = folly::json::serialize(output, opts);
+
+  LOG(INFO) << "input: " << input
+            <<" => json: " << jsonInput;
+  LOG(INFO) << "output: " << output
+            <<" => json: " << jsonOutput;
+
+  EXPECT_EQ(input, output);
+  EXPECT_EQ(jsonInput, jsonOutput);
+
+  // test encode_non_ascii with invalid utf8 - note that an attempt to encode
+  // non-ascii to unicode will result is a utf8 validation and throw exceptions.
+  EXPECT_ANY_THROW(folly::json::serialize("a\xe0\xa0\x80z\xc0\x80", opts));
+  EXPECT_ANY_THROW(folly::json::serialize("a\xe0\xa0\x80z\xe0\x80\x80", opts));
+}
+
 TEST(Json, UTF8Validation) {
  folly::json::serialization_opts opts;
  opts.validate_utf8 = true;

-  // valid utf8 strings
-  EXPECT_EQ(folly::json::serialize("a\xc2\x80z", opts), R"("a\u00c2\u0080z")");
+  // test validate_utf8 valid utf8 strings - note that we only validate the
+  // for utf8 but don't encode non-ascii to unicode so they are retained as is.
+  EXPECT_EQ(folly::json::serialize("a\xc2\x80z", opts), "\"a\xc2\x80z\"");
  EXPECT_EQ(
    folly::json::serialize("a\xe0\xa0\x80z", opts),
-    R"("a\u00e0\u00a0\u0080z")");
+    "\"a\xe0\xa0\x80z\"");
  EXPECT_EQ(
    folly::json::serialize("a\xe0\xa0\x80m\xc2\x80z", opts),
-    R"("a\u00e0\u00a0\u0080m\u00c2\u0080z")");
+    "\"a\xe0\xa0\x80m\xc2\x80z\"");

-  // test with invalid utf8
+  // test validate_utf8 with invalid utf8
  EXPECT_ANY_THROW(folly::json::serialize("a\xe0\xa0\x80z\xc0\x80", opts));
  EXPECT_ANY_THROW(folly::json::serialize("a\xe0\xa0\x80z\xe0\x80\x80", opts));
 }