Commit 67f7bb72, authored by Zbigniew Szymanski, committed by Facebook Github Bot

Move internal `decodeUtf8` method from json.cpp to public util Unicode.h

Summary:
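Moved `decodeUtf8` from json.cpp to `folly::utf8ToCodePoint` in Unicode.h.
The implementation was left unchanged (only the prefix of the error messages
was renamed to match the new function name) so that the move itself cannot
introduce bugs.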

Reviewed By: yfeldblum

Differential Revision: D4372739

fbshipit-source-id: a015a9c47ece825e09e7c243fae454f21f99db80
parent 1d36d464
......@@ -15,6 +15,7 @@
*/
#include <folly/Unicode.h>
#include <folly/Conv.h>
namespace folly {
......@@ -48,6 +49,91 @@ std::string codePointToUtf8(char32_t cp) {
return result;
}
/**
 * Decode a single Unicode code point from the UTF-8 bytes in [p, e).
 *
 * Well-formed sequences of one to four bytes are accepted. Rejected input:
 *   - an empty range (p >= e),
 *   - a continuation byte (10xxxxxx) in the lead position,
 *   - a lead byte followed by a byte that is not a continuation byte,
 *   - overlong encodings (a value that fits in a shorter sequence),
 *   - the surrogate range U+D800..U+DFFF,
 *   - values above U+10FFFF,
 *   - sequences truncated by e, and the obsolete 5 and 6 byte forms.
 *
 * Note for callers: the result may be greater than 0xFFFF. Code that renders
 * the value as one four-hex-digit JSON escape has to split such values into
 * a surrogate pair itself.
 *
 * @param p            In/out cursor. On success it is advanced past the bytes
 *                     of the decoded sequence. On a skipped error it is
 *                     advanced by exactly one byte (also when the range was
 *                     empty). It is left untouched when an exception is
 *                     thrown.
 * @param e            One past the last readable byte.
 * @param skipOnError  When true, errors yield U+FFFD instead of an exception.
 * @return The decoded code point, or U+FFFD for skipped errors.
 * @throws std::runtime_error on rejected input when skipOnError is false.
 */
char32_t utf8ToCodePoint(
    const unsigned char*& p,
    const unsigned char* const e,
    bool skipOnError) {
  /* The following encodings are valid, except for the 5 and 6 byte
   * combinations:
   * 0xxxxxxx
   * 110xxxxx 10xxxxxx
   * 1110xxxx 10xxxxxx 10xxxxxx
   * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
   * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
   * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
   */

  // Error recovery: consume one byte and report the replacement character.
  const auto skip = [&] {
    ++p;
    return U'\ufffd';
  };

  if (p >= e) {
    if (skipOnError) {
      return skip();
    }
    throw std::runtime_error("folly::utf8ToCodePoint empty/invalid string");
  }

  unsigned char fst = *p;
  if (!(fst & 0x80)) {
    // trivial case: single-byte (ASCII) sequence
    return *p++;
  }

  // bitMask[k] keeps the payload bits of a (k + 1)-byte sequence.
  static const uint32_t bitMask[] = {
      (1 << 7) - 1,
      (1 << 11) - 1,
      (1 << 16) - 1,
      (1 << 21) - 1,
  };

  // upper control bits are masked out later
  uint32_t d = fst;

  // A lead byte of a multi-byte sequence must start with 11; 10xxxxxx is a
  // stray continuation byte.
  if ((fst & 0xC0) != 0xC0) {
    if (skipOnError) {
      return skip();
    }
    throw std::runtime_error(
        "folly::utf8ToCodePoint i=0 d=" + std::to_string(d));
  }

  // Each left shift moves the next length-marker bit of the lead byte into
  // bit 7; the sequence is complete once that bit reads 0.
  fst <<= 1;

  // i is the index of the continuation byte being consumed. Up to three
  // continuation bytes are allowed so that four-byte sequences decode.
  for (unsigned int i = 1; i != 4 && p + i < e; ++i) {
    const unsigned char tmp = p[i];

    if ((tmp & 0xC0) != 0x80) {
      if (skipOnError) {
        return skip();
      }
      throw std::runtime_error(
          "folly::utf8ToCodePoint i=" + std::to_string(i) +
          " tmp=" + std::to_string(static_cast<uint32_t>(tmp)));
    }

    d = (d << 6) | (tmp & 0x3F);
    fst <<= 1;

    if (!(fst & 0x80)) {
      // Sequence of i + 1 bytes is complete: drop the lead byte's marker bits.
      d &= bitMask[i];

      // overlong, could have been encoded with i bytes
      if ((d & ~bitMask[i - 1]) == 0) {
        if (skipOnError) {
          return skip();
        }
        throw std::runtime_error(
            "folly::utf8ToCodePoint i=" + std::to_string(i) +
            " d=" + std::to_string(d));
      }

      // Surrogates can only be produced by three-byte sequences; values past
      // U+10FFFF can only be produced by four-byte sequences.
      const bool surrogate = i == 2 && d >= 0xD800 && d <= 0xDFFF;
      if (surrogate || d > 0x10FFFF) {
        if (skipOnError) {
          return skip();
        }
        throw std::runtime_error(
            "folly::utf8ToCodePoint i=" + std::to_string(i) +
            " d=" + std::to_string(d));
      }

      p += i + 1;
      return d;
    }
  }

  // Reached for sequences cut off by e and for 5/6 byte lead bytes.
  if (skipOnError) {
    return skip();
  }
  throw std::runtime_error("folly::utf8ToCodePoint encoding length maxed out");
}
//////////////////////////////////////////////////////////////////////
}
......@@ -31,6 +31,14 @@ namespace folly {
*/
std::string codePointToUtf8(char32_t cp);
/*
 * Decode a single Unicode code point from a UTF-8 byte sequence.
 *
 * p is an in/out cursor into the input and e marks the end of the readable
 * range (one past the last byte). On success the decoded code point is
 * returned and p is advanced past the bytes that were consumed.
 *
 * If the bytes at p do not form an acceptable sequence:
 *   - with skipOnError == true, U+FFFD (the replacement character) is
 *     returned and p is advanced by exactly one byte;
 *   - with skipOnError == false, std::runtime_error is thrown.
 */
char32_t utf8ToCodePoint(
const unsigned char*& p,
const unsigned char* const e,
bool skipOnError);
//////////////////////////////////////////////////////////////////////
}
......@@ -33,90 +33,6 @@ namespace folly {
namespace json {
namespace {
/*
 * Decode one Unicode code point from the UTF-8 bytes in [p, e).
 *
 * p is an in/out cursor: on success it is advanced past the consumed bytes
 * and the decoded code point is returned. e is one past the last readable
 * byte.
 *
 * On any error the function either throws std::runtime_error
 * (skipOnError == false) or returns U+FFFD after advancing p by one byte
 * (skipOnError == true). The one-byte advance also happens when the range is
 * already empty (p >= e).
 *
 * NOTE(review): the loop below stops at i == 3, so at most two continuation
 * bytes are ever examined. A four-byte lead byte (11110xxx) therefore never
 * completes and ends in the "encoding length maxed out" error, and the
 * `d > 0x10FFFF` test inside the `i == 2` branch cannot be true because a
 * three-byte payload is masked to 16 bits.
 */
char32_t decodeUtf8(
const unsigned char*& p,
const unsigned char* const e,
bool skipOnError) {
/* The following encodings are valid, except for the 5 and 6 byte
* combinations:
* 0xxxxxxx
* 110xxxxx 10xxxxxx
* 1110xxxx 10xxxxxx 10xxxxxx
* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
*/
// Error recovery used in skipOnError mode: consume one byte and hand back
// the replacement character U+FFFD.
auto skip = [&] { ++p; return U'\ufffd'; };
// Nothing left to read.
if (p >= e) {
if (skipOnError) return skip();
throw std::runtime_error("folly::decodeUtf8 empty/invalid string");
}
unsigned char fst = *p;
if (!(fst & 0x80)) {
// trivial case: a single byte with the high bit clear is returned as-is
return *p++;
}
// bitMask[k] keeps the payload bits of a (k + 1)-byte sequence:
// 7, 11, 16 and 21 bits respectively.
static const uint32_t bitMask[] = {
(1 << 7) - 1,
(1 << 11) - 1,
(1 << 16) - 1,
(1 << 21) - 1
};
// upper control bits are masked out later
uint32_t d = fst;
// A multi-byte lead byte must start with the bits 11; a byte of the form
// 10xxxxxx in the lead position is rejected here.
if ((fst & 0xC0) != 0xC0) {
if (skipOnError) return skip();
throw std::runtime_error(to<std::string>("folly::decodeUtf8 i=0 d=", d));
}
// Each left shift moves the next length-marker bit of the lead byte into
// bit 7; the sequence is complete once that bit reads 0.
fst <<= 1;
// i is the index of the continuation byte being consumed; the `p + i < e`
// condition stops the loop when the input ends mid-sequence.
for (unsigned int i = 1; i != 3 && p + i < e; ++i) {
unsigned char tmp = p[i];
// Every continuation byte must have the form 10xxxxxx.
if ((tmp & 0xC0) != 0x80) {
if (skipOnError) return skip();
throw std::runtime_error(
to<std::string>("folly::decodeUtf8 i=", i, " tmp=", (uint32_t)tmp));
}
// Append the six payload bits of this continuation byte.
d = (d << 6) | (tmp & 0x3F);
fst <<= 1;
if (!(fst & 0x80)) {
// The sequence is i + 1 bytes long: strip the lead byte's marker bits.
d &= bitMask[i];
// overlong, could have been encoded with i bytes
if ((d & ~bitMask[i - 1]) == 0) {
if (skipOnError) return skip();
throw std::runtime_error(
to<std::string>("folly::decodeUtf8 i=", i, " d=", d));
}
// check for surrogates only needed for 3 bytes
if (i == 2) {
if ((d >= 0xD800 && d <= 0xDFFF) || d > 0x10FFFF) {
if (skipOnError) return skip();
throw std::runtime_error(
to<std::string>("folly::decodeUtf8 i=", i, " d=", d));
}
}
// Success: step over the lead byte and its i continuation bytes.
p += i + 1;
return d;
}
}
// Reached when the input ends before the sequence completes, or when the
// lead byte announces more bytes than the loop is willing to read.
if (skipOnError) return skip();
throw std::runtime_error("folly::decodeUtf8 encoding length maxed out");
}
struct Printer {
explicit Printer(
std::string& out,
......@@ -716,7 +632,7 @@ void escapeString(
if (q == p) {
// calling utf8_decode has the side effect of
// checking that utf8 encodings are valid
char32_t v = decodeUtf8(q, e, opts.skip_invalid_utf8);
char32_t v = utf8ToCodePoint(q, e, opts.skip_invalid_utf8);
if (opts.skip_invalid_utf8 && v == U'\ufffd') {
out.append(u8"\ufffd");
p = q;
......@@ -727,7 +643,7 @@ void escapeString(
if (opts.encode_non_ascii && (*p & 0x80)) {
// note that this if condition captures utf8 chars
// with value > 127, so size > 1 byte
char32_t v = decodeUtf8(p, e, opts.skip_invalid_utf8);
char32_t v = utf8ToCodePoint(p, e, opts.skip_invalid_utf8);
out.append("\\u");
out.push_back(hexDigit(uint8_t(v >> 12)));
out.push_back(hexDigit((v >> 8) & 0x0f));
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment