Commit 98889326, authored by Song Zhou, committed by Facebook Github Bot

Added a new variant of byLine to keep the delimiter

Summary: The new method byLineFull does not trim the delimiter, so consumers can check whether the final line ends with the delimiter.

Reviewed By: philippv, yfeldblum

Differential Revision: D5085371

fbshipit-source-id: 5045127ee11d008e3cd7d13d33bffad280fe0a7e
parent 7e04d475
......@@ -120,6 +120,15 @@ class FileWriter : public Operator<FileWriter> {
std::unique_ptr<IOBuf> buffer_;
};
// Shared implementation behind byLine() and byLineFull().
//
// Builds the pipeline fromFile(file) | eachAs<StringPiece>() | resplit(...),
// i.e. the file's contents are re-split on `delim`. `keepDelimiter` is
// forwarded unchanged to resplit(): when true, each yielded piece keeps its
// trailing delimiter; when false, the delimiter is removed.
//
// The trailing return type repeats the pipeline expression because the
// composed generator type is not spelled out anywhere; keep the two in sync.
inline auto byLineImpl(File file, char delim, bool keepDelimiter)
-> decltype(fromFile(std::move(file))
| eachAs<StringPiece>()
| resplit(delim, keepDelimiter)) {
return fromFile(std::move(file))
| eachAs<StringPiece>()
| resplit(delim, keepDelimiter);
}
} // !detail
/**
......@@ -127,13 +136,24 @@ class FileWriter : public Operator<FileWriter> {
* Note: This produces StringPieces which reference temporary strings which are
* only valid during iteration.
*/
// Variant of byLine() that does NOT trim the delimiter: it calls
// detail::byLineImpl() with keepDelimiter = true. Callers can therefore tell
// whether the final line of the file was terminated by `delim`.
// `delim` defaults to '\n'.
inline auto byLineFull(File file, char delim = '\n')
-> decltype(detail::byLineImpl(std::move(file), delim, true)) {
return detail::byLineImpl(std::move(file), delim, true);
}
// Convenience overload: wraps `fd` in a File and forwards to
// byLineFull(File, char), so the delimiter is kept on each piece.
inline auto byLineFull(int fd, char delim = '\n')
-> decltype(byLineFull(File(fd), delim)) {
return byLineFull(File(fd), delim);
}
// Convenience overload: constructs a File from `f` and forwards to
// byLineFull(File, char), so the delimiter is kept on each piece.
// NOTE(review): `f` is presumably a file path (the unit test passes one) --
// confirm against the File constructor.
inline auto byLineFull(const char* f, char delim = '\n')
-> decltype(byLineFull(File(f), delim)) {
return byLineFull(File(f), delim);
}
inline auto byLine(File file, char delim = '\n')
-> decltype(fromFile(std::move(file))
| eachAs<StringPiece>()
| resplit(delim)) {
return fromFile(std::move(file))
| eachAs<StringPiece>()
| resplit(delim);
-> decltype(detail::byLineImpl(std::move(file), delim, false)) {
return detail::byLineImpl(std::move(file), delim, false);
}
inline auto byLine(int fd, char delim = '\n')
......@@ -141,5 +161,4 @@ inline auto byLine(int fd, char delim = '\n')
// Convenience overload: constructs a File from `f` and forwards to
// byLine(File, char). `delim` defaults to '\n'.
// NOTE(review): `f` is presumably a file path -- confirm against the File
// constructor.
inline auto byLine(const char* f, char delim = '\n')
-> decltype(byLine(File(f), delim)) { return byLine(File(f), delim); }
}} // !folly::gen
......@@ -213,16 +213,23 @@ namespace detail {
class StringResplitter : public Operator<StringResplitter> {
char delimiter_;
bool keepDelimiter_;
public:
explicit StringResplitter(char delimiter) : delimiter_(delimiter) { }
explicit StringResplitter(char delimiter, bool keepDelimiter = false)
: delimiter_(delimiter), keepDelimiter_(keepDelimiter) {}
template <class Source>
class Generator : public GenImpl<StringPiece, Generator<Source>> {
Source source_;
char delimiter_;
bool keepDelimiter_;
public:
Generator(Source source, char delimiter)
: source_(std::move(source)), delimiter_(delimiter) { }
Generator(Source source, char delimiter, bool keepDelimiter)
: source_(std::move(source)),
delimiter_(delimiter),
keepDelimiter_(keepDelimiter) {}
template <class Body>
bool apply(Body&& body) const {
......@@ -236,7 +243,9 @@ class StringResplitter : public Operator<StringResplitter> {
if (s.back() != this->delimiter_) {
return body(s);
}
s.pop_back(); // Remove the 1-character delimiter
if (!keepDelimiter_) {
s.pop_back(); // Remove the 1-character delimiter
}
return body(s);
});
if (!source_.apply(splitter)) {
......@@ -252,14 +261,14 @@ class StringResplitter : public Operator<StringResplitter> {
class Value,
class Gen = Generator<Source>>
Gen compose(GenImpl<Value, Source>&& source) const {
return Gen(std::move(source.self()), delimiter_);
return Gen(std::move(source.self()), delimiter_, keepDelimiter_);
}
template<class Source,
class Value,
class Gen = Generator<Source>>
Gen compose(const GenImpl<Value, Source>& source) const {
return Gen(source.self(), delimiter_);
return Gen(source.self(), delimiter_, keepDelimiter_);
}
};
......
......@@ -54,9 +54,9 @@ class SplitTo;
*/
// make this a template so we don't require StringResplitter to be complete
// until use
template <class S=detail::StringResplitter>
S resplit(char delimiter) {
return S(delimiter);
template <class S = detail::StringResplitter>
S resplit(char delimiter, bool keepDelimiter = false) {
return S(delimiter, keepDelimiter);
}
template <class S = detail::SplitStringSource<char>>
......
......@@ -16,6 +16,7 @@
#include <string>
#include <vector>
#include <folly/Array.h>
#include <folly/File.h>
#include <folly/Range.h>
#include <folly/experimental/TestUtil.h>
......@@ -56,7 +57,34 @@ TEST(FileGen, ByLine) {
}
}
class FileGenBufferedTest : public ::testing::TestWithParam<int> { };
// Round-trip check for byLineFull(): since every piece keeps its delimiter,
// joining the pieces with an empty separator must reproduce the file contents
// exactly. The cases cover a file whose last line is unterminated, one whose
// last line is terminated, a file that is just "\n", and an empty file.
TEST(FileGen, ByLineFull) {
auto cases = std::vector<std::string> {
stripLeftMargin(R"(
Hello world
This is the second line
a few empty lines above
incomplete last line)"),
"complete last line\n",
"\n",
""};
for (auto& lines : cases) {
// Write the case verbatim to a fresh temporary file; the whole buffer must
// be written in a single write() call.
test::TemporaryFile file("ByLineFull");
EXPECT_EQ(lines.size(), write(file.fd(), lines.data(), lines.size()));
// Read it back through the const char* overload and concatenate the pieces
// with no separator.
auto found =
byLineFull(file.path().string().c_str()) | unsplit<std::string>("");
EXPECT_EQ(lines, found);
}
}
class FileGenBufferedTest : public ::testing::TestWithParam<int> {};
TEST_P(FileGenBufferedTest, FileWriter) {
size_t bufferSize = GetParam();
......
......@@ -260,6 +260,30 @@ TEST(StringGen, Resplit) {
}
}
// Verifies resplit(',', true): every piece except possibly the last keeps its
// trailing ',' and an empty field between two delimiters is yielded as ",".
TEST(StringGen, ResplitKeepDelimiter) {
// Copy each yielded piece into a std::string and gather them in a vector.
auto collect = eachTo<std::string>() | as<vector>();
{
// Case 1: the whole text arrives as a single input chunk.
auto pieces =
from({"hello,, world, goodbye, meow"}) | resplit(',', true) | collect;
ASSERT_EQ(5, pieces.size());
EXPECT_EQ("hello,", pieces[0]);
EXPECT_EQ(",", pieces[1]);
EXPECT_EQ(" world,", pieces[2]);
EXPECT_EQ(" goodbye,", pieces[3]);
EXPECT_EQ(" meow", pieces[4]);
}
{
// Case 2: the same text split across several input chunks at boundaries
// that do not line up with the delimiters; the output must be identical to
// case 1.
auto pieces = from({"hel", "lo,", ", world", ", goodbye, m", "eow"}) |
resplit(',', true) | collect;
ASSERT_EQ(5, pieces.size());
EXPECT_EQ("hello,", pieces[0]);
EXPECT_EQ(",", pieces[1]);
EXPECT_EQ(" world,", pieces[2]);
EXPECT_EQ(" goodbye,", pieces[3]);
EXPECT_EQ(" meow", pieces[4]);
}
}
void checkResplitMaxLength(vector<string> ins,
char delim,
uint64_t maxLength,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment