Commit d3fe49a0 authored by Sargun Vohra's avatar Sargun Vohra Committed by Facebook Github Bot

Add groupByAdjacent operator to folly::gen

Summary:
I find myself wanting something like this pretty often, so I tried my hand at adding it myself.

The `groupByAdjacent` operator starts a new group wherever the selector's value changes between adjacent elements. It's especially useful for processing sources that have already been sorted on the selector, such as the results of a database query.

Given the following source sequence with keys `A` and `B`:

```
[A1, A2, A3, B1, B2, B3, A4, A5, B4, B5]
```

a regular `groupBy` would return something like:

```
[A:[A1, A2, A3, A4, A5], B:[B1, B2, B3, B4, B5]]
```

while this `groupByAdjacent` would return:

```
[A:[A1, A2, A3], B:[B1, B2, B3], A:[A4, A5], B:[B4, B5]]
```

Given a source where the items are presorted by selector, `groupByAdjacent` should behave identically to `groupBy`, except that `groupByAdjacent` supports infinite sources since it doesn't need to collect the entire source in memory before creating any output.

Reviewed By: yfeldblum

Differential Revision: D9475326

fbshipit-source-id: 1c8db3abadce5e68394e5fa38bf4bee0b413a03f
parent 647220dd
...@@ -1169,6 +1169,107 @@ class GroupBy : public Operator<GroupBy<Selector>> { ...@@ -1169,6 +1169,107 @@ class GroupBy : public Operator<GroupBy<Selector>> {
} }
}; };
/**
 * GroupByAdjacent - Group adjacent values by a given key selector, producing a
 * sequence of groups. This differs from GroupBy in that only contiguous sets
 * of values with the same key are considered part of the same group. Unlike
 * GroupBy, this can be used on infinite sequences.
 *
 * This type is usually used through the 'groupByAdjacent' helper function:
 *
 *   auto tens
 *     = seq(0)
 *     | groupByAdjacent([](int i){ return (i / 10) % 2; });
 *
 * This example results in a list like [ 0:[0-9], 1:[10-19], 0:[20-29], ... ]
 */
template <class Selector>
class GroupByAdjacent : public Operator<GroupByAdjacent<Selector>> {
  Selector selector_;

 public:
  GroupByAdjacent() {}

  explicit GroupByAdjacent(Selector selector)
      : selector_(std::move(selector)) {}

  /**
   * Generator produced by composing this operator with a source. It yields
   * one Group<KeyDecayed, ValueDecayed>&& per run of adjacent source values
   * whose selector results compare equal.
   */
  template <
      class Value,
      class Source,
      class ValueDecayed = typename std::decay<Value>::type,
      class Key = invoke_result_t<Selector, Value>,
      class KeyDecayed = typename std::decay<Key>::type>
  class Generator
      : public GenImpl<
            Group<KeyDecayed, ValueDecayed>&&,
            Generator<Value, Source, ValueDecayed, Key, KeyDecayed>> {
    Source source_;
    Selector selector_;

   public:
    Generator(Source source, Selector selector)
        : source_(std::move(source)), selector_(std::move(selector)) {}

    typedef Group<KeyDecayed, ValueDecayed> GroupType;

    /**
     * Walks the source once, buffering the current run of values and handing
     * each completed group to `handler`.
     *
     * Returns false as soon as the source or the handler asks to stop, in
     * which case any partially collected group is dropped; otherwise returns
     * the handler's verdict on the final group (or true if the source was
     * empty and no group was ever started).
     */
    template <class Handler>
    bool apply(Handler&& handler) const {
      Optional<KeyDecayed> key = none;
      typename GroupType::VectorType values;

      bool result = source_.apply([&](Value value) mutable {
        KeyDecayed newKey = selector_(value);

        // start the first group
        if (!key.hasValue()) {
          key.emplace(newKey);
        }

        if (key == newKey) {
          // grow the current group; forwarding (rather than copying) lets
          // rvalue and move-only values be stored without an extra copy
          values.push_back(std::forward<Value>(value));
        } else {
          // flush the current group; the old key is replaced right below, so
          // it can be moved out instead of copied
          GroupType group(std::move(key.value()), std::move(values));
          if (!handler(std::move(group))) {
            return false;
          }

          // start a new group; `values` was moved from, so reset it to a
          // known-empty state before reusing it
          key.emplace(std::move(newKey));
          values.clear();
          values.push_back(std::forward<Value>(value));
        }
        return true;
      });

      // the source or the handler requested an early stop
      if (!result) {
        return false;
      }

      // empty source: no group was ever started
      if (!key.hasValue()) {
        return true;
      }

      // flush the final group
      GroupType group(std::move(key.value()), std::move(values));
      return handler(std::move(group));
    }

    // Groups are emitted as the source is consumed, so this generator is
    // exactly as (in)finite as its source.
    static constexpr bool infinite = Source::infinite;
  };

  // Compose with an rvalue source: the source is moved into the generator.
  template <class Source, class Value, class Gen = Generator<Value, Source>>
  Gen compose(GenImpl<Value, Source>&& source) const {
    return Gen(std::move(source.self()), selector_);
  }

  // Compose with an lvalue source: the source is copied into the generator.
  template <class Source, class Value, class Gen = Generator<Value, Source>>
  Gen compose(const GenImpl<Value, Source>& source) const {
    return Gen(source.self(), selector_);
  }
};
/* /*
* TypeAssertion - For verifying the exact type of the value produced by a * TypeAssertion - For verifying the exact type of the value produced by a
* generator. Useful for testing and debugging, and acts as a no-op at runtime. * generator. Useful for testing and debugging, and acts as a no-op at runtime.
......
...@@ -348,6 +348,9 @@ class Order; ...@@ -348,6 +348,9 @@ class Order;
template <class Selector> template <class Selector>
class GroupBy; class GroupBy;
// Forward declaration of the operator type returned by groupByAdjacent().
template <class Selector>
class GroupByAdjacent;
template <class Selector> template <class Selector>
class Distinct; class Distinct;
...@@ -686,6 +689,13 @@ GroupBy groupBy(Selector selector = Selector()) { ...@@ -686,6 +689,13 @@ GroupBy groupBy(Selector selector = Selector()) {
return GroupBy(std::move(selector)); return GroupBy(std::move(selector));
} }
/**
 * groupByAdjacent - Build a GroupByAdjacent operator around the given key
 * selector. Piping a source into the result groups runs of adjacent values
 * whose selector results are equal, e.g.
 *
 *   seq(0) | groupByAdjacent([](int i) { return (i / 10) % 2; })
 *
 * When no selector is supplied, a default-constructed Identity is used.
 * The selector is taken by value and moved into the returned operator.
 */
template <
class Selector = Identity,
class GroupByAdjacent = detail::GroupByAdjacent<Selector>>
GroupByAdjacent groupByAdjacent(Selector selector = Selector()) {
return GroupByAdjacent(std::move(selector));
}
template < template <
class Selector = Identity, class Selector = Identity,
class Distinct = detail::Distinct<Selector>> class Distinct = detail::Distinct<Selector>>
......
...@@ -1336,6 +1336,25 @@ TEST(Gen, GroupBy) { ...@@ -1336,6 +1336,25 @@ TEST(Gen, GroupBy) {
| as<vector>()); | as<vector>());
} }
// groupByAdjacent must cut a new group each time the selector's result
// changes between neighbouring elements, on finite and infinite sources alike.
TEST(Gen, GroupByAdjacent) {
  {
    // Finite source: consecutive strings of equal length share a group, and
    // a length that reappears later ("g" after "fff") opens a fresh group.
    vector<string> words{"a", "b", "cc", "dd", "ee", "fff", "g", "hhh"};
    vector<vector<string>> expected{
        {"a", "b"}, {"cc", "dd", "ee"}, {"fff"}, {"g"}, {"hhh"}};

    auto byLength = [](const string& str) { return str.size(); };
    auto actual = from(words) | groupByAdjacent(byLength) |
        mapOp(as<vector>()) | as<vector>();

    EXPECT_EQ(expected, actual);
  }

  {
    // Infinite source: the selector flips every five integers, and take(3)
    // stops the pipeline after the first three groups have been emitted.
    auto naturals = seq(0);
    vector<vector<int>> expected{
        {0, 1, 2, 3, 4}, {5, 6, 7, 8, 9}, {10, 11, 12, 13, 14}};

    auto inLowerHalf = [](const int& i) { return (i % 10) < 5; };
    auto actual = naturals | groupByAdjacent(inLowerHalf) | take(3) |
        mapOp(as<vector>()) | as<vector>();

    EXPECT_EQ(expected, actual);
  }
}
TEST(Gen, Unwrap) { TEST(Gen, Unwrap) {
Optional<int> o(4); Optional<int> o(4);
Optional<int> e; Optional<int> e;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment