Commit 6daec316 authored by Yedidya Feldblum's avatar Yedidya Feldblum Committed by facebook-github-bot-4

Extract SparseByteSet into its own module

Summary: [Folly] Extract `SparseByteSet` into its own module.

`SparseByteSet`, formerly `FastByteSet`, is actually a generic, fully standalone class. It does not need to be embedded in `Range.cpp`.

Reviewed By: @​bmaurer

Differential Revision: D2460180
parent 14a19db2
......@@ -120,6 +120,7 @@ nobase_follyinclude_HEADERS = \
experimental/StringKeyedUnorderedSet.h \
experimental/TestUtil.h \
experimental/TupleOps.h \
SparseByteSet.h \
FBString.h \
FBVector.h \
File.h \
......
......@@ -18,6 +18,7 @@
// @author Andrei Alexandrescu (andrei.alexandrescu@fb.com)
#include <folly/Range.h>
#include <folly/SparseByteSet.h>
#if FOLLY_HAVE_EMMINTRIN_H
#include <emmintrin.h> // __v16qi
......@@ -93,39 +94,13 @@ size_t qfind_first_byte_of_needles16(const StringPiece haystack,
}
#endif // FOLLY_HAVE_EMMINTRIN_H && GCC 4.6+
// Aho, Hopcroft, and Ullman refer to this trick in "The Design and Analysis
// of Computer Algorithms" (1974), but the best description is here:
// http://research.swtch.com/sparse
class FastByteSet {
public:
FastByteSet() : size_(0) { } // no init of arrays required!
inline void add(uint8_t i) {
if (!contains(i)) {
dense_[size_] = i;
sparse_[i] = size_;
size_++;
}
}
inline bool contains(uint8_t i) const {
DCHECK_LE(size_, 256);
return sparse_[i] < size_ && dense_[sparse_[i]] == i;
}
private:
uint16_t size_; // can't use uint8_t because it would overflow if all
// possible values were inserted.
uint8_t sparse_[256];
uint8_t dense_[256];
};
} // namespace
namespace detail {
size_t qfind_first_byte_of_byteset(const StringPiece haystack,
const StringPiece needles) {
FastByteSet s;
SparseByteSet s;
for (auto needle: needles) {
s.add(needle);
}
......
/*
* Copyright 2015 Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FOLLY_FAST_BYTE_SET_H_
#define FOLLY_FAST_BYTE_SET_H_
#include <cstdint>
#include <glog/logging.h>
namespace folly {
/***
* SparseByteSet
*
* A special-purpose data structure representing an insert-only set of bytes.
* May have better performance than std::bitset<256>, depending on workload.
*
* Operations:
* - add(byte)
* - contains(byte)
*
* Performance:
* - The entire capacity of the set is inline; the set never allocates.
* - The constructor zeros only the first two bytes of the object.
* - add and contains both run in constant time w.r.t. the size of the set.
* Constant time - not amortized constant - and with small constant factor.
*
* This data structure is ideal for on-stack use.
*
* Aho, Hopcroft, and Ullman refer to this trick in "The Design and Analysis
* of Computer Algorithms" (1974), but the best description is here:
* http://research.swtch.com/sparse
* http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.30.7319
*/
class SparseByteSet {
public:
// There are this many possible values:
static constexpr uint16_t kCapacity = 256;
// No init of byte-arrays required!
SparseByteSet() : size_(0) { }
/***
* add(byte)
*
* O(1), non-amortized.
*/
inline bool add(uint8_t i) {
bool r = !contains(i);
if (r) {
DCHECK_LT(size_, kCapacity);
dense_[size_] = i;
sparse_[i] = size_;
size_++;
}
return r;
}
/***
* contains(byte)
*
* O(1), non-amortized.
*/
inline bool contains(uint8_t i) const {
return sparse_[i] < size_ && dense_[sparse_[i]] == i;
}
private:
uint16_t size_; // can't use uint8_t because it would overflow if all
// possible values were inserted.
uint8_t sparse_[kCapacity];
uint8_t dense_[kCapacity];
};
}
#endif
/*
* Copyright 2015 Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/***
* A benchmark comparing SparseByteSet to bitset<256> and bool[256].
*/
#include <bitset>
#include <random>
#include <vector>
#include <gflags/gflags.h>
#include <folly/Benchmark.h>
#include <folly/Format.h>
#include <folly/SparseByteSet.h>
using namespace std;
using namespace folly;
namespace {
// Interface-identical to SparseByteSet. So that we can do compile-time
// polymorphism.
class BitSetWrapper {
public:
inline bool add(uint8_t i) {
auto r = !contains(i);
if (r) {
rep_[i] = true;
}
return r;
}
inline bool contains(uint8_t i) {
return rep_[i];
}
private:
bitset<256> rep_;
};
class BoolArraySet {
public:
BoolArraySet() {
memset(rep_, 0, sizeof(rep_));
}
inline bool add(uint8_t i) {
auto r = !contains(i);
if (r) {
rep_[i] = true;
}
return r;
}
inline bool contains(uint8_t i) {
return rep_[i];
}
private:
bool rep_[256];
};
template <typename Coll>
void rand_bench(int iters, size_t size_add, size_t size_contains) {
BenchmarkSuspender braces;
vector<uint8_t> seq_add;
vector<uint8_t> seq_contains;
mt19937 rng;
uniform_int_distribution<uint8_t> dist;
for (size_t i = 0; i < size_add; ++i) {
seq_add.push_back(dist(rng));
}
for (size_t i = 0; i < size_contains; ++i) {
seq_contains.push_back(dist(rng));
}
braces.dismissing([&] {
while (iters--) {
Coll coll;
for (auto b : seq_add) {
coll.add(b);
}
bool q {};
for (auto b : seq_contains) {
q ^= coll.contains(b);
}
doNotOptimizeAway(q);
}
});
}
void setup_rand_bench() {
vector<pair<size_t, size_t>> rand_bench_params = {
{4, 4},
{4, 16},
{4, 64},
{4, 256},
{16, 4},
{16, 16},
{16, 64},
{16, 256},
{64, 4},
{64, 16},
{64, 64},
{64, 256},
{256, 4},
{256, 16},
{256, 64},
{256, 256},
};
for (auto kvp : rand_bench_params) {
size_t size_add, size_contains;
tie(size_add, size_contains) = kvp;
addBenchmark(
__FILE__,
sformat("bitset_rand_bench({}, {})",
size_add, size_contains).c_str(),
[=](int iters) {
rand_bench<BitSetWrapper>(iters, size_add, size_contains);
return iters;
});
addBenchmark(
__FILE__,
sformat("\%bool_array_set_rand_bench({}, {})",
size_add, size_contains).c_str(),
[=](int iters) {
rand_bench<BoolArraySet>(iters, size_add, size_contains);
return iters;
});
addBenchmark(
__FILE__,
sformat("\%sparse_byte_set_rand_bench({}, {})",
size_add, size_contains).c_str(),
[=](int iters) {
rand_bench<SparseByteSet>(iters, size_add, size_contains);
return iters;
});
addBenchmark(
__FILE__,
"-",
[](int) { return 0; });
}
}
}
int main(int argc, char** argv) {
google::ParseCommandLineFlags(&argc, &argv, true);
setup_rand_bench();
runBenchmarks();
return 0;
}
/*
* Copyright 2015 Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cstdint>
#include <limits>
#include <random>
#include <set>
#include <folly/SparseByteSet.h>
#include <gtest/gtest.h>
using namespace std;
using namespace folly;
namespace {
class SparseByteSetTest : public testing::Test {
protected:
using lims = numeric_limits<uint8_t>;
SparseByteSet s;
};
}
TEST_F(SparseByteSetTest, empty) {
for (auto c = lims::min(); c < lims::max(); ++c) {
EXPECT_FALSE(s.contains(c));
}
}
TEST_F(SparseByteSetTest, each) {
for (auto c = lims::min(); c < lims::max(); ++c) {
EXPECT_TRUE(s.add(c));
EXPECT_TRUE(s.contains(c));
}
for (auto c = lims::min(); c < lims::max(); ++c) {
EXPECT_FALSE(s.add(c));
EXPECT_TRUE(s.contains(c));
}
}
TEST_F(SparseByteSetTest, each_random) {
mt19937 rng;
uniform_int_distribution<uint8_t> dist;
set<uint8_t> added;
while (added.size() <= lims::max()) {
auto c = dist(rng);
EXPECT_EQ(added.count(c), s.contains(c));
EXPECT_EQ(!added.count(c), s.add(c));
added.insert(c);
EXPECT_TRUE(added.count(c)); // sanity
EXPECT_TRUE(s.contains(c));
}
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment