userver: userver/utils/filter_bloom.hpp Source File
Loading...
Searching...
No Matches
filter_bloom.hpp
Go to the documentation of this file.
1#pragma once
2
3/// @file userver/utils/filter_bloom.hpp
4/// @brief @copybrief utils::FilterBloom
5
6#include <array>
7#include <functional>
8#include <optional>
9#include <type_traits>
10#include <utility>
11
12#include <boost/container_hash/hash.hpp>
13
14#include <userver/utils/assert.hpp>
15#include <userver/utils/fixed_array.hpp>
16
17USERVER_NAMESPACE_BEGIN
18
19namespace utils {
20
21/// @ingroup userver_universal
22///
23/// @brief Space-efficient probabilistic data structure
24///
25/// @details Used to test whether a count number of a given element is
26/// smaller than a given threshold when a sequence of elements is given.
27/// As a generalized form of Bloom filter,
28/// false positive matches are possible, but false negatives are not.
29/// @param T the type of element that counts
30/// @param Counter the type of counter
31/// @param Hash1 the first callable hash struct
32/// @param Hash2 the second callable hash struct
33///
34/// Example:
35/// @snippet src/utils/filter_bloom_test.cpp Sample filter bloom usage
36template <typename T, typename Counter = unsigned, typename Hash1 = boost::hash<T>, typename Hash2 = std::hash<T>>
37class FilterBloom final {
38public:
39 /// @brief Constructs filter Bloom with the specified number of counters
40 /// @note If expected to increment n times is recommended to set counters_num
41 /// to 16 * n
42 explicit FilterBloom(std::size_t counters_num = 256, Hash1 hash_1 = Hash1{}, Hash2 hash_2 = Hash2{})
43 : counters_(counters_num, 0),
44 hasher_1_(std::move(hash_1)),
45 hasher_2_(std::move(hash_2))
46 {
47 UASSERT((!std::is_same_v<Hash1, Hash2>));
48 UASSERT((std::is_same_v<std::invoke_result_t<Hash1, const T&>, std::invoke_result_t<Hash2, const T&>>));
49 }
50
51 /// @brief Increments the smallest item counters
52 void Increment(const T& item);
53
54 /// @brief Returns the value of the smallest item counter
55 Counter Estimate(const T& item) const;
56
57 /// @brief Checks that all counters of the item have been incremented
58 bool Has(const T& item) const;
59
60 /// @brief Resets all counters
61 void Clear();
62
63private:
64 using HashedType = std::invoke_result_t<Hash1, const T&>;
65
66 inline uint64_t Coefficient(std::size_t step) const;
67 uint64_t GetHash(const HashedType& hashed_value_1, const HashedType& hashed_value_2, uint64_t coefficient) const;
68 Counter MinFrequency(const HashedType& hashed_value_1, const HashedType& hashed_value_2) const;
69
70 utils::FixedArray<Counter> counters_;
71 const Hash1 hasher_1_;
72 const Hash2 hasher_2_;
73
74 static constexpr std::size_t kHashFunctionsCount = 4;
75};
76
77template <typename T, typename Counter, typename Hash1, typename Hash2>
78inline uint64_t FilterBloom<T, Counter, Hash1, Hash2>::Coefficient(std::size_t step) const {
79 return 1 << step;
80}
81
82template <typename T, typename Counter, typename Hash1, typename Hash2>
83uint64_t FilterBloom<
84 T,
85 Counter,
86 Hash1,
87 Hash2>::GetHash(const HashedType& hashed_value_1, const HashedType& hashed_value_2, uint64_t coefficient) const {
88 // the idea was taken from
89 // https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf
90 return hashed_value_1 + hashed_value_2 * coefficient;
91}
92
93template <typename T, typename Counter, typename Hash1, typename Hash2>
94Counter FilterBloom<
95 T,
96 Counter,
97 Hash1,
98 Hash2>::MinFrequency(const HashedType& hashed_value_1, const HashedType& hashed_value_2) const {
99 std::optional<Counter> min_count;
100 for (std::size_t step = 0; step < kHashFunctionsCount; ++step) {
101 auto current_count = counters_[GetHash(hashed_value_1, hashed_value_2, Coefficient(step)) % counters_.size()];
102 if (!min_count.has_value() || min_count.value() > current_count) {
103 min_count = current_count;
104 }
105 }
106 return min_count.value();
107}
108
109template <typename T, typename Counter, typename Hash1, typename Hash2>
110void FilterBloom<T, Counter, Hash1, Hash2>::Increment(const T& item) {
111 auto hash_value_1 = hasher_1_(item);
112 auto hash_value_2 = hasher_2_(item);
113 Counter min_frequency = MinFrequency(hash_value_1, hash_value_2);
114
115 for (std::size_t step = 0; step < kHashFunctionsCount; ++step) {
116 auto& current_count = counters_[GetHash(hash_value_1, hash_value_2, Coefficient(step)) % counters_.size()];
117 if (current_count == min_frequency) {
118 current_count++;
119 }
120 }
121}
122
123template <typename T, typename Counter, typename Hash1, typename Hash2>
124Counter FilterBloom<T, Counter, Hash1, Hash2>::Estimate(const T& item) const {
125 auto hash_value_1 = hasher_1_(item);
126 auto hash_value_2 = hasher_2_(item);
127 return MinFrequency(hash_value_1, hash_value_2);
128}
129
130template <typename T, typename Counter, typename Hash1, typename Hash2>
131bool FilterBloom<T, Counter, Hash1, Hash2>::Has(const T& item) const {
132 return Estimate(item) > 0;
133}
134
135template <typename T, typename Counter, typename Hash1, typename Hash2>
136void FilterBloom<T, Counter, Hash1, Hash2>::Clear() {
137 for (auto& counter : counters_) {
138 counter = 0;
139 }
140}
141
142} // namespace utils
143
144USERVER_NAMESPACE_END