userver: userver/utils/filter_bloom.hpp Source File
Loading...
Searching...
No Matches
filter_bloom.hpp
Go to the documentation of this file.
1#pragma once
2
3/// @file userver/utils/filter_bloom.hpp
4/// @brief @copybrief utils::FilterBloom
5
6#include <array>
7#include <functional>
8#include <optional>
9#include <type_traits>
10#include <utility>
11
12#include <boost/container_hash/hash.hpp>
13
14#include <userver/utils/assert.hpp>
15#include <userver/utils/fixed_array.hpp>
16
17USERVER_NAMESPACE_BEGIN
18
19namespace utils {
20
21/// @ingroup userver_universal
22///
23/// @brief Space-efficient probabilistic data structure
24///
25/// @details Used to test whether a count number of a given element is
26/// smaller than a given threshold when a sequence of elements is given.
27/// As a generalized form of Bloom filter,
28/// false positive matches are possible, but false negatives are not.
29/// @param T the type of element that counts
30/// @param Counter the type of counter
31/// @param Hash1 the first callable hash struct
32/// @param Hash2 the second callable hash struct
33///
34/// Example:
35/// @snippet src/utils/filter_bloom_test.cpp Sample filter bloom usage
36template <typename T, typename Counter = unsigned, typename Hash1 = boost::hash<T>, typename Hash2 = std::hash<T>>
37class FilterBloom final {
38public:
39 /// @brief Constructs filter Bloom with the specified number of counters
40 /// @note If expected to increment n times is recommended to set counters_num
41 /// to 16 * n
42 explicit FilterBloom(std::size_t counters_num = 256, Hash1 hash_1 = Hash1{}, Hash2 hash_2 = Hash2{})
43 : counters_(counters_num, 0), hasher_1(std::move(hash_1)), hasher_2(std::move(hash_2)) {
44 UASSERT((!std::is_same_v<Hash1, Hash2>));
45 UASSERT((std::is_same_v<std::invoke_result_t<Hash1, const T&>, std::invoke_result_t<Hash2, const T&>>));
46 }
47
48 /// @brief Increments the smallest item counters
49 void Increment(const T& item);
50
51 /// @brief Returns the value of the smallest item counter
52 Counter Estimate(const T& item) const;
53
54 /// @brief Checks that all counters of the item have been incremented
55 bool Has(const T& item) const;
56
57 /// @brief Resets all counters
58 void Clear();
59
60private:
61 using HashedType = std::invoke_result_t<Hash1, const T&>;
62
63 inline uint64_t Coefficient(std::size_t step) const;
64 uint64_t GetHash(const HashedType& hashed_value_1, const HashedType& hashed_value_2, uint64_t coefficient) const;
65 Counter MinFrequency(const HashedType& hashed_value_1, const HashedType& hashed_value_2) const;
66
67 utils::FixedArray<Counter> counters_;
68 const Hash1 hasher_1;
69 const Hash2 hasher_2;
70
71 static constexpr std::size_t kHashFunctionsCount = 4;
72};
73
74template <typename T, typename Counter, typename Hash1, typename Hash2>
75inline uint64_t FilterBloom<T, Counter, Hash1, Hash2>::Coefficient(std::size_t step) const {
76 return 1 << step;
77}
78
79template <typename T, typename Counter, typename Hash1, typename Hash2>
80uint64_t FilterBloom<T, Counter, Hash1, Hash2>::GetHash(
81 const HashedType& hashed_value_1,
82 const HashedType& hashed_value_2,
83 uint64_t coefficient
84) const {
85 // the idea was taken from
86 // https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf
87 return hashed_value_1 + hashed_value_2 * coefficient;
88}
89
90template <typename T, typename Counter, typename Hash1, typename Hash2>
91Counter FilterBloom<T, Counter, Hash1, Hash2>::MinFrequency(
92 const HashedType& hashed_value_1,
93 const HashedType& hashed_value_2
94) const {
95 std::optional<Counter> min_count;
96 for (std::size_t step = 0; step < kHashFunctionsCount; ++step) {
97 auto current_count = counters_[GetHash(hashed_value_1, hashed_value_2, Coefficient(step)) % counters_.size()];
98 if (!min_count.has_value() || min_count.value() > current_count) {
99 min_count = current_count;
100 }
101 }
102 return min_count.value();
103}
104
105template <typename T, typename Counter, typename Hash1, typename Hash2>
106void FilterBloom<T, Counter, Hash1, Hash2>::Increment(const T& item) {
107 auto hash_value_1 = hasher_1(item);
108 auto hash_value_2 = hasher_2(item);
109 Counter min_frequency = MinFrequency(hash_value_1, hash_value_2);
110
111 for (std::size_t step = 0; step < kHashFunctionsCount; ++step) {
112 auto& current_count = counters_[GetHash(hash_value_1, hash_value_2, Coefficient(step)) % counters_.size()];
113 if (current_count == min_frequency) {
114 current_count++;
115 }
116 }
117}
118
119template <typename T, typename Counter, typename Hash1, typename Hash2>
120Counter FilterBloom<T, Counter, Hash1, Hash2>::Estimate(const T& item) const {
121 auto hash_value_1 = hasher_1(item);
122 auto hash_value_2 = hasher_2(item);
123 return MinFrequency(hash_value_1, hash_value_2);
124}
125
126template <typename T, typename Counter, typename Hash1, typename Hash2>
127bool FilterBloom<T, Counter, Hash1, Hash2>::Has(const T& item) const {
128 return Estimate(item) > 0;
129}
130
131template <typename T, typename Counter, typename Hash1, typename Hash2>
132void FilterBloom<T, Counter, Hash1, Hash2>::Clear() {
133 for (auto& counter : counters_) {
134 counter = 0;
135 }
136}
137
138} // namespace utils
139
140USERVER_NAMESPACE_END