userver: userver/utils/filter_bloom.hpp Source File
Loading...
Searching...
No Matches
filter_bloom.hpp
Go to the documentation of this file.
1#pragma once
2
3/// @file userver/utils/filter_bloom.hpp
4/// @brief @copybrief utils::FilterBloom
5
6#include <array>
7#include <functional>
8#include <optional>
9#include <type_traits>
10#include <utility>
11
12#include <boost/container_hash/hash.hpp>
13
14#include <userver/utils/assert.hpp>
15#include <userver/utils/fixed_array.hpp>
16
17USERVER_NAMESPACE_BEGIN
18
19namespace utils {
20
21/// @ingroup userver_universal
22///
23/// @brief Space-efficient probabilistic data structure
24///
25/// @details Used to test whether a count number of a given element is
26/// smaller than a given threshold when a sequence of elements is given.
27/// As a generalized form of Bloom filter,
28/// false positive matches are possible, but false negatives are not.
29/// @param T the type of element that counts
30/// @param Counter the type of counter
31/// @param Hash1 the first callable hash struct
32/// @param Hash2 the second callable hash struct
33///
34/// Example:
35/// @snippet src/utils/filter_bloom_test.cpp Sample filter bloom usage
36template <typename T, typename Counter = unsigned,
37 typename Hash1 = boost::hash<T>, typename Hash2 = std::hash<T>>
38class FilterBloom final {
39 public:
40 /// @brief Constructs filter Bloom with the specified number of counters
41 /// @note If expected to increment n times is recommended to set counters_num
42 /// to 16 * n
43 explicit FilterBloom(std::size_t counters_num = 256, Hash1 hash_1 = Hash1{},
44 Hash2 hash_2 = Hash2{})
45 : counters_(counters_num, 0),
46 hasher_1(std::move(hash_1)),
47 hasher_2(std::move(hash_2)) {
48 UASSERT((!std::is_same_v<Hash1, Hash2>));
49 UASSERT((std::is_same_v<std::invoke_result_t<Hash1, const T&>,
50 std::invoke_result_t<Hash2, const T&>>));
51 }
52
53 /// @brief Increments the smallest item counters
54 void Increment(const T& item);
55
56 /// @brief Returns the value of the smallest item counter
57 Counter Estimate(const T& item) const;
58
59 /// @brief Checks that all counters of the item have been incremented
60 bool Has(const T& item) const;
61
62 /// @brief Resets all counters
63 void Clear();
64
65 private:
66 using HashedType = std::invoke_result_t<Hash1, const T&>;
67
68 inline uint64_t Coefficient(std::size_t step) const;
69 uint64_t GetHash(const HashedType& hashed_value_1,
70 const HashedType& hashed_value_2,
71 uint64_t coefficient) const;
72 Counter MinFrequency(const HashedType& hashed_value_1,
73 const HashedType& hashed_value_2) const;
74
75 utils::FixedArray<Counter> counters_;
76 const Hash1 hasher_1;
77 const Hash2 hasher_2;
78
79 static constexpr std::size_t kHashFunctionsCount = 4;
80};
81
82template <typename T, typename Counter, typename Hash1, typename Hash2>
83inline uint64_t FilterBloom<T, Counter, Hash1, Hash2>::Coefficient(
84 std::size_t step) const {
85 return 1 << step;
86}
87
88template <typename T, typename Counter, typename Hash1, typename Hash2>
89uint64_t FilterBloom<T, Counter, Hash1, Hash2>::GetHash(
90 const HashedType& hashed_value_1, const HashedType& hashed_value_2,
91 uint64_t coefficient) const {
92 // the idea was taken from
93 // https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf
94 return hashed_value_1 + hashed_value_2 * coefficient;
95}
96
97template <typename T, typename Counter, typename Hash1, typename Hash2>
98Counter FilterBloom<T, Counter, Hash1, Hash2>::MinFrequency(
99 const HashedType& hashed_value_1, const HashedType& hashed_value_2) const {
100 std::optional<Counter> min_count;
101 for (std::size_t step = 0; step < kHashFunctionsCount; ++step) {
102 auto current_count =
103 counters_[GetHash(hashed_value_1, hashed_value_2, Coefficient(step)) %
104 counters_.size()];
105 if (!min_count.has_value() || min_count.value() > current_count) {
106 min_count = current_count;
107 }
108 }
109 return min_count.value();
110}
111
112template <typename T, typename Counter, typename Hash1, typename Hash2>
113void FilterBloom<T, Counter, Hash1, Hash2>::Increment(const T& item) {
114 auto hash_value_1 = hasher_1(item);
115 auto hash_value_2 = hasher_2(item);
116 Counter min_frequency = MinFrequency(hash_value_1, hash_value_2);
117
118 for (std::size_t step = 0; step < kHashFunctionsCount; ++step) {
119 auto& current_count =
120 counters_[GetHash(hash_value_1, hash_value_2, Coefficient(step)) %
121 counters_.size()];
122 if (current_count == min_frequency) {
123 current_count++;
124 }
125 }
126}
127
128template <typename T, typename Counter, typename Hash1, typename Hash2>
129Counter FilterBloom<T, Counter, Hash1, Hash2>::Estimate(const T& item) const {
130 auto hash_value_1 = hasher_1(item);
131 auto hash_value_2 = hasher_2(item);
132 return MinFrequency(hash_value_1, hash_value_2);
133}
134
135template <typename T, typename Counter, typename Hash1, typename Hash2>
136bool FilterBloom<T, Counter, Hash1, Hash2>::Has(const T& item) const {
137 return Estimate(item) > 0;
138}
139
140template <typename T, typename Counter, typename Hash1, typename Hash2>
141void FilterBloom<T, Counter, Hash1, Hash2>::Clear() {
142 for (auto& counter : counters_) {
143 counter = 0;
144 }
145}
146
147} // namespace utils
148
149USERVER_NAMESPACE_END