userver: userver/utils/filter_bloom.hpp Source File
⚠️ This is the documentation for an old userver version. Click here to switch to the latest version.
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages Concepts
filter_bloom.hpp
Go to the documentation of this file.
1#pragma once
2
3/// @file userver/utils/filter_bloom.hpp
4/// @brief @copybrief utils::FilterBloom
5
6#include <array>
7#include <functional>
8#include <optional>
9#include <type_traits>
10#include <utility>
11
12#include <boost/container_hash/hash.hpp>
13
14#include <userver/utils/assert.hpp>
15#include <userver/utils/fixed_array.hpp>
16
17USERVER_NAMESPACE_BEGIN
18
19namespace utils {
20
21/// @ingroup userver_universal
22///
23/// @brief Space-efficient probabilistic data structure
24///
25/// @details Used to test whether a count number of a given element is
26/// smaller than a given threshold when a sequence of elements is given.
27/// As a generalized form of Bloom filter,
28/// false positive matches are possible, but false negatives are not.
29/// @param T the type of element that counts
30/// @param Counter the type of counter
31/// @param Hash1 the first callable hash struct
32/// @param Hash2 the second callable hash struct
33///
34/// Example:
35/// @snippet src/utils/filter_bloom_test.cpp Sample filter bloom usage
36template <typename T, typename Counter = unsigned,
37 typename Hash1 = boost::hash<T>, typename Hash2 = std::hash<T>>
38class FilterBloom final {
39 public:
40 /// @brief Constructs filter Bloom with the specified number of counters
41 /// @note If expected to increment n times is recommended to set counters_num
42 /// to 16 * n
43 explicit FilterBloom(std::size_t counters_num = 256, Hash1 hash_1 = Hash1{},
44 Hash2 hash_2 = Hash2{})
45 : counters_(counters_num, 0),
46 hasher_1(std::move(hash_1)),
47 hasher_2(std::move(hash_2)) {
48 UASSERT((!std::is_same_v<Hash1, Hash2>));
49 UASSERT((std::is_same_v<std::invoke_result_t<Hash1, const T&>,
50 std::invoke_result_t<Hash2, const T&>>));
51 }
52
53 /// @brief Increments the smallest item counters
54 void Increment(const T& item);
55
56 /// @brief Returns the value of the smallest item counter
57 Counter Estimate(const T& item) const;
58
59 /// @brief Checks that all counters of the item have been incremented
60 bool Has(const T& item) const;
61
62 /// @brief Resets all counters
63 void Clear();
64
65 private:
66 using HashedType = std::invoke_result_t<Hash1, const T&>;
67
68 inline uint64_t Coefficient(std::size_t step) const;
69 uint64_t GetHash(const HashedType& hashed_value_1,
70 const HashedType& hashed_value_2,
71 uint64_t coefficient) const;
72 Counter MinFrequency(const HashedType& hashed_value_1,
73 const HashedType& hashed_value_2) const;
74
75 utils::FixedArray<Counter> counters_;
76 const Hash1 hasher_1;
77 const Hash2 hasher_2;
78
79 static constexpr std::size_t kHashFunctionsCount = 4;
80};
81
82template <typename T, typename Counter, typename Hash1, typename Hash2>
83inline uint64_t FilterBloom<T, Counter, Hash1, Hash2>::Coefficient(
84 std::size_t step) const {
85 return 1 << step;
86}
87
88template <typename T, typename Counter, typename Hash1, typename Hash2>
89uint64_t FilterBloom<T, Counter, Hash1, Hash2>::GetHash(
90 const HashedType& hashed_value_1, const HashedType& hashed_value_2,
91 uint64_t coefficient) const {
92 // the idea was taken from
93 // https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf
94 return hashed_value_1 + hashed_value_2 * coefficient;
95}
96
97template <typename T, typename Counter, typename Hash1, typename Hash2>
98Counter FilterBloom<T, Counter, Hash1, Hash2>::MinFrequency(
99 const HashedType& hashed_value_1, const HashedType& hashed_value_2) const {
100 std::optional<Counter> min_count;
101 for (std::size_t step = 0; step < kHashFunctionsCount; ++step) {
102 auto current_count =
103 counters_[GetHash(hashed_value_1, hashed_value_2, Coefficient(step)) %
104 counters_.size()];
105 if (!min_count.has_value() || min_count.value() > current_count) {
106 min_count = current_count;
107 }
108 }
109 return min_count.value();
110}
111
112template <typename T, typename Counter, typename Hash1, typename Hash2>
113void FilterBloom<T, Counter, Hash1, Hash2>::Increment(const T& item) {
114 auto hash_value_1 = hasher_1(item);
115 auto hash_value_2 = hasher_2(item);
116 Counter min_frequency = MinFrequency(hash_value_1, hash_value_2);
117
118 for (std::size_t step = 0; step < kHashFunctionsCount; ++step) {
119 auto& current_count =
120 counters_[GetHash(hash_value_1, hash_value_2, Coefficient(step)) %
121 counters_.size()];
122 if (current_count == min_frequency) {
123 current_count++;
124 }
125 }
126}
127
128template <typename T, typename Counter, typename Hash1, typename Hash2>
129Counter FilterBloom<T, Counter, Hash1, Hash2>::Estimate(const T& item) const {
130 auto hash_value_1 = hasher_1(item);
131 auto hash_value_2 = hasher_2(item);
132 return MinFrequency(hash_value_1, hash_value_2);
133}
134
135template <typename T, typename Counter, typename Hash1, typename Hash2>
136bool FilterBloom<T, Counter, Hash1, Hash2>::Has(const T& item) const {
137 return Estimate(item) > 0;
138}
139
140template <typename T, typename Counter, typename Hash1, typename Hash2>
141void FilterBloom<T, Counter, Hash1, Hash2>::Clear() {
142 for (auto& counter : counters_) {
143 counter = 0;
144 }
145}
146
147} // namespace utils
148
149USERVER_NAMESPACE_END