userver: userver/utils/encoding/tskv.hpp Source File
Loading...
Searching...
No Matches
tskv.hpp
Go to the documentation of this file.
1#pragma once
2
3/// @file userver/utils/encoding/tskv.hpp
4/// @brief Encoders, decoders and helpers for TSKV representations
5/// @ingroup userver_universal
6
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <limits>
#include <string_view>
#include <type_traits>
14
15#ifdef __AVX2__
16#include <immintrin.h>
17#elif defined(__SSSE3__)
18#include <tmmintrin.h>
19#elif defined(__SSE2__)
20#include <emmintrin.h>
21#endif
22
23#include <userver/utils/assert.hpp>
24
// Requests unconditional inlining of the annotated function. Uses the
// GCC/Clang `__attribute__` syntax.
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define USERVER_IMPL_FORCE_INLINE __attribute__((always_inline)) inline

// Switches sanitizer instrumentation off for the annotated function.
// Under Clang the address, memory and thread sanitizers are all disabled;
// for other compilers only the address sanitizer is disabled.
#if defined(__clang__)
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define USERVER_IMPL_DISABLE_ASAN __attribute__((no_sanitize_address, no_sanitize_memory, no_sanitize_thread))
#else
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define USERVER_IMPL_DISABLE_ASAN __attribute__((no_sanitize_address))
#endif
35
36USERVER_NAMESPACE_BEGIN
37
38namespace utils::encoding {
39
/// Character placed between a key and its value in a TSKV pair.
inline constexpr char kTskvKeyValueSeparator = '=';
/// Character placed between consecutive TSKV pairs.
inline constexpr char kTskvPairsSeparator = '\t';
42
/// @brief Selects which set of escaping rules EncodeTskv applies.
///
/// kKeyReplacePeriod is for logging. Elastic has a long history of problems
/// with periods in TSKV keys. For more info see:
/// www.elastic.co/guide/en/elasticsearch/reference/2.4/dots-in-names.html
enum class EncodeTskvMode {
    /// Key rules: ASCII uppercase letters are lowercased, '=' is escaped.
    kKey,
    /// Value rules: only control characters and the backslash are escaped.
    kValue,
    /// Same as kKey, and additionally '.' is replaced with '_'.
    kKeyReplacePeriod,
};
47
48/// @brief Encode according to the TSKV rules, but without escaping the
49/// quotation mark (").
50/// @returns The iterator to after the inserted chars.
51template <typename OutIter>
52OutIter EncodeTskv(OutIter destination, char ch, EncodeTskvMode mode);
53
54/// @brief Encode according to the TSKV rules, but without escaping the
55/// quotation mark (").
56/// @note New contents are appended at the end of `container`. Some extra memory
57/// is reserved as necessary.
58/// @tparam Container must be continuous and support at least the following
59/// operations: 1) `c.data()` 2) `c.size()` 3) `c.resize(new_size)`
60template <typename Container>
61void EncodeTskv(Container& container, std::string_view str, EncodeTskvMode mode);
62
63// ==================== Implementation follows ====================
64
65// always_inline to eliminate 'mode' checks
66template <typename OutIter>
67USERVER_IMPL_FORCE_INLINE OutIter EncodeTskv(OutIter destination, char ch, EncodeTskvMode mode) {
68 const bool is_key_encoding = (mode == EncodeTskvMode::kKey || mode == EncodeTskvMode::kKeyReplacePeriod);
69 const auto append = [&destination](char ch) { *(destination++) = ch; };
70
71 switch (ch) {
72 case '\t':
73 append('\\');
74 append('t');
75 break;
76 case '\r':
77 append('\\');
78 append('r');
79 break;
80 case '\n':
81 append('\\');
82 append('n');
83 break;
84 case '\0':
85 append('\\');
86 append('0');
87 break;
88 case '\\':
89 append('\\');
90 append(ch);
91 break;
92 case '.':
93 if (mode == EncodeTskvMode::kKeyReplacePeriod) {
94 append('_');
95 break;
96 }
97 [[fallthrough]];
98 case 'A':
99 case 'B':
100 case 'C':
101 case 'D':
102 case 'E':
103 case 'F':
104 case 'G':
105 case 'H':
106 case 'I':
107 case 'J':
108 case 'K':
109 case 'L':
110 case 'M':
111 case 'N':
112 case 'O':
113 case 'P':
114 case 'Q':
115 case 'R':
116 case 'S':
117 case 'T':
118 case 'U':
119 case 'V':
120 case 'W':
121 case 'X':
122 case 'Y':
123 case 'Z':
124 if (is_key_encoding) {
125 append(ch | 0x20); // ch - 'A' + 'a'
126 break;
127 }
128 [[fallthrough]];
129 case '=':
130 if (is_key_encoding) {
131 append('\\');
132 append(ch);
133 break;
134 }
135 [[fallthrough]];
136 default:
137 append(ch);
138 break;
139 }
140 return destination;
141}
142
143namespace impl::tskv {
144
145template <std::size_t Alignment, typename T>
146USERVER_IMPL_FORCE_INLINE T* AlignDown(T* ptr) noexcept {
147 static_assert(Alignment % sizeof(T) == 0);
148 return reinterpret_cast<T*>(reinterpret_cast<std::uintptr_t>(ptr) / Alignment * Alignment);
149}
150
151template <std::size_t Alignment>
152USERVER_IMPL_FORCE_INLINE const char* AssumeAligned(const char* block) noexcept {
153 UASSERT(reinterpret_cast<std::uintptr_t>(block) % Alignment == 0);
154 return static_cast<const char*>(__builtin_assume_aligned(block, Alignment));
155}
156
/// Builds a 32-entry table: entries 0..15 hold their own index, entries 16..31
/// hold 0xf0. A 16-byte window read from the table at position `offset` is used
/// as a byte-shuffle selector.
constexpr auto MakeShuffleIndicesForRightShift() noexcept {
    constexpr std::size_t kLaneCount = 16;
    std::array<std::uint8_t, kLaneCount * 2> indices{};
    for (std::size_t pos = 0; pos < indices.size(); ++pos) {
        indices[pos] = (pos < kLaneCount) ? static_cast<std::uint8_t>(pos) : std::uint8_t{0xf0};
    }
    return indices;
}
164
165struct EncoderStd final {
166 using Block = std::uint64_t;
167 static constexpr std::size_t kBlockSize = sizeof(Block);
168
169 // Sanitizers are disabled within the function, because the SIMD loads
170 // may intentionally wander to uninitialized memory. The loads never touch
171 // memory outside "our" cache lines, though.
172 USERVER_IMPL_DISABLE_ASAN inline static auto LoadBlock(const char* block) noexcept {
173 block = AssumeAligned<kBlockSize>(block);
174 return *reinterpret_cast<const Block*>(block);
175 }
176
177 USERVER_IMPL_FORCE_INLINE static void CopyBlock(Block block, std::size_t offset, char* destination) noexcept {
178 const auto cut_block = block >> (offset * 8);
179 std::memcpy(destination, &cut_block, sizeof(cut_block));
180 }
181
182 USERVER_IMPL_FORCE_INLINE static bool
183 MayNeedValueEscaping(Block block, std::size_t offset, std::size_t count) noexcept {
184 char buffer[kBlockSize]{};
185 std::memcpy(&buffer, &block, sizeof(block));
186 for (const char c : std::string_view(buffer + offset, count)) {
187 if (c <= '\r' || c == '\\') return true;
188 }
189 return false;
190 }
191};
192
193#ifdef __SSE2__
194struct EncoderSse2 {
195 using Block = __m128i;
196 static constexpr std::size_t kBlockSize = sizeof(Block);
197
198 // Sanitizers are disabled within the function, because the SIMD loads
199 // may intentionally wander to uninitialized memory. The loads never touch
200 // memory outside "our" cache lines, though.
201 USERVER_IMPL_DISABLE_ASAN inline static Block LoadBlock(const char* block) noexcept {
202 block = AssumeAligned<kBlockSize>(block);
203 return _mm_load_si128(reinterpret_cast<const Block*>(block));
204 }
205
206 USERVER_IMPL_FORCE_INLINE static void
207 CopyBlock(Block block_contents, std::size_t offset, char* destination) noexcept {
208 alignas(kBlockSize * 2) char storage[kBlockSize * 2]{};
209 _mm_store_si128(reinterpret_cast<Block*>(&storage), block_contents);
210 const auto cut_block = _mm_loadu_si128(reinterpret_cast<__m128i_u*>(&storage[offset]));
211 _mm_storeu_si128(reinterpret_cast<__m128i_u*>(destination), cut_block);
212 }
213
214 USERVER_IMPL_FORCE_INLINE static bool
215 MayNeedValueEscaping(Block block, std::size_t offset, std::size_t count) noexcept {
216 // 'char c' may need TSKV value escaping iff c <= '\r' || c == '\\'
217 // 16 lower bits of the mask contain may-need-escaping flag per block's char
218 const auto may_need_escaping_mask = _mm_movemask_epi8(
219 _mm_or_si128(_mm_cmpgt_epi8(_mm_set1_epi8('\r' + 1), block), _mm_cmpeq_epi8(block, _mm_set1_epi8('\\')))
220 );
221 return static_cast<std::uint32_t>(
222 static_cast<std::uint32_t>(may_need_escaping_mask) >> offset << (32 - count)
223 ) != 0;
224 }
225};
226#endif
227
#ifdef __SSSE3__
/// SSE2 encoder with CopyBlock re-implemented through the SSSE3 byte shuffle.
struct EncoderSsse3 final : public EncoderSse2 {
    // Writes kBlockSize bytes to `destination`: the block's bytes starting at
    // `offset`, followed by zero bytes. The selector is a 16-byte window of
    // the shuffle table taken at position `offset`; its 0xf0 entries have the
    // high bit set, which makes the shuffle produce a zero byte.
    USERVER_IMPL_FORCE_INLINE static void CopyBlock(Block block, std::size_t offset, char* destination) noexcept {
        static constexpr auto kShuffleIdx = MakeShuffleIndicesForRightShift();
        const auto* const window = reinterpret_cast<const __m128i_u*>(kShuffleIdx.data() + offset);
        const Block selector = _mm_loadu_si128(window);
        _mm_storeu_si128(reinterpret_cast<__m128i_u*>(destination), _mm_shuffle_epi8(block, selector));
    }
};
#endif
238
#ifdef __AVX2__
/// Encoder built on AVX2 intrinsics: works on 32-byte blocks.
struct EncoderAvx2 final {
    using Block = __m256i;
    static constexpr std::size_t kBlockSize = sizeof(Block);

    // Reads one whole aligned block. Sanitizers are disabled within the
    // function, because the SIMD load covers the complete block even when only
    // a part of it belongs to the string being encoded, so it may
    // intentionally wander to uninitialized memory.
    USERVER_IMPL_DISABLE_ASAN inline static Block LoadBlock(const char* block) noexcept {
        const char* const aligned = AssumeAligned<kBlockSize>(block);
        return _mm256_load_si256(reinterpret_cast<const Block*>(aligned));
    }

    // Writes kBlockSize bytes to `destination`: the block's bytes starting at
    // `offset`, followed by zero bytes. The shift is done by spilling the block
    // into a zero-filled scratch buffer of twice the block size and reading it
    // back from position `offset`.
    USERVER_IMPL_FORCE_INLINE static void CopyBlock(Block block, std::size_t offset, char* destination) noexcept {
        alignas(kBlockSize * 2) char scratch[kBlockSize * 2]{};
        _mm256_store_si256(reinterpret_cast<Block*>(&scratch), block);
        const auto shifted = _mm256_loadu_si256(reinterpret_cast<__m256i_u*>(&scratch[offset]));
        _mm256_storeu_si256(reinterpret_cast<__m256i_u*>(destination), shifted);
    }

    // Returns true if any of the `count` chars starting at `offset` may need
    // TSKV value escaping. `count` must be at least 1, otherwise the left
    // shift width below would reach 32.
    USERVER_IMPL_FORCE_INLINE static bool
    MayNeedValueEscaping(Block block, std::size_t offset, std::size_t count) noexcept {
        // 'char c' may need TSKV value escaping iff c <= '\r' || c == '\\'
        // (the comparison is signed, so bytes with the high bit set are
        // flagged as well).
        const Block not_above_cr = _mm256_cmpgt_epi8(_mm256_set1_epi8('\r' + 1), block);
        const Block is_backslash = _mm256_cmpeq_epi8(block, _mm256_set1_epi8('\\'));

        // 32 lower bits of the mask contain may-need-escaping flag per block's
        // char
        const auto flags =
            static_cast<std::uint32_t>(_mm256_movemask_epi8(_mm256_or_si256(not_above_cr, is_backslash)));

        // Drop the flags before `offset`, then push everything past `count`
        // flags out of the 32-bit word.
        const auto selected = static_cast<std::uint32_t>(flags >> offset << (32 - count));
        return selected != 0;
    }
};
#endif
272
273#if defined(__AVX2__)
274using SystemEncoder = EncoderAvx2;
275#elif defined(__SSSE3__)
276using SystemEncoder = EncoderSsse3;
277#elif defined(__SSE2__)
278using SystemEncoder = EncoderSse2;
279#else
280using SystemEncoder = EncoderStd;
281#endif
282
// Number of spare bytes the output buffer must provide after the converted
// data. It is assumed that starting with the current output position, there is
// enough free space to fit the converted data, plus `PaddingSize` extra bytes
// of free space, which the encoder is free to fill with garbage.
template <typename Encoder>
constexpr std::size_t PaddingSize() {
    constexpr std::size_t kPadding = Encoder::kBlockSize;
    return kPadding;
}
290
/// Current write position within the output buffer. `Encoder` is not used by
/// the struct itself; it only makes the type specific to one encoder.
template <typename Encoder>
struct BufferPtr final {
    char* current = nullptr;
};
295
296template <typename Encoder>
297USERVER_IMPL_FORCE_INLINE BufferPtr<Encoder> AppendBlock(
298 BufferPtr<Encoder> destination,
299 typename Encoder::Block block,
300 std::size_t offset,
301 std::size_t count
302) noexcept {
303 char* const old_current = destination.current;
304 destination.current += count;
305 Encoder::CopyBlock(block, offset, old_current);
306 return destination;
307}
308
309// noinline to avoid code duplication for a cold path
310template <typename Encoder>
311[[nodiscard]] __attribute__((noinline)) BufferPtr<Encoder>
312EncodeValueEach(BufferPtr<Encoder> destination, std::string_view str) {
313 for (const char c : str) {
314 destination.current = encoding::EncodeTskv(destination.current, c, EncodeTskvMode::kValue);
315 }
316 return destination;
317}
318
319template <typename Encoder>
320[[nodiscard]] USERVER_IMPL_FORCE_INLINE BufferPtr<Encoder>
321EncodeValueBlock(BufferPtr<Encoder> destination, const char* block, std::size_t offset, std::size_t count) {
322 UASSERT(offset < Encoder::kBlockSize);
323 UASSERT(offset + count <= Encoder::kBlockSize);
324 block = AssumeAligned<Encoder::kBlockSize>(block);
325 const auto block_contents = Encoder::LoadBlock(block);
326
327 if (__builtin_expect(Encoder::MayNeedValueEscaping(block_contents, offset, count), false)) {
328 destination = tskv::EncodeValueEach(destination, std::string_view(block + offset, count));
329 } else {
330 // happy path: the whole block does not need escaping
331 destination = tskv::AppendBlock(destination, block_contents, offset, count);
332 }
333
334 return destination;
335}
336
337// BufferPtr must be passed around by value to avoid aliasing issues.
338template <typename Encoder>
339[[nodiscard]] __attribute__((noinline)) BufferPtr<Encoder>
340EncodeValue(BufferPtr<Encoder> destination, std::string_view str) {
341 if (str.empty()) return destination;
342
343 const char* const first_block = AlignDown<Encoder::kBlockSize>(str.data());
344 const auto first_block_offset = static_cast<std::size_t>(str.data() - first_block);
345 const auto first_block_count = std::min(Encoder::kBlockSize - first_block_offset, str.size());
346
347 destination = tskv::EncodeValueBlock(destination, first_block, first_block_offset, first_block_count);
348
349 const char* const last_block = AlignDown<Encoder::kBlockSize>(str.data() + str.size());
350
351 if (last_block != first_block) {
352 for (const char* current_block = first_block + Encoder::kBlockSize; current_block < last_block;
353 current_block += Encoder::kBlockSize) {
354 destination = tskv::EncodeValueBlock(destination, current_block, 0, Encoder::kBlockSize);
355 }
356
357 const auto last_block_count = static_cast<std::size_t>(str.data() + str.size() - last_block);
358 if (last_block_count != 0) {
359 destination = tskv::EncodeValueBlock(destination, last_block, 0, last_block_count);
360 }
361 }
362
363 return destination;
364}
365
366template <typename Encoder>
367[[nodiscard]] BufferPtr<Encoder> DoEncode(BufferPtr<Encoder> destination, std::string_view str, EncodeTskvMode mode) {
368 if (mode == EncodeTskvMode::kValue) {
369 return tskv::EncodeValue(destination, str);
370 } else {
371 for (const char c : str) {
372 destination.current = encoding::EncodeTskv(destination.current, c, mode);
373 }
374 return destination;
375 }
376}
377
/// Returns twice `source_size`, used as the upper bound for the encoded length
/// of a string of `source_size` chars.
inline std::size_t MaxEncodedSize(std::size_t source_size) noexcept {
    constexpr std::size_t kMaxCharsPerSourceChar = 2;
    return kMaxCharsPerSourceChar * source_size;
}
379
380template <typename Encoder, typename Container>
381void EncodeFullyBuffered(Container& container, std::string_view str, EncodeTskvMode mode) {
382 const auto old_size = container.size();
383 container.resize(old_size + MaxEncodedSize(str.size()) + PaddingSize<Encoder>());
384 BufferPtr<Encoder> buffer_ptr{container.data() + old_size};
385
386 buffer_ptr = tskv::DoEncode(buffer_ptr, str, mode);
387
388 container.resize(buffer_ptr.current - container.data());
389}
390
391} // namespace impl::tskv
392
393template <typename Container>
394void EncodeTskv(Container& container, std::string_view str, EncodeTskvMode mode) {
395 impl::tskv::EncodeFullyBuffered<impl::tskv::SystemEncoder>(container, str, mode);
396}
397
/// @cond
// Returns true if `key` contains at least one of: tab, CR, LF, NUL, backslash,
// period, equals sign, or an ASCII uppercase letter. Returns false for an
// empty key.
inline bool ShouldKeyBeEscaped(std::string_view key) noexcept {
    const auto is_affected = [](char ch) noexcept {
        if ('A' <= ch && ch <= 'Z') return true;
        return ch == '\t' || ch == '\r' || ch == '\n' || ch == '\0' || ch == '\\' || ch == '.' || ch == '=';
    };

    for (const char ch : key) {
        if (is_affected(ch)) return true;
    }
    return false;
}
/// @endcond
418
419} // namespace utils::encoding
420
421USERVER_NAMESPACE_END
422
// Keep the implementation-detail macros from leaking into the including code.
// Each name here must match a macro defined at the top of this header.
#undef USERVER_IMPL_FORCE_INLINE
#undef USERVER_IMPL_DISABLE_ASAN