userver: userver/utils/encoding/tskv.hpp Source File
⚠️ This is the documentation for an old userver version. Click here to switch to the latest version.
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages Concepts
tskv.hpp
Go to the documentation of this file.
1#pragma once
2
3/// @file userver/utils/encoding/tskv.hpp
4/// @brief Encoders, decoders and helpers for TSKV representations
5/// @ingroup userver_universal
6
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <limits>
#include <string_view>
#include <type_traits>
14
15#ifdef __AVX2__
16#include <immintrin.h>
17#elif defined(__SSSE3__)
18#include <tmmintrin.h>
19#elif defined(__SSE2__)
20#include <emmintrin.h>
21#endif
22
23#include <userver/utils/assert.hpp>
24
// Forces inlining of the small hot-path helpers below (GCC/Clang attribute).
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define USERVER_IMPL_FORCE_INLINE __attribute__((always_inline)) inline

// Disables sanitizer instrumentation for the block-load helpers, whose loads
// may intentionally wander to uninitialized memory. The clang variant spans
// two lines, so the line continuation is required: without it the macro
// expands to nothing and the attribute list becomes a stray declaration.
#ifdef __clang__
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define USERVER_IMPL_DISABLE_ASAN \
  __attribute__((no_sanitize_address, no_sanitize_memory, no_sanitize_thread))
#else
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define USERVER_IMPL_DISABLE_ASAN __attribute__((no_sanitize_address))
#endif
36
37USERVER_NAMESPACE_BEGIN
38
39namespace utils::encoding {
40
/// Character placed between a key and its value in a TSKV pair.
inline constexpr char kTskvKeyValueSeparator{'='};
/// Character placed between consecutive TSKV pairs.
inline constexpr char kTskvPairsSeparator{'\t'};
43
/// Selects the set of escaping rules used by the EncodeTskv functions.
///
/// kKeyReplacePeriod is for logging. Elastic has a long history of problems
/// with periods in TSKV keys. For more info see:
/// www.elastic.co/guide/en/elasticsearch/reference/2.4/dots-in-names.html
enum class EncodeTskvMode {
  kKey,               ///< key rules: lower-cases 'A'-'Z', escapes '='
  kValue,             ///< value rules: only control chars and '\\' are escaped
  kKeyReplacePeriod,  ///< same as kKey, and '.' is replaced with '_'
};
48
49/// @brief Encode according to the TSKV rules, but without escaping the
50/// quotation mark (").
51/// @returns The iterator to after the inserted chars.
52template <typename OutIter>
53OutIter EncodeTskv(OutIter destination, char ch, EncodeTskvMode mode);
54
55/// @brief Encode according to the TSKV rules, but without escaping the
56/// quotation mark (").
57/// @note New contents are appended at the end of `container`. Some extra memory
58/// is reserved as necessary.
59/// @tparam Container must be continuous and support at least the following
60/// operations: 1) `c.data()` 2) `c.size()` 3) `c.resize(new_size)`
61template <typename Container>
62void EncodeTskv(Container& container, std::string_view str,
63 EncodeTskvMode mode);
64
65// ==================== Implementation follows ====================
66
67// always_inline to eliminate 'mode' checks
68template <typename OutIter>
69USERVER_IMPL_FORCE_INLINE OutIter EncodeTskv(OutIter destination, char ch,
70 EncodeTskvMode mode) {
71 const bool is_key_encoding = (mode == EncodeTskvMode::kKey ||
72 mode == EncodeTskvMode::kKeyReplacePeriod);
73 const auto append = [&destination](char ch) { *(destination++) = ch; };
74
75 switch (ch) {
76 case '\t':
77 append('\\');
78 append('t');
79 break;
80 case '\r':
81 append('\\');
82 append('r');
83 break;
84 case '\n':
85 append('\\');
86 append('n');
87 break;
88 case '\0':
89 append('\\');
90 append('0');
91 break;
92 case '\\':
93 append('\\');
94 append(ch);
95 break;
96 case '.':
97 if (mode == EncodeTskvMode::kKeyReplacePeriod) {
98 append('_');
99 break;
100 }
101 [[fallthrough]];
102 case 'A':
103 case 'B':
104 case 'C':
105 case 'D':
106 case 'E':
107 case 'F':
108 case 'G':
109 case 'H':
110 case 'I':
111 case 'J':
112 case 'K':
113 case 'L':
114 case 'M':
115 case 'N':
116 case 'O':
117 case 'P':
118 case 'Q':
119 case 'R':
120 case 'S':
121 case 'T':
122 case 'U':
123 case 'V':
124 case 'W':
125 case 'X':
126 case 'Y':
127 case 'Z':
128 if (is_key_encoding) {
129 append(ch | 0x20); // ch - 'A' + 'a'
130 break;
131 }
132 [[fallthrough]];
133 case '=':
134 if (is_key_encoding) {
135 append('\\');
136 append(ch);
137 break;
138 }
139 [[fallthrough]];
140 default:
141 append(ch);
142 break;
143 }
144 return destination;
145}
146
147namespace impl::tskv {
148
149template <std::size_t Alignment, typename T>
150USERVER_IMPL_FORCE_INLINE T* AlignDown(T* ptr) noexcept {
151 static_assert(Alignment % sizeof(T) == 0);
152 return reinterpret_cast<T*>(reinterpret_cast<std::uintptr_t>(ptr) /
153 Alignment * Alignment);
154}
155
156template <std::size_t Alignment>
157USERVER_IMPL_FORCE_INLINE const char* AssumeAligned(
158 const char* block) noexcept {
159 UASSERT(reinterpret_cast<std::uintptr_t>(block) % Alignment == 0);
160 return static_cast<const char*>(__builtin_assume_aligned(block, Alignment));
161}
162
// Builds a 32-entry index table: entries 0..15 hold their own position and
// entries 16..31 hold 0xf0. A 16-byte window read at position `offset` is used
// as a byte-shuffle control mask by EncoderSsse3::CopyBlock.
constexpr auto MakeShuffleIndicesForRightShift() noexcept {
  constexpr std::size_t kShuffleWidth = 16;
  constexpr std::uint8_t kFillerIndex = 0xf0;
  std::array<std::uint8_t, kShuffleWidth * 2> result{};
  for (std::size_t i = 0; i < result.size(); ++i) {
    result[i] =
        (i < kShuffleWidth) ? static_cast<std::uint8_t>(i) : kFillerIndex;
  }
  return result;
}
170
171struct EncoderStd final {
172 using Block = std::uint64_t;
173 static constexpr std::size_t kBlockSize = sizeof(Block);
174
175 // Sanitizers are disabled within the function, because the SIMD loads
176 // may intentionally wander to uninitialized memory. The loads never touch
177 // memory outside "our" cache lines, though.
178 USERVER_IMPL_DISABLE_ASAN inline static auto LoadBlock(
179 const char* block) noexcept {
180 block = AssumeAligned<kBlockSize>(block);
181 return *reinterpret_cast<const Block*>(block);
182 }
183
184 USERVER_IMPL_FORCE_INLINE static void CopyBlock(Block block,
185 std::size_t offset,
186 char* destination) noexcept {
187 const auto cut_block = block >> (offset * 8);
188 std::memcpy(destination, &cut_block, sizeof(cut_block));
189 }
190
191 USERVER_IMPL_FORCE_INLINE static bool MayNeedValueEscaping(
192 Block block, std::size_t offset, std::size_t count) noexcept {
193 char buffer[kBlockSize]{};
194 std::memcpy(&buffer, &block, sizeof(block));
195 for (const char c : std::string_view(buffer + offset, count)) {
196 if (c <= '\r' || c == '\\') return true;
197 }
198 return false;
199 }
200};
201
202#ifdef __SSE2__
203struct EncoderSse2 {
204 using Block = __m128i;
205 static constexpr std::size_t kBlockSize = sizeof(Block);
206
207 // Sanitizers are disabled within the function, because the SIMD loads
208 // may intentionally wander to uninitialized memory. The loads never touch
209 // memory outside "our" cache lines, though.
210 USERVER_IMPL_DISABLE_ASAN inline static Block LoadBlock(
211 const char* block) noexcept {
212 block = AssumeAligned<kBlockSize>(block);
213 return _mm_load_si128(reinterpret_cast<const Block*>(block));
214 }
215
216 USERVER_IMPL_FORCE_INLINE static void CopyBlock(Block block_contents,
217 std::size_t offset,
218 char* destination) noexcept {
219 alignas(kBlockSize * 2) char storage[kBlockSize * 2]{};
220 _mm_store_si128(reinterpret_cast<Block*>(&storage), block_contents);
221 const auto cut_block =
222 _mm_loadu_si128(reinterpret_cast<__m128i_u*>(&storage[offset]));
223 _mm_storeu_si128(reinterpret_cast<__m128i_u*>(destination), cut_block);
224 }
225
226 USERVER_IMPL_FORCE_INLINE static bool MayNeedValueEscaping(
227 Block block, std::size_t offset, std::size_t count) noexcept {
228 // 'char c' may need TSKV value escaping iff c <= '\r' || c == '\\'
229 // 16 lower bits of the mask contain may-need-escaping flag per block's char
230 const auto may_need_escaping_mask = _mm_movemask_epi8(
231 _mm_or_si128(_mm_cmpgt_epi8(_mm_set1_epi8('\r' + 1), block),
232 _mm_cmpeq_epi8(block, _mm_set1_epi8('\\'))));
233 return static_cast<std::uint32_t>(
234 static_cast<std::uint32_t>(may_need_escaping_mask) >>
235 offset << (32 - count)) != 0;
236 }
237};
238#endif
239
#ifdef __SSSE3__
// Same as EncoderSse2, but CopyBlock uses a byte shuffle instead of a round
// trip through a scratch buffer.
struct EncoderSsse3 final : public EncoderSse2 {
  // Writes a full block to `destination`, with the bytes of `block`
  // rearranged according to a 16-byte window of the shuffle index table that
  // starts at `offset`. Always stores kBlockSize bytes.
  USERVER_IMPL_FORCE_INLINE static void CopyBlock(Block block,
                                                  std::size_t offset,
                                                  char* destination) noexcept {
    static constexpr auto kShuffleIdx = MakeShuffleIndicesForRightShift();
    const auto* const window =
        reinterpret_cast<const __m128i_u*>(kShuffleIdx.data() + offset);
    const Block selector = _mm_loadu_si128(window);
    const Block shifted = _mm_shuffle_epi8(block, selector);
    _mm_storeu_si128(reinterpret_cast<__m128i_u*>(destination), shifted);
  }
};
#endif
253
#ifdef __AVX2__
// Encoder backend that processes the input in 32-byte blocks using AVX2.
struct EncoderAvx2 final {
  using Block = __m256i;
  static constexpr std::size_t kBlockSize = sizeof(Block);

  // Reads one whole aligned block starting at `block`.
  //
  // Sanitizers are disabled within the function, because the SIMD loads
  // may intentionally wander to uninitialized memory. The loads never touch
  // memory outside "our" cache lines, though.
  USERVER_IMPL_DISABLE_ASAN inline static Block LoadBlock(
      const char* block) noexcept {
    const char* const aligned = AssumeAligned<kBlockSize>(block);
    return _mm256_load_si256(reinterpret_cast<const Block*>(aligned));
  }

  // Writes a full block to `destination`, with the first `offset` bytes of
  // `block` dropped and the tail filled with zeros. Always stores kBlockSize
  // bytes.
  USERVER_IMPL_FORCE_INLINE static void CopyBlock(Block block,
                                                  std::size_t offset,
                                                  char* destination) noexcept {
    // The block goes into the first half of a zeroed scratch buffer twice
    // its size, so an unaligned window starting at `offset` stays in bounds.
    alignas(kBlockSize * 2) char scratch[kBlockSize * 2]{};
    _mm256_store_si256(reinterpret_cast<Block*>(scratch), block);
    const auto* const window =
        reinterpret_cast<const __m256i_u*>(scratch + offset);
    const auto shifted = _mm256_loadu_si256(window);
    _mm256_storeu_si256(reinterpret_cast<__m256i_u*>(destination), shifted);
  }

  // Returns true if any of the `count` bytes starting at `offset` within
  // `block` may need TSKV value escaping.
  USERVER_IMPL_FORCE_INLINE static bool MayNeedValueEscaping(
      Block block, std::size_t offset, std::size_t count) noexcept {
    // 'char c' may need TSKV value escaping iff c <= '\r' || c == '\\'
    const Block at_most_cr =
        _mm256_cmpgt_epi8(_mm256_set1_epi8('\r' + 1), block);
    const Block is_backslash =
        _mm256_cmpeq_epi8(block, _mm256_set1_epi8('\\'));
    // 32 lower bits of the mask contain may-need-escaping flag per block's char
    const auto flags = static_cast<std::uint32_t>(
        _mm256_movemask_epi8(_mm256_or_si256(at_most_cr, is_backslash)));
    // Drop the flags below `offset`, then push everything past `count` flags
    // out of the 32-bit value.
    const auto selected =
        static_cast<std::uint32_t>(flags >> offset << (32 - count));
    return selected != 0;
  }
};
#endif
291
292#if defined(__AVX2__)
293using SystemEncoder = EncoderAvx2;
294#elif defined(__SSSE3__)
295using SystemEncoder = EncoderSsse3;
296#elif defined(__SSE2__)
297using SystemEncoder = EncoderSse2;
298#else
299using SystemEncoder = EncoderStd;
300#endif
301
// Number of spare bytes the output buffer must provide beyond the encoded
// data: one full block of the given encoder.
//
// It is assumed that starting with the current output position, there is enough
// free space to fit the converted data, plus `PaddingSize` extra bytes of free
// space, which the encoder is free to fill with garbage.
template <typename Encoder>
constexpr std::size_t PaddingSize() {
  constexpr std::size_t kOneBlock = Encoder::kBlockSize;
  return kOneBlock;
}
309
// Current write position in the output buffer. The `Encoder` parameter is not
// used by the struct itself.
template <typename Encoder>
struct BufferPtr final {
  char* current = nullptr;
};
314
315template <typename Encoder>
316USERVER_IMPL_FORCE_INLINE BufferPtr<Encoder> AppendBlock(
317 BufferPtr<Encoder> destination, typename Encoder::Block block,
318 std::size_t offset, std::size_t count) noexcept {
319 char* const old_current = destination.current;
320 destination.current += count;
321 Encoder::CopyBlock(block, offset, old_current);
322 return destination;
323}
324
325// noinline to avoid code duplication for a cold path
326template <typename Encoder>
327[[nodiscard]] __attribute__((noinline)) BufferPtr<Encoder> EncodeValueEach(
328 BufferPtr<Encoder> destination, std::string_view str) {
329 for (const char c : str) {
330 destination.current =
331 encoding::EncodeTskv(destination.current, c, EncodeTskvMode::kValue);
332 }
333 return destination;
334}
335
336template <typename Encoder>
337[[nodiscard]] USERVER_IMPL_FORCE_INLINE BufferPtr<Encoder> EncodeValueBlock(
338 BufferPtr<Encoder> destination, const char* block, std::size_t offset,
339 std::size_t count) {
340 UASSERT(offset < Encoder::kBlockSize);
341 UASSERT(offset + count <= Encoder::kBlockSize);
342 block = AssumeAligned<Encoder::kBlockSize>(block);
343 const auto block_contents = Encoder::LoadBlock(block);
344
345 if (__builtin_expect(
346 Encoder::MayNeedValueEscaping(block_contents, offset, count),
347 false)) {
348 destination = tskv::EncodeValueEach(
349 destination, std::string_view(block + offset, count));
350 } else {
351 // happy path: the whole block does not need escaping
352 destination = tskv::AppendBlock(destination, block_contents, offset, count);
353 }
354
355 return destination;
356}
357
358// BufferPtr must be passed around by value to avoid aliasing issues.
359template <typename Encoder>
360[[nodiscard]] __attribute__((noinline)) BufferPtr<Encoder> EncodeValue(
361 BufferPtr<Encoder> destination, std::string_view str) {
362 if (str.empty()) return destination;
363
364 const char* const first_block = AlignDown<Encoder::kBlockSize>(str.data());
365 const auto first_block_offset =
366 static_cast<std::size_t>(str.data() - first_block);
367 const auto first_block_count =
368 std::min(Encoder::kBlockSize - first_block_offset, str.size());
369
370 destination = tskv::EncodeValueBlock(destination, first_block,
371 first_block_offset, first_block_count);
372
373 const char* const last_block =
374 AlignDown<Encoder::kBlockSize>(str.data() + str.size());
375
376 if (last_block != first_block) {
377 for (const char* current_block = first_block + Encoder::kBlockSize;
378 current_block < last_block; current_block += Encoder::kBlockSize) {
379 destination = tskv::EncodeValueBlock(destination, current_block, 0,
380 Encoder::kBlockSize);
381 }
382
383 const auto last_block_count =
384 static_cast<std::size_t>(str.data() + str.size() - last_block);
385 if (last_block_count != 0) {
386 destination =
387 tskv::EncodeValueBlock(destination, last_block, 0, last_block_count);
388 }
389 }
390
391 return destination;
392}
393
394template <typename Encoder>
395[[nodiscard]] BufferPtr<Encoder> DoEncode(BufferPtr<Encoder> destination,
396 std::string_view str,
397 EncodeTskvMode mode) {
398 if (mode == EncodeTskvMode::kValue) {
399 return tskv::EncodeValue(destination, str);
400 } else {
401 for (const char c : str) {
402 destination.current = encoding::EncodeTskv(destination.current, c, mode);
403 }
404 return destination;
405 }
406}
407
// Upper bound on the encoded length of `source_size` input chars: twice the
// input size.
inline std::size_t MaxEncodedSize(std::size_t source_size) noexcept {
  constexpr std::size_t kWorstCaseExpansion = 2;
  return kWorstCaseExpansion * source_size;
}
411
412template <typename Encoder, typename Container>
413void EncodeFullyBuffered(Container& container, std::string_view str,
414 EncodeTskvMode mode) {
415 const auto old_size = container.size();
416 container.resize(old_size + MaxEncodedSize(str.size()) +
417 PaddingSize<Encoder>());
418 BufferPtr<Encoder> buffer_ptr{container.data() + old_size};
419
420 buffer_ptr = tskv::DoEncode(buffer_ptr, str, mode);
421
422 container.resize(buffer_ptr.current - container.data());
423}
424
425} // namespace impl::tskv
426
427template <typename Container>
428void EncodeTskv(Container& container, std::string_view str,
429 EncodeTskvMode mode) {
430 impl::tskv::EncodeFullyBuffered<impl::tskv::SystemEncoder>(container, str,
431 mode);
432}
433
434/// @cond
// Returns true if `key` contains at least one of '\t', '\r', '\n', '\0',
// '\\', '.', '=' or an upper-case ASCII letter; false otherwise (including
// for an empty key).
inline bool ShouldKeyBeEscaped(std::string_view key) noexcept {
  // The explicit length keeps the embedded '\0' as part of the set.
  constexpr std::string_view kSpecialChars{"\t\r\n\0\\.=", 7};
  for (const char symbol : key) {
    const bool is_upper = (symbol >= 'A' && symbol <= 'Z');
    if (is_upper || kSpecialChars.find(symbol) != std::string_view::npos) {
      return true;
    }
  }
  return false;
}
453/// @endcond
454
455} // namespace utils::encoding
456
457USERVER_NAMESPACE_END
458
// Undefine the implementation-only macros defined at the top of this header,
// so that they do not leak into the files that include it.
#undef USERVER_IMPL_FORCE_INLINE
#undef USERVER_IMPL_DISABLE_ASAN