userver: userver/utils/encoding/tskv.hpp Source File
Loading...
Searching...
No Matches
tskv.hpp
Go to the documentation of this file.
1#pragma once
2
3/// @file userver/utils/encoding/tskv.hpp
4/// @brief Encoders, decoders and helpers for TSKV representations
5/// @ingroup userver_universal
6
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <limits>
#include <string_view>
#include <type_traits>
14
15#ifdef __AVX2__
16#include <immintrin.h>
17#elif defined(__SSSE3__)
18#include <tmmintrin.h>
19#elif defined(__SSE2__)
20#include <emmintrin.h>
21#endif
22
23#include <userver/utils/assert.hpp>
24
// Forces inlining at every call site. The per-character encoder relies on it
// so that checks of a `mode` known at the call site can be eliminated.
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define USERVER_IMPL_FORCE_INLINE __attribute__((always_inline)) inline

// Turns off sanitizer instrumentation for a single function. Despite the
// name, under Clang this disables AddressSanitizer, MemorySanitizer and
// ThreadSanitizer; other compilers only get AddressSanitizer disabled.
#ifdef __clang__
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define USERVER_IMPL_DISABLE_ASAN \
  __attribute__((no_sanitize_address, no_sanitize_memory, no_sanitize_thread))
#else
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define USERVER_IMPL_DISABLE_ASAN __attribute__((no_sanitize_address))
#endif
36
37USERVER_NAMESPACE_BEGIN
38
39namespace utils::encoding {
40
/// Character placed between a key and its value.
inline constexpr char kTskvKeyValueSeparator{'='};
/// Character placed between consecutive key-value pairs.
inline constexpr char kTskvPairsSeparator{'\t'};
43
/// Selects which escaping rules EncodeTskv applies.
enum class EncodeTskvMode {
  /// Key rules: '\t', '\r', '\n', '\0', '\\' and '=' are escaped with a
  /// backslash, and the letters 'A'..'Z' are lower-cased.
  kKey,
  /// Value rules: only '\t', '\r', '\n', '\0' and '\\' are escaped.
  kValue,
  /// Same as kKey, but '.' is additionally replaced with '_'. This mode is
  /// for logging: Elastic has a long history of problems with periods in
  /// TSKV keys. For more info see:
  /// www.elastic.co/guide/en/elasticsearch/reference/2.4/dots-in-names.html
  kKeyReplacePeriod,
};
48
49/// @brief Encode according to the TSKV rules, but without escaping the
50/// quotation mark (").
51/// @returns The iterator to after the inserted chars.
52template <typename OutIter>
53OutIter EncodeTskv(OutIter destination, char ch, EncodeTskvMode mode);
54
55/// @brief Encode according to the TSKV rules, but without escaping the
56/// quotation mark (").
57/// @note New contents are appended at the end of `container`. Some extra memory
58/// is reserved as necessary.
59/// @tparam Container must be continuous and support at least the following
60/// operations: 1) `c.data()` 2) `c.size()` 3) `c.resize(new_size)`
61template <typename Container>
62void EncodeTskv(Container& container, std::string_view str,
63 EncodeTskvMode mode);
64
65// ==================== Implementation follows ====================
66
67// always_inline to eliminate 'mode' checks
68template <typename OutIter>
69USERVER_IMPL_FORCE_INLINE OutIter EncodeTskv(OutIter destination, char ch,
70 EncodeTskvMode mode) {
71 const bool is_key_encoding = (mode == EncodeTskvMode::kKey ||
72 mode == EncodeTskvMode::kKeyReplacePeriod);
73 const auto append = [&destination](char ch) { *(destination++) = ch; };
74
75 switch (ch) {
76 case '\t':
77 append('\\');
78 append('t');
79 break;
80 case '\r':
81 append('\\');
82 append('r');
83 break;
84 case '\n':
85 append('\\');
86 append('n');
87 break;
88 case '\0':
89 append('\\');
90 append('0');
91 break;
92 case '\\':
93 append('\\');
94 append(ch);
95 break;
96 case '.':
97 if (mode == EncodeTskvMode::kKeyReplacePeriod) {
98 append('_');
99 break;
100 }
101 [[fallthrough]];
102 case 'A':
103 case 'B':
104 case 'C':
105 case 'D':
106 case 'E':
107 case 'F':
108 case 'G':
109 case 'H':
110 case 'I':
111 case 'J':
112 case 'K':
113 case 'L':
114 case 'M':
115 case 'N':
116 case 'O':
117 case 'P':
118 case 'Q':
119 case 'R':
120 case 'S':
121 case 'T':
122 case 'U':
123 case 'V':
124 case 'W':
125 case 'X':
126 case 'Y':
127 case 'Z':
128 if (is_key_encoding) {
129 append(ch | 0x20); // ch - 'A' + 'a'
130 break;
131 }
132 [[fallthrough]];
133 case '=':
134 if (is_key_encoding) {
135 append('\\');
136 append(ch);
137 break;
138 }
139 [[fallthrough]];
140 default:
141 append(ch);
142 break;
143 }
144 return destination;
145}
146
147namespace impl::tskv {
148
149template <std::size_t Alignment, typename T>
150USERVER_IMPL_FORCE_INLINE T* AlignDown(T* ptr) noexcept {
151 static_assert(Alignment % sizeof(T) == 0);
152 return reinterpret_cast<T*>(reinterpret_cast<std::uintptr_t>(ptr) /
153 Alignment * Alignment);
154}
155
156template <std::size_t Alignment>
157USERVER_IMPL_FORCE_INLINE const char* AssumeAligned(
158 const char* block) noexcept {
159 UASSERT(reinterpret_cast<std::uintptr_t>(block) % Alignment == 0);
160 return static_cast<const char*>(__builtin_assume_aligned(block, Alignment));
161}
162
// Builds a 32-entry table: entries 0..15 hold their own index, entries 16..31
// hold 0xf0. A 16-byte window read at position `offset` gives the shuffle
// control used by EncoderSsse3::CopyBlock to shift a block right by `offset`
// bytes. 0xf0 has the high bit set, which `_mm_shuffle_epi8` turns into a
// zero output byte.
constexpr auto MakeShuffleIndicesForRightShift() noexcept {
  constexpr std::size_t kLaneCount = 16;
  constexpr std::uint8_t kZeroLane = 0xf0;

  std::array<std::uint8_t, kLaneCount * 2> indices{};
  for (std::size_t pos = 0; pos < indices.size(); ++pos) {
    indices[pos] =
        pos < kLaneCount ? static_cast<std::uint8_t>(pos) : kZeroLane;
  }
  return indices;
}
170
171struct EncoderStd final {
172 using Block = std::uint64_t;
173 static constexpr std::size_t kBlockSize = sizeof(Block);
174
175 // Sanitizers are disabled within the function, because the SIMD loads
176 // may intentionally wander to uninitialized memory. The loads never touch
177 // memory outside "our" cache lines, though.
178 USERVER_IMPL_DISABLE_ASAN inline static auto LoadBlock(
179 const char* block) noexcept {
180 block = AssumeAligned<kBlockSize>(block);
181 return *reinterpret_cast<const Block*>(block);
182 }
183
184 USERVER_IMPL_FORCE_INLINE static void CopyBlock(Block block,
185 std::size_t offset,
186 char* destination) noexcept {
187 const auto cut_block = block >> (offset * 8);
188 std::memcpy(destination, &cut_block, sizeof(cut_block));
189 }
190
191 USERVER_IMPL_FORCE_INLINE static bool MayNeedValueEscaping(
192 Block block, std::size_t offset, std::size_t count) noexcept {
193 char buffer[kBlockSize]{};
194 std::memcpy(&buffer, &block, sizeof(block));
195 for (const char c : std::string_view(buffer + offset, count)) {
196 if (c <= '\r' || c == '\\') return true;
197 }
198 return false;
199 }
200};
201
202#ifdef __SSE2__
203struct EncoderSse2 {
204 using Block = __m128i;
205 static constexpr std::size_t kBlockSize = sizeof(Block);
206
207 // Sanitizers are disabled within the function, because the SIMD loads
208 // may intentionally wander to uninitialized memory. The loads never touch
209 // memory outside "our" cache lines, though.
210 USERVER_IMPL_DISABLE_ASAN inline static Block LoadBlock(
211 const char* block) noexcept {
212 block = AssumeAligned<kBlockSize>(block);
213 return _mm_load_si128(reinterpret_cast<const Block*>(block));
214 }
215
216 USERVER_IMPL_FORCE_INLINE static void CopyBlock(Block block_contents,
217 std::size_t offset,
218 char* destination) noexcept {
219 alignas(kBlockSize * 2) char storage[kBlockSize * 2]{};
220 _mm_store_si128(reinterpret_cast<Block*>(&storage), block_contents);
221 const auto cut_block =
222 _mm_loadu_si128(reinterpret_cast<__m128i_u*>(&storage[offset]));
223 _mm_storeu_si128(reinterpret_cast<__m128i_u*>(destination), cut_block);
224 }
225
226 USERVER_IMPL_FORCE_INLINE static bool MayNeedValueEscaping(
227 Block block, std::size_t offset, std::size_t count) noexcept {
228 // 'char c' may need TSKV value escaping iff c <= '\r' || c == '\\'
229 // 16 lower bits of the mask contain may-need-escaping flag per block's char
230 const auto may_need_escaping_mask = _mm_movemask_epi8(
231 _mm_or_si128(_mm_cmpgt_epi8(_mm_set1_epi8('\r' + 1), block),
232 _mm_cmpeq_epi8(block, _mm_set1_epi8('\\'))));
233 return static_cast<std::uint32_t>(
234 static_cast<std::uint32_t>(may_need_escaping_mask) >>
235 offset << (32 - count)) != 0;
236 }
237};
238#endif
239
240#ifdef __SSSE3__
241struct EncoderSsse3 final : public EncoderSse2 {
242 USERVER_IMPL_FORCE_INLINE static void CopyBlock(Block block,
243 std::size_t offset,
244 char* destination) noexcept {
245 static constexpr auto kShuffleIdx = MakeShuffleIndicesForRightShift();
246 const auto pos = _mm_loadu_si128(
247 reinterpret_cast<const __m128i_u*>(&kShuffleIdx[offset]));
248 const auto cut_block = _mm_shuffle_epi8(block, pos);
249 _mm_storeu_si128(reinterpret_cast<__m128i_u*>(destination), cut_block);
250 }
251};
252#endif
253
254#ifdef __AVX2__
255struct EncoderAvx2 final {
256 using Block = __m256i;
257 static constexpr std::size_t kBlockSize = sizeof(Block);
258
259 // Sanitizers are disabled within the function, because the SIMD loads
260 // may intentionally wander to uninitialized memory. The loads never touch
261 // memory outside "our" cache lines, though.
262 USERVER_IMPL_DISABLE_ASAN inline static Block LoadBlock(
263 const char* block) noexcept {
264 block = AssumeAligned<kBlockSize>(block);
265 return _mm256_load_si256(reinterpret_cast<const Block*>(block));
266 }
267
268 USERVER_IMPL_FORCE_INLINE static void CopyBlock(Block block,
269 std::size_t offset,
270 char* destination) noexcept {
271 alignas(kBlockSize * 2) char storage[kBlockSize * 2]{};
272 _mm256_store_si256(reinterpret_cast<Block*>(&storage), block);
273 const auto cut_block =
274 _mm256_loadu_si256(reinterpret_cast<__m256i_u*>(&storage[offset]));
275 _mm256_storeu_si256(reinterpret_cast<__m256i_u*>(destination), cut_block);
276 }
277
278 USERVER_IMPL_FORCE_INLINE static bool MayNeedValueEscaping(
279 Block block, std::size_t offset, std::size_t count) noexcept {
280 // 'char c' may need TSKV value escaping iff c <= '\r' || c == '\\'
281 // 32 lower bits of the mask contain may-need-escaping flag per block's char
282 const auto may_need_escaping_mask = _mm256_movemask_epi8(
283 _mm256_or_si256(_mm256_cmpgt_epi8(_mm256_set1_epi8('\r' + 1), block),
284 _mm256_cmpeq_epi8(block, _mm256_set1_epi8('\\'))));
285 return static_cast<std::uint32_t>(
286 static_cast<std::uint32_t>(may_need_escaping_mask) >>
287 offset << (32 - count)) != 0;
288 }
289};
290#endif
291
292#if defined(__AVX2__)
293using SystemEncoder = EncoderAvx2;
294#elif defined(__SSSE3__)
295using SystemEncoder = EncoderSsse3;
296#elif defined(__SSE2__)
297using SystemEncoder = EncoderSse2;
298#else
299using SystemEncoder = EncoderStd;
300#endif
301
// Callers must guarantee that, starting at the current output position, the
// buffer has room for the converted data plus `PaddingSize` extra bytes. The
// encoder is free to fill those extra bytes with garbage.
template <typename Encoder>
constexpr std::size_t PaddingSize() {
  constexpr std::size_t kSlackBytes = Encoder::kBlockSize;
  return kSlackBytes;
}
309
// Output cursor tagged with the type of the encoder that writes through it.
template <typename Encoder>
struct BufferPtr final {
  // Position of the next byte to write; null by default.
  char* current = nullptr;
};
314
315template <typename Encoder>
316USERVER_IMPL_FORCE_INLINE BufferPtr<Encoder> AppendBlock(
317 BufferPtr<Encoder> destination, typename Encoder::Block block,
318 std::size_t offset, std::size_t count) noexcept {
319 char* const old_current = destination.current;
320 destination.current += count;
321 Encoder::CopyBlock(block, offset, old_current);
322 return destination;
323}
324
325// noinline to avoid code duplication for a cold path
326template <typename Encoder>
327[[nodiscard]] __attribute__((noinline)) BufferPtr<Encoder> EncodeValueEach(
328 BufferPtr<Encoder> destination, std::string_view str) {
329 for (const char c : str) {
330 destination.current =
331 encoding::EncodeTskv(destination.current, c, EncodeTskvMode::kValue);
332 }
333 return destination;
334}
335
336template <typename Encoder>
337[[nodiscard]] USERVER_IMPL_FORCE_INLINE BufferPtr<Encoder> EncodeValueBlock(
338 BufferPtr<Encoder> destination, const char* block, std::size_t offset,
339 std::size_t count) {
340 UASSERT(offset < Encoder::kBlockSize);
341 UASSERT(offset + count <= Encoder::kBlockSize);
342 block = AssumeAligned<Encoder::kBlockSize>(block);
343 const auto block_contents = Encoder::LoadBlock(block);
344
345 if (__builtin_expect(
346 Encoder::MayNeedValueEscaping(block_contents, offset, count),
347 false)) {
348 destination = tskv::EncodeValueEach(
349 destination, std::string_view(block + offset, count));
350 } else {
351 // happy path: the whole block does not need escaping
352 destination = tskv::AppendBlock(destination, block_contents, offset, count);
353 }
354
355 return destination;
356}
357
358// BufferPtr must be passed around by value to avoid aliasing issues.
359template <typename Encoder>
360[[nodiscard]] __attribute__((noinline)) BufferPtr<Encoder> EncodeValue(
361 BufferPtr<Encoder> destination, std::string_view str) {
362 if (str.empty()) return destination;
363
364 const char* const first_block = AlignDown<Encoder::kBlockSize>(str.data());
365 const auto first_block_offset =
366 static_cast<std::size_t>(str.data() - first_block);
367 const auto first_block_count =
368 std::min(Encoder::kBlockSize - first_block_offset, str.size());
369
370 destination = tskv::EncodeValueBlock(destination, first_block,
371 first_block_offset, first_block_count);
372
373 const char* const last_block =
374 AlignDown<Encoder::kBlockSize>(str.data() + str.size());
375
376 if (last_block != first_block) {
377 for (const char* current_block = first_block + Encoder::kBlockSize;
378 current_block < last_block; current_block += Encoder::kBlockSize) {
379 destination = tskv::EncodeValueBlock(destination, current_block, 0,
380 Encoder::kBlockSize);
381 }
382
383 const auto last_block_count =
384 static_cast<std::size_t>(str.data() + str.size() - last_block);
385 if (last_block_count != 0) {
386 destination =
387 tskv::EncodeValueBlock(destination, last_block, 0, last_block_count);
388 }
389 }
390
391 return destination;
392}
393
394template <typename Encoder>
395[[nodiscard]] BufferPtr<Encoder> DoEncode(BufferPtr<Encoder> destination,
396 std::string_view str,
397 EncodeTskvMode mode) {
398 if (mode == EncodeTskvMode::kValue) {
399 return tskv::EncodeValue(destination, str);
400 } else {
401 for (const char c : str) {
402 destination.current = encoding::EncodeTskv(destination.current, c, mode);
403 }
404 return destination;
405 }
406}
407
// Upper bound on the encoded length of `source_size` input characters: the
// per-character encoder writes at most two output characters for each input
// character.
constexpr std::size_t MaxEncodedSize(std::size_t source_size) noexcept {
  return source_size * 2;
}
411
412template <typename Encoder, typename Container>
413void EncodeFullyBuffered(Container& container, std::string_view str,
414 EncodeTskvMode mode) {
415 const auto old_size = container.size();
416 container.resize(old_size + MaxEncodedSize(str.size()) +
417 PaddingSize<Encoder>());
418 BufferPtr<Encoder> buffer_ptr{container.data() + old_size};
419
420 buffer_ptr = tskv::DoEncode(buffer_ptr, str, mode);
421
422 container.resize(buffer_ptr.current - container.data());
423}
424
425} // namespace impl::tskv
426
427template <typename Container>
428void EncodeTskv(Container& container, std::string_view str,
429 EncodeTskvMode mode) {
430 impl::tskv::EncodeFullyBuffered<impl::tskv::SystemEncoder>(container, str,
431 mode);
432}
433
434/// @cond
// Returns true if `key` contains '\t', '\r', '\n', '\0', '\\', '.', '=' or
// any of the letters 'A'..'Z'.
inline bool ShouldKeyBeEscaped(std::string_view key) noexcept {
  const auto is_special = [](char symbol) noexcept {
    if ('A' <= symbol && symbol <= 'Z') return true;
    return symbol == '\t' || symbol == '\r' || symbol == '\n' ||
           symbol == '\0' || symbol == '\\' || symbol == '.' || symbol == '=';
  };

  for (const char symbol : key) {
    if (is_special(symbol)) return true;
  }
  return false;
}
453/// @endcond
454
455} // namespace utils::encoding
456
457USERVER_NAMESPACE_END
458
// Keep the helper macros private to this header.
#undef USERVER_IMPL_FORCE_INLINE
#undef USERVER_IMPL_DISABLE_ASAN