userver: userver/utils/encoding/tskv.hpp Source File
Loading...
Searching...
No Matches
tskv.hpp
Go to the documentation of this file.
1#pragma once
2
3/// @file userver/utils/encoding/tskv.hpp
4/// @brief Encoders, decoders and helpers for TSKV representations
5/// @ingroup userver_universal
6
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <limits>
#include <string_view>
#include <type_traits>
14
15#ifdef __AVX2__
16#include <immintrin.h>
17#elif defined(__SSSE3__)
18#include <tmmintrin.h>
19#elif defined(__SSE2__)
20#include <emmintrin.h>
21#endif
22
23#include <userver/utils/assert.hpp>
24
// Marks a function as `inline` and additionally asks the compiler to always
// inline it (GCC/Clang `always_inline` attribute).
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define USERVER_IMPL_FORCE_INLINE __attribute__((always_inline)) inline

// USERVER_IMPL_DISABLE_ASAN switches sanitizer instrumentation off for the
// function it annotates. Under Clang the address, memory and thread sanitizers
// are all disabled; for other compilers only `no_sanitize_address` is applied.
#ifdef __clang__
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define USERVER_IMPL_DISABLE_ASAN __attribute__((no_sanitize_address, no_sanitize_memory, no_sanitize_thread))
#else
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define USERVER_IMPL_DISABLE_ASAN __attribute__((no_sanitize_address))
#endif
35
36USERVER_NAMESPACE_BEGIN
37
38namespace utils::encoding {
39
/// The character '=' that, going by its name, separates a key from its value.
constexpr inline char kTskvKeyValueSeparator = '=';
/// The character '\t' that, going by its name, separates consecutive pairs.
constexpr inline char kTskvPairsSeparator = '\t';

// Selects the escaping rules applied by EncodeTskv:
//  - kKey: control characters and backslash are escaped, '=' is escaped,
//    upper-case ASCII letters are lower-cased;
//  - kValue: only control characters and backslash are escaped;
//  - kKeyReplacePeriod: same as kKey, and '.' is replaced with '_'.
//
// kKeyReplacePeriod is for logging. Elastic has a long history of problems with
// periods in TSKV keys. For more info see:
// www.elastic.co/guide/en/elasticsearch/reference/2.4/dots-in-names.html
enum class EncodeTskvMode { kKey, kValue, kKeyReplacePeriod };
47
/// @brief Encode according to the TSKV rules, but without escaping the
/// quotation mark (").
/// @param destination output iterator that receives the encoded characters
/// @param ch the single character to encode
/// @param mode selects the key or value escaping rules
/// @returns The iterator pointing past the last inserted char.
template <typename OutIter>
OutIter EncodeTskv(OutIter destination, char ch, EncodeTskvMode mode);

/// @brief Encode according to the TSKV rules, but without escaping the
/// quotation mark (").
/// @note New contents are appended at the end of `container`. Some extra memory
/// is reserved as necessary.
/// @tparam Container must be contiguous and support at least the following
/// operations: 1) `c.data()` 2) `c.size()` 3) `c.resize(new_size)`
/// @param container the buffer the encoded text is appended to
/// @param str the text to encode
/// @param mode selects the key or value escaping rules
template <typename Container>
void EncodeTskv(Container& container, std::string_view str, EncodeTskvMode mode);
62
63// ==================== Implementation follows ====================
64
65// always_inline to eliminate 'mode' checks
66template <typename OutIter>
67USERVER_IMPL_FORCE_INLINE OutIter EncodeTskv(OutIter destination, char ch, EncodeTskvMode mode) {
68 const bool is_key_encoding = (mode == EncodeTskvMode::kKey || mode == EncodeTskvMode::kKeyReplacePeriod);
69 const auto append = [&destination](char ch) { *(destination++) = ch; };
70
71 switch (ch) {
72 case '\t':
73 append('\\');
74 append('t');
75 break;
76 case '\r':
77 append('\\');
78 append('r');
79 break;
80 case '\n':
81 append('\\');
82 append('n');
83 break;
84 case '\0':
85 append('\\');
86 append('0');
87 break;
88 case '\\':
89 append('\\');
90 append(ch);
91 break;
92 case '.':
93 if (mode == EncodeTskvMode::kKeyReplacePeriod) {
94 append('_');
95 break;
96 }
97 [[fallthrough]];
98 case 'A':
99 case 'B':
100 case 'C':
101 case 'D':
102 case 'E':
103 case 'F':
104 case 'G':
105 case 'H':
106 case 'I':
107 case 'J':
108 case 'K':
109 case 'L':
110 case 'M':
111 case 'N':
112 case 'O':
113 case 'P':
114 case 'Q':
115 case 'R':
116 case 'S':
117 case 'T':
118 case 'U':
119 case 'V':
120 case 'W':
121 case 'X':
122 case 'Y':
123 case 'Z':
124 if (is_key_encoding) {
125 append(ch | 0x20); // ch - 'A' + 'a'
126 break;
127 }
128 [[fallthrough]];
129 case '=':
130 if (is_key_encoding) {
131 append('\\');
132 append(ch);
133 break;
134 }
135 [[fallthrough]];
136 default:
137 append(ch);
138 break;
139 }
140 return destination;
141}
142
143namespace impl::tskv {
144
145template <std::size_t Alignment, typename T>
146USERVER_IMPL_FORCE_INLINE T* AlignDown(T* ptr) noexcept {
147 static_assert(Alignment % sizeof(T) == 0);
148 return reinterpret_cast<T*>(reinterpret_cast<std::uintptr_t>(ptr) / Alignment * Alignment);
149}
150
151template <std::size_t Alignment>
152USERVER_IMPL_FORCE_INLINE const char* AssumeAligned(const char* block) noexcept {
153 UASSERT(reinterpret_cast<std::uintptr_t>(block) % Alignment == 0);
154 return static_cast<const char*>(__builtin_assume_aligned(block, Alignment));
155}
156
// Builds a 32-entry table: entries 0..15 hold their own index, entries 16..31
// hold 0xf0. A 16-byte window starting at position `k` of this table is used
// as a byte-shuffle mask that shifts a 16-byte block right by `k` bytes.
constexpr auto MakeShuffleIndicesForRightShift() noexcept {
    constexpr std::size_t kLanes = 16;
    std::array<std::uint8_t, kLanes * 2> indices{};
    for (std::size_t pos = 0; pos < indices.size(); ++pos) {
        indices[pos] = pos < kLanes ? static_cast<std::uint8_t>(pos) : std::uint8_t{0xf0};
    }
    return indices;
}
168
169struct EncoderStd final {
170 using Block = std::uint64_t;
171 static constexpr std::size_t kBlockSize = sizeof(Block);
172
173 // Sanitizers are disabled within the function, because the SIMD loads
174 // may intentionally wander to uninitialized memory. The loads never touch
175 // memory outside "our" cache lines, though.
176 USERVER_IMPL_DISABLE_ASAN inline static auto LoadBlock(const char* block) noexcept {
177 block = AssumeAligned<kBlockSize>(block);
178 return *reinterpret_cast<const Block*>(block);
179 }
180
181 USERVER_IMPL_FORCE_INLINE static void CopyBlock(Block block, std::size_t offset, char* destination) noexcept {
182 const auto cut_block = block >> (offset * 8);
183 std::memcpy(destination, &cut_block, sizeof(cut_block));
184 }
185
186 USERVER_IMPL_FORCE_INLINE static bool MayNeedValueEscaping(Block block, std::size_t offset, std::size_t count)
187 noexcept {
188 char buffer[kBlockSize]{};
189 std::memcpy(&buffer, &block, sizeof(block));
190 for (const char c : std::string_view(buffer + offset, count)) {
191 if (c <= '\r' || c == '\\') {
192 return true;
193 }
194 }
195 return false;
196 }
197};
198
199#ifdef __SSE2__
200struct EncoderSse2 {
201 using Block = __m128i;
202 static constexpr std::size_t kBlockSize = sizeof(Block);
203
204 // Sanitizers are disabled within the function, because the SIMD loads
205 // may intentionally wander to uninitialized memory. The loads never touch
206 // memory outside "our" cache lines, though.
207 USERVER_IMPL_DISABLE_ASAN inline static Block LoadBlock(const char* block) noexcept {
208 block = AssumeAligned<kBlockSize>(block);
209 return _mm_load_si128(reinterpret_cast<const Block*>(block));
210 }
211
212 USERVER_IMPL_FORCE_INLINE static void CopyBlock(Block block_contents, std::size_t offset, char* destination)
213 noexcept {
214 alignas(kBlockSize * 2) char storage[kBlockSize * 2]{};
215 _mm_store_si128(reinterpret_cast<Block*>(&storage), block_contents);
216 const auto cut_block = _mm_loadu_si128(reinterpret_cast<__m128i_u*>(&storage[offset]));
217 _mm_storeu_si128(reinterpret_cast<__m128i_u*>(destination), cut_block);
218 }
219
220 USERVER_IMPL_FORCE_INLINE static bool MayNeedValueEscaping(Block block, std::size_t offset, std::size_t count)
221 noexcept {
222 // 'char c' may need TSKV value escaping iff c <= '\r' || c == '\\'
223 // 16 lower bits of the mask contain may-need-escaping flag per block's char
224 const auto may_need_escaping_mask = _mm_movemask_epi8(
225 _mm_or_si128(_mm_cmpgt_epi8(_mm_set1_epi8('\r' + 1), block), _mm_cmpeq_epi8(block, _mm_set1_epi8('\\')))
226 );
227 return static_cast<
228 std::uint32_t>(static_cast<std::uint32_t>(may_need_escaping_mask) >> offset << (32 - count)) != 0;
229 }
230};
231#endif
232
233#ifdef __SSSE3__
234struct EncoderSsse3 final : public EncoderSse2 {
235 USERVER_IMPL_FORCE_INLINE static void CopyBlock(Block block, std::size_t offset, char* destination) noexcept {
236 static constexpr auto kShuffleIdx = MakeShuffleIndicesForRightShift();
237 const auto pos = _mm_loadu_si128(reinterpret_cast<const __m128i_u*>(&kShuffleIdx[offset]));
238 const auto cut_block = _mm_shuffle_epi8(block, pos);
239 _mm_storeu_si128(reinterpret_cast<__m128i_u*>(destination), cut_block);
240 }
241};
242#endif
243
244#ifdef __AVX2__
245struct EncoderAvx2 final {
246 using Block = __m256i;
247 static constexpr std::size_t kBlockSize = sizeof(Block);
248
249 // Sanitizers are disabled within the function, because the SIMD loads
250 // may intentionally wander to uninitialized memory. The loads never touch
251 // memory outside "our" cache lines, though.
252 USERVER_IMPL_DISABLE_ASAN inline static Block LoadBlock(const char* block) noexcept {
253 block = AssumeAligned<kBlockSize>(block);
254 return _mm256_load_si256(reinterpret_cast<const Block*>(block));
255 }
256
257 USERVER_IMPL_FORCE_INLINE static void CopyBlock(Block block, std::size_t offset, char* destination) noexcept {
258 alignas(kBlockSize * 2) char storage[kBlockSize * 2]{};
259 _mm256_store_si256(reinterpret_cast<Block*>(&storage), block);
260 const auto cut_block = _mm256_loadu_si256(reinterpret_cast<__m256i_u*>(&storage[offset]));
261 _mm256_storeu_si256(reinterpret_cast<__m256i_u*>(destination), cut_block);
262 }
263
264 USERVER_IMPL_FORCE_INLINE static bool MayNeedValueEscaping(Block block, std::size_t offset, std::size_t count)
265 noexcept {
266 // 'char c' may need TSKV value escaping iff c <= '\r' || c == '\\'
267 // 32 lower bits of the mask contain may-need-escaping flag per block's char
268 const auto may_need_escaping_mask = _mm256_movemask_epi8(_mm256_or_si256(
269 _mm256_cmpgt_epi8(_mm256_set1_epi8('\r' + 1), block),
270 _mm256_cmpeq_epi8(block, _mm256_set1_epi8('\\'))
271 ));
272 return static_cast<
273 std::uint32_t>(static_cast<std::uint32_t>(may_need_escaping_mask) >> offset << (32 - count)) != 0;
274 }
275};
276#endif
277
// SystemEncoder is the encoder used by the public EncodeTskv entry point. The
// widest instruction set enabled at compile time wins: AVX2, then SSSE3, then
// SSE2; without any of them the portable EncoderStd is used.
#if defined(__AVX2__)
using SystemEncoder = EncoderAvx2;
#elif defined(__SSSE3__)
using SystemEncoder = EncoderSsse3;
#elif defined(__SSE2__)
using SystemEncoder = EncoderSse2;
#else
using SystemEncoder = EncoderStd;
#endif
287
// Number of spare bytes that must be available past the encoded output.
//
// It is assumed that starting with the current output position, there is enough
// free space to fit the converted data, plus `PaddingSize` extra bytes of free
// space, which the encoder is free to fill with garbage. The padding equals one
// block of the given encoder.
template <typename Encoder>
constexpr std::size_t PaddingSize() {
    constexpr std::size_t kScratchBytes = Encoder::kBlockSize;
    return kScratchBytes;
}
295
// Write cursor into the output buffer, tagged with the encoder type that
// produces the data. `current` is the position of the next byte to write.
template <typename Encoder>
struct BufferPtr final {
    char* current = nullptr;
};
300
301template <typename Encoder>
302USERVER_IMPL_FORCE_INLINE BufferPtr<Encoder> AppendBlock(
303 BufferPtr<Encoder> destination,
304 typename Encoder::Block block,
305 std::size_t offset,
306 std::size_t count
307) noexcept {
308 char* const old_current = destination.current;
309 destination.current += count;
310 Encoder::CopyBlock(block, offset, old_current);
311 return destination;
312}
313
314// noinline to avoid code duplication for a cold path
315template <typename Encoder>
316[[nodiscard]] __attribute__((noinline)) BufferPtr<Encoder> EncodeValueEach(
317 BufferPtr<Encoder> destination,
318 std::string_view str
319) {
320 for (const char c : str) {
321 destination.current = encoding::EncodeTskv(destination.current, c, EncodeTskvMode::kValue);
322 }
323 return destination;
324}
325
326template <typename Encoder>
327[[nodiscard]] USERVER_IMPL_FORCE_INLINE BufferPtr<Encoder> EncodeValueBlock(
328 BufferPtr<Encoder> destination,
329 const char* block,
330 std::size_t offset,
331 std::size_t count
332) {
333 UASSERT(offset < Encoder::kBlockSize);
334 UASSERT(offset + count <= Encoder::kBlockSize);
335 block = AssumeAligned<Encoder::kBlockSize>(block);
336 const auto block_contents = Encoder::LoadBlock(block);
337
338 if (__builtin_expect(Encoder::MayNeedValueEscaping(block_contents, offset, count), false)) {
339 destination = tskv::EncodeValueEach(destination, std::string_view(block + offset, count));
340 } else {
341 // happy path: the whole block does not need escaping
342 destination = tskv::AppendBlock(destination, block_contents, offset, count);
343 }
344
345 return destination;
346}
347
348// BufferPtr must be passed around by value to avoid aliasing issues.
349template <typename Encoder>
350[[nodiscard]] __attribute__((noinline)) BufferPtr<Encoder> EncodeValue(
351 BufferPtr<Encoder> destination,
352 std::string_view str
353) {
354 if (str.empty()) {
355 return destination;
356 }
357
358 const char* const first_block = AlignDown<Encoder::kBlockSize>(str.data());
359 const auto first_block_offset = static_cast<std::size_t>(str.data() - first_block);
360 const auto first_block_count = std::min(Encoder::kBlockSize - first_block_offset, str.size());
361
362 destination = tskv::EncodeValueBlock(destination, first_block, first_block_offset, first_block_count);
363
364 const char* const last_block = AlignDown<Encoder::kBlockSize>(str.data() + str.size());
365
366 if (last_block != first_block) {
367 for (const char* current_block = first_block + Encoder::kBlockSize; current_block < last_block;
368 current_block += Encoder::kBlockSize)
369 {
370 destination = tskv::EncodeValueBlock(destination, current_block, 0, Encoder::kBlockSize);
371 }
372
373 const auto last_block_count = static_cast<std::size_t>(str.data() + str.size() - last_block);
374 if (last_block_count != 0) {
375 destination = tskv::EncodeValueBlock(destination, last_block, 0, last_block_count);
376 }
377 }
378
379 return destination;
380}
381
382template <typename Encoder>
383[[nodiscard]] BufferPtr<Encoder> DoEncode(BufferPtr<Encoder> destination, std::string_view str, EncodeTskvMode mode) {
384 if (mode == EncodeTskvMode::kValue) {
385 return tskv::EncodeValue(destination, str);
386 } else {
387 for (const char c : str) {
388 destination.current = encoding::EncodeTskv(destination.current, c, mode);
389 }
390 return destination;
391 }
392}
393
// Upper bound on the encoded size of `source_size` input characters: twice the
// input size.
inline std::size_t MaxEncodedSize(std::size_t source_size) noexcept {
    constexpr std::size_t kWorstCaseFactor = 2;
    return source_size * kWorstCaseFactor;
}
395
396template <typename Encoder, typename Container>
397void EncodeFullyBuffered(Container& container, std::string_view str, EncodeTskvMode mode) {
398 const auto old_size = container.size();
399 container.resize(old_size + MaxEncodedSize(str.size()) + PaddingSize<Encoder>());
400 BufferPtr<Encoder> buffer_ptr{container.data() + old_size};
401
402 buffer_ptr = tskv::DoEncode(buffer_ptr, str, mode);
403
404 container.resize(buffer_ptr.current - container.data());
405}
406
407} // namespace impl::tskv
408
409template <typename Container>
410void EncodeTskv(Container& container, std::string_view str, EncodeTskvMode mode) {
411 impl::tskv::EncodeFullyBuffered<impl::tskv::SystemEncoder>(container, str, mode);
412}
413
414/// @cond
// Returns true if `key` contains at least one of: tab, CR, LF, NUL, backslash,
// '.', '=' or an upper-case ASCII letter.
inline bool ShouldKeyBeEscaped(std::string_view key) noexcept {
    const auto is_special = [](char ch) noexcept {
        if ('A' <= ch && ch <= 'Z') {
            return true;
        }
        return ch == '\t' || ch == '\r' || ch == '\n' || ch == '\0' || ch == '\\' || ch == '.' || ch == '=';
    };

    for (const char ch : key) {
        if (is_special(ch)) {
            return true;
        }
    }
    return false;
}
435
// Returns true if `key` contains at least one character that the value
// encoding rewrites: tab, CR, LF, NUL or backslash.
//
// The whole string is scanned with a plain loop, so the function is valid for
// input of any length (including empty) and any alignment. A block-wise SIMD
// load is deliberately not used here: it requires a block-aligned pointer and
// can only inspect a single block.
inline bool ShouldValueBeEscaped(std::string_view key) noexcept {
    for (const char ch : key) {
        switch (ch) {
            case '\t':
            case '\r':
            case '\n':
            case '\0':
            case '\\':
                return true;
            default:
                break;
        }
    }
    return false;
}
441/// @endcond
442
443} // namespace utils::encoding
444
445USERVER_NAMESPACE_END
446
// Do not leak the implementation-detail macros defined at the top of this
// header into the files that include it.
#undef USERVER_IMPL_FORCE_INLINE
#undef USERVER_IMPL_DISABLE_ASAN