RESTinio
percent_encoding.hpp
Go to the documentation of this file.
1/*
2 restinio
3*/
4
9#pragma once
10
11#include <string>
12
14
17#include <restinio/expected.hpp>
18
20
21namespace restinio
22{
23
24namespace utils
25{
26
36{
// Tells whether `c` can be copied to/from the encoded text as is,
// i.e. without percent-encoding.
//
// The accepted set is: ASCII letters, decimal digits and the
// punctuation marks '-', '.', '_' and '~'.
static constexpr bool
ordinary_char( char c ) noexcept
{
	return
		// Letters and digits.
		( c >= 'A' && c <= 'Z' ) ||
		( c >= 'a' && c <= 'z' ) ||
		( c >= '0' && c <= '9' ) ||
		// Allowed punctuation.
		c == '_' || c == '~' || c == '-' || c == '.';
}
49};
50
61{
// Tells whether `c` can be copied to/from the encoded text as is,
// i.e. without percent-encoding.
//
// The accepted set is: ASCII letters, decimal digits and the
// punctuation marks '*', '-', '.' and '_'. Note that '~' is NOT
// a part of this set.
static constexpr bool
ordinary_char( char c ) noexcept
{
	return
		// Allowed punctuation.
		c == '_' || c == '.' || c == '-' || c == '*' ||
		// Letters and digits.
		( c >= 'A' && c <= 'Z' ) ||
		( c >= 'a' && c <= 'z' ) ||
		( c >= '0' && c <= '9' );
}
74};
75
97{
// Tells whether `c` is accepted as is (without percent-encoding)
// in the relaxed mode.
//
// The accepted set is: space, ASCII letters, decimal digits,
// the unreserved marks "-._~", the gen-delims ":/?#[]@" and
// the symbols "!$&'()*+,;=".
static bool
ordinary_char( char c ) noexcept
{
	// std::strchr() considers the terminating NUL to be a part of the
	// searched string, so a lookup for '\0' always succeeds. NUL is not
	// a member of the allowed set and has to be rejected explicitly.
	return '\0' != c && nullptr != std::strchr(
			" " // Space
			"ABCDEFGHIJKLMNOPQRSTUVWXYZ" // ALPHA
			"abcdefghijklmnopqrstuvwxyz"
			"0123456789" // DIGIT
			"-._~" // unreserved
			":/?#[]@" // gen-delims
			"!$&'()*+,;=", c );
}
110};
111
125{
// Tells whether `c` can be copied to/from the encoded text as is,
// i.e. without percent-encoding.
//
// The accepted set is: ASCII letters, decimal digits and the
// punctuation marks '-', '.', '~', '_', '*', '!', '\'', '(' and ')'.
static constexpr bool
ordinary_char( char c ) noexcept
{
	return
		// Letters and digits.
		( c >= 'a' && c <= 'z' ) ||
		( c >= 'A' && c <= 'Z' ) ||
		( c >= '0' && c <= '9' ) ||
		// Marks shared with the basic unreserved set.
		c == '-' || c == '.' || c == '_' || c == '~' ||
		// Additional marks accepted by this set.
		c == '!' || c == '*' || c == '\'' || c == '(' || c == ')';
}
143};
144
152
// Type that indicates a failure of unescaping of percent-encoded symbols.
// It carries a textual description of the failure.
// NOTE(review): the class head, the constructor name/initializer and the
// body of giveout_description() are not visible in this listing.
160{
// Description of a failure.
162 std::string m_description;
163
164public:
// Constructor; receives the description text by value.
166 std::string description )
168 {}
169
// Get a reference to the description of the failure.
172 const std::string &
173 description() const noexcept { return m_description; }
174
176
// Return type of giveout_description(): gets out the value of the
// description of the failure (returned by value).
182 std::string
184};
185
186namespace impl
187{
188
// Checks whether `c` is an ASCII hexadecimal digit: 0-9, a-f or A-F.
//
// Declared constexpr and noexcept to be consistent with the other
// character predicates of this file (the ordinary_char() functions and
// is_unreserved_char()): it is a pure comparison that cannot throw.
constexpr inline bool
is_hexdigit( char c ) noexcept
{
	return
		( '0' <= c && c <= '9' ) ||
		( 'a' <= c && c <= 'f' ) ||
		( 'A' <= c && c <= 'F' );
}
197
// Builds one byte from two hexadecimal digits of an escape sequence:
// `c1` is the high nibble and `c2` is the low nibble.
//
// Both arguments must be valid hexadecimal digits (0-9, a-f, A-F);
// no validation is performed here, the result for any other input is
// meaningless.
inline char
extract_escaped_char( char c1, char c2 ) noexcept
{
	// Numeric value of a single hex digit. The same decoding is applied to
	// both digits, so it lives in one place.
	const auto nibble = []( char digit ) noexcept -> unsigned int {
		if( '0' <= digit && digit <= '9' )
			return static_cast<unsigned int>( digit - '0' );

		// Setting bit 0x20 folds an ASCII upper-case letter to lower case,
		// so 'A'-'F' and 'a'-'f' are handled by the same expression.
		return 10u + static_cast<unsigned int>( ( digit | 0x20 ) - 'a' );
	};

	return static_cast<char>(
			static_cast<unsigned char>(
					( nibble( c1 ) << 4 ) | nibble( c2 ) ) );
}
223
224//
225// do_unescape_percent_encoding
226//
// The actual implementation of unescape-percent-encoding procedure.
//
// Walks through `data` and passes every resulting char to `collector`
// (one call per output char):
//  - "%XX" (with two hex digits) is decoded into one byte;
//  - '+' is turned into a space;
//  - a char accepted by Traits::ordinary_char() is passed as is;
//  - anything else is reported as a failure.
//
// Only the bytes decoded from "%XX" sequences are fed to the UTF-8
// checker; ordinary chars and '+' bypass it.
//
// Failures are reported via make_unexpected() with an
// unescape_percent_encoding_failure_t holding a description.
// NOTE(review): the function name line and the final (successful) return
// statement are not visible in this listing.
232template<
233 typename Traits,
234 typename Chars_Collector >
240 const string_view_t data,
241 Chars_Collector && collector )
242{
// Count of not yet processed chars and the pointer to the current one.
243 std::size_t chars_to_handle = data.size();
244 const char * d = data.data();
245
246 utf8_checker_t utf8_checker;
// Is set when a multi-byte UTF-8 sequence has been started by a decoded
// byte but isn't completed yet.
247 bool expect_next_utf8_byte = false;
248
// Offset of the current char from the beginning of data (for diagnostics).
249 const auto current_pos = [&d, &data]() noexcept { return d - data.data(); };
250
251 while( 0 < chars_to_handle )
252 {
253 char c = *d;
// The continuation of a started UTF-8 sequence can only be another
// percent-encoded byte.
254 if( expect_next_utf8_byte && '%' != c )
255 return make_unexpected( unescape_percent_encoding_failure_t{
256 fmt::format(
257 "next byte from UTF-8 sequence expected at {}",
258 current_pos() )
259 } );
260
261 if( '%' == c )
262 {
// A valid escape sequence is '%' followed by exactly two hex digits.
263 if( chars_to_handle >= 3 &&
264 is_hexdigit( d[ 1 ] ) &&
265 is_hexdigit( d[ 2 ] ) )
266 {
267 const auto ch = extract_escaped_char( d[ 1 ], d[ 2 ] );
268 if( !utf8_checker.process_byte( static_cast<std::uint8_t>(ch) ) )
269 return make_unexpected( unescape_percent_encoding_failure_t{
270 fmt::format( "invalid UTF-8 sequence detected at {}",
271 current_pos() )
272 } );
273
// The decoded byte goes to the output before the input position is
// advanced (it matters for in-place unescaping).
274 collector( ch );
275 chars_to_handle -= 3;
276 d += 3;
277
// The checker is reset as soon as a complete symbol has been seen.
278 expect_next_utf8_byte = !utf8_checker.finalized();
279 if( !expect_next_utf8_byte )
280 utf8_checker.reset();
281 }
282 else
283 {
284 return make_unexpected( unescape_percent_encoding_failure_t{
285 fmt::format(
286 "invalid escape sequence at pos {}", current_pos() )
287 } );
288 }
289 }
// '+' is handled before Traits is consulted: it always means a space.
290 else if( '+' == c )
291 {
292 collector( ' ' );
293 --chars_to_handle;
294 ++d;
295 }
296 else if( Traits::ordinary_char( c ) )
297 {
298 collector( c );
299 --chars_to_handle;
300 ++d;
301 }
302 else
303 {
// NOTE(review): `c` is passed to fmt as a plain `char` with an integer
// presentation type; how a byte >= 0x80 is rendered presumably depends
// on char signedness and the fmt version -- confirm.
304 return make_unexpected( unescape_percent_encoding_failure_t{
305 fmt::format(
306 "invalid non-escaped char with code {:#02X} at pos: {}",
307 c,
308 current_pos() )
309 } );
310 }
311 }
312
// The input ended in the middle of a multi-byte UTF-8 sequence.
313 if( expect_next_utf8_byte )
314 return make_unexpected( unescape_percent_encoding_failure_t{
315 fmt::format( "unfinished UTF-8 sequence" )
316 } );
317
319}
320
321} /* namespace impl */
322
325template< typename Traits = restinio_default_unescape_traits >
327std::string
329{
330 std::string result;
331 const auto escaped_chars_count = static_cast<std::size_t>(
332 std::count_if(
333 data.begin(),
334 data.end(),
335 []( auto c ){ return !Traits::ordinary_char(c); } ));
336
337 if( 0 == escaped_chars_count )
338 {
339 // No escaped chars.
340 result.assign( data.data(), data.size() );
341 }
342 else
343 {
344 // Having escaped chars.
345 result.reserve( data.size() + 2*escaped_chars_count );
346 for( auto c : data )
347 {
348 if( Traits::ordinary_char( c ) )
349 result += c;
350 else
351 {
352 result += fmt::format( "%{:02X}", c );
353 }
354 }
355 }
356
357 return result;
358}
359
// Unescapes a percent-encoded string and returns the result as
// a new std::string.
//
// Throws exception_t with the description of the failure if the input
// can't be unescaped.
// NOTE(review): the function name line is not visible in this listing;
// the body reads its input from a parameter named `data`.
360template< typename Traits = restinio_default_unescape_traits >
362std::string
364{
365 std::string result;
// The unescaped text is never longer than the escaped one.
366 result.reserve( data.size() );
367
368 auto r = impl::do_unescape_percent_encoding<Traits>(
369 data,
370 [&result]( char ch ) { result += ch; } );
371 if( !r )
372 throw exception_t{ r.error().giveout_description() };
373
374 return result;
375}
376
// Helper function for unescaping percent-encoded string.
//
// Non-throwing counterpart of the unescaping procedure: a failure is
// returned to the caller via make_unexpected() instead of being thrown.
// NOTE(review): the return type and the function name lines are not
// visible in this listing; the body reads its input from `data`.
389template< typename Traits = restinio_default_unescape_traits >
393{
394 std::string result;
// The unescaped text is never longer than the escaped one.
395 result.reserve( data.size() );
396
397 auto r = impl::do_unescape_percent_encoding<Traits>(
398 data,
399 [&result]( char ch ) { result += ch; } );
400 if( !r )
401 return make_unexpected( std::move(r.error()) );
402
// The local string is moved into the returned value.
403 return std::move(result);
404}
405
406template< typename Traits = restinio_default_unescape_traits >
408std::size_t
409inplace_unescape_percent_encoding( char * data, std::size_t size )
410{
411 std::size_t result_size = 0u;
412 char * dest = data;
413
414 auto r = impl::do_unescape_percent_encoding<Traits>(
415 string_view_t{ data, size },
416 [&result_size, &dest]( char ch ) {
417 *dest++ = ch;
418 ++result_size;
419 } );
420 if( !r )
421 throw exception_t{ r.error().giveout_description() };
422
423 return result_size;
424}
425
// Helper function for unescaping percent-encoded string inplace.
//
// The unescaped content is written into the same buffer starting from
// `data`; on success the count of written chars is returned. A failure
// is returned via make_unexpected() instead of being thrown; the buffer
// may already be partially overwritten in that case.
// NOTE(review): the return type lines are not visible in this listing.
438template< typename Traits = restinio_default_unescape_traits >
441try_inplace_unescape_percent_encoding( char * data, std::size_t size )
442{
443 std::size_t result_size = 0u;
// Write position; starts at the beginning of the source buffer.
444 char * dest = data;
445
446 auto r = impl::do_unescape_percent_encoding<Traits>(
447 string_view_t{ data, size },
448 [&result_size, &dest]( char ch ) {
449 *dest++ = ch;
450 ++result_size;
451 } );
452 if( !r )
453 return make_unexpected( std::move(r.error()) );
454
455 return result_size;
456}
457
459
460namespace uri_normalization
461{
462
463namespace unreserved_chars
464{
465
466namespace impl
467{
468
// Is this symbol a part of unreserved set?
// NOTE(review): the return statement is not visible in this listing;
// according to the comment below the check is presumably delegated to
// restinio_default_unescape_traits -- confirm.
477constexpr inline bool
478is_unreserved_char( const char ch ) noexcept
479{
480 // In this version of RESTinio class restinio_default_unescape_traits
481 // already implements necessary check.
483}
484
// Internal helper to perform the main logic of enumeration of symbols
// in URI.
//
// Walks through `what` and reports the pieces of the normalized value:
//  - a char that isn't '%' is passed to one_byte_handler as is;
//  - "%XX" that decodes to a complete symbol below 0x80 accepted by
//    is_unreserved_char() is passed to one_byte_handler as that char;
//  - any other "%XX" is passed to three_byte_handler unchanged.
//
// The decoded bytes are checked to form valid UTF-8 sequences.
// Throws exception_t on a malformed escape sequence or on an invalid or
// unfinished UTF-8 sequence.
// NOTE(review): the function name line is not visible in this listing.
499template<
500 typename One_Byte_Handler,
501 typename Three_Byte_Handler >
502void
504 string_view_t what,
505 One_Byte_Handler && one_byte_handler,
506 Three_Byte_Handler && three_byte_handler )
507{
// Makes is_hexdigit() and extract_escaped_char() accessible.
508 using namespace restinio::utils::impl;
509
// Count of not yet processed chars and the pointer to the current one.
510 std::size_t chars_to_handle = what.size();
511 const char * d = what.data();
512
513 utf8_checker_t utf8_checker;
// Is set when a multi-byte UTF-8 sequence has been started by a decoded
// byte but isn't completed yet.
514 bool expect_next_utf8_byte = false;
515
// Offset of the current char from the beginning of what (for diagnostics).
516 const auto current_pos = [&d, &what]() noexcept { return d - what.data(); };
517
518 while( 0 < chars_to_handle )
519 {
// The continuation of a started UTF-8 sequence can only be another
// percent-encoded byte.
520 if( expect_next_utf8_byte && '%' != *d )
521 throw exception_t{
522 fmt::format( "next byte from UTF-8 sequence expected at {}",
523 current_pos() )
524 };
525
526 if( '%' != *d )
527 {
528 // Just one symbol to the output.
529 one_byte_handler( *d );
530 ++d;
531 --chars_to_handle;
532 }
533 else if( chars_to_handle >= 3 &&
534 is_hexdigit( d[ 1 ] ) && is_hexdigit( d[ 2 ] ) )
535 {
536 const char ch = extract_escaped_char( d[ 1 ], d[ 2 ] );
537 if( !utf8_checker.process_byte( static_cast<std::uint8_t>(ch) ) )
538 throw exception_t{
539 fmt::format( "invalid UTF-8 sequence detected at {}",
540 current_pos() )
541 };
542
// By default the escape sequence is kept in its percent-encoded form.
543 bool keep_three_bytes = true;
544
545 if( utf8_checker.finalized() )
546 {
547 expect_next_utf8_byte = false;
548
549 const auto symbol = utf8_checker.current_symbol();
550 utf8_checker.reset();
551
// Only symbols below 0x80 are candidates for the replacement.
552 if( symbol < 0x80u )
553 {
554 const char ascii_char = static_cast<char>(symbol);
555 if( is_unreserved_char( ascii_char ) )
556 {
557 // percent encoded char will be replaced by one char.
558 one_byte_handler( ascii_char );
559 keep_three_bytes = false;
560 }
561 }
562 }
563 else
564 {
565 expect_next_utf8_byte = true;
566 }
567
568 if( keep_three_bytes )
569 {
570 // this part of multi-byte char will go to the output as is.
571 three_byte_handler( d[ 0 ], d[ 1 ], d[ 2 ] );
572 }
573
574 chars_to_handle -= 3;
575 d += 3u;
576 }
577 else
578 {
// '%' that isn't followed by two hex digits.
579 throw exception_t{
580 fmt::format( "invalid escape sequence at pos {}", current_pos() )
581 };
582 }
583 }
584
// The input ended in the middle of a multi-byte UTF-8 sequence.
585 if( expect_next_utf8_byte )
586 throw exception_t{ fmt::format( "unfinished UTF-8 sequence" ) };
587}
588
589} /* namespace impl */
590
// Calculate the size of a buffer to hold normalized value of a URI.
//
// Nothing is written anywhere: the first handler counts one char and
// the second handler counts three chars per invocation.
// NOTE(review): the function name line and the line with the call that
// receives both handlers are not visible in this listing; presumably
// the handlers are passed to impl::run_normalization_algo -- confirm.
604inline std::size_t
606 string_view_t what )
607{
608 std::size_t calculated_capacity = 0u;
609
// Handler for a piece that occupies one char.
611 [&calculated_capacity]( char ) noexcept {
612 ++calculated_capacity;
613 },
// Handler for a piece that occupies three chars.
614 [&calculated_capacity]( char, char, char ) noexcept {
615 calculated_capacity += 3u;
616 } );
617
618 return calculated_capacity;
619}
620
// Perform normalization of URI value.
//
// The normalized value is written sequentially starting from `dest`.
// No bounds are checked here, so the destination buffer has to be large
// enough for the whole result.
// NOTE(review): the function name line and the line with the call that
// receives both handlers are not visible in this listing; presumably
// the handlers are passed to impl::run_normalization_algo -- confirm.
639inline void
641 string_view_t what,
642 char * dest )
643{
// Handler for a piece that occupies one char.
645 [&dest]( char ch ) noexcept {
646 *dest++ = ch;
647 },
// Handler for a piece that occupies three chars.
648 [&dest]( char ch1, char ch2, char ch3 ) noexcept {
649 dest[ 0 ] = ch1;
650 dest[ 1 ] = ch2;
651 dest[ 2 ] = ch3;
652 dest += 3;
653 } );
654}
655
656} /* namespace unreserved_chars */
657
658} /* namespace uri_normalization */
659
660} /* namespace utils */
661
662} /* namespace restinio */
663
Exception class for all exceptions thrown by RESTinio.
Definition: exception.hpp:26
Type that indicates a failure of unescaping of percent-encoded symbols.
RESTINIO_NODISCARD const std::string & description() const noexcept
Get a reference to the description of the failure.
std::string m_description
Description of a failure.
RESTINIO_NODISCARD std::string giveout_description() noexcept
Get out the value of the description of the failure.
Helper class for checking UTF-8 byte sequence during parsing URI or incoming byte stream.
RESTINIO_NODISCARD bool finalized() const noexcept
RESTINIO_NODISCARD bool process_byte(std::uint8_t byte) noexcept
RESTINIO_NODISCARD std::uint32_t current_symbol() const noexcept
#define RESTINIO_NODISCARD
A special wrapper around fmtlib include files.
RESTINIO_NODISCARD auto symbol(char expected) noexcept
A factory function to create a clause that expects the specified symbol, extracts it and then skips i...
RESTINIO_NODISCARD expected_t< unescape_percent_encoding_success_t, unescape_percent_encoding_failure_t > do_unescape_percent_encoding(const string_view_t data, Chars_Collector &&collector)
The actual implementation of unescape-percent-encoding procedure.
char extract_escaped_char(char c1, char c2)
RESTINIO_NODISCARD constexpr bool is_unreserved_char(const char ch) noexcept
Is this symbol a part of unreserved set?
void run_normalization_algo(string_view_t what, One_Byte_Handler &&one_byte_handler, Three_Byte_Handler &&three_byte_handler)
Internal helper to perform the main logic of enumeration of symbols in URI.
void normalize_to(string_view_t what, char *dest)
Perform normalization of URI value.
RESTINIO_NODISCARD std::size_t estimate_required_capacity(string_view_t what)
Calculate the size of a buffer to hold normalized value of a URI.
RESTINIO_NODISCARD expected_t< std::string, unescape_percent_encoding_failure_t > try_unescape_percent_encoding(const string_view_t data)
Helper function for unescaping percent-encoded string.
RESTINIO_NODISCARD expected_t< std::size_t, unescape_percent_encoding_failure_t > try_inplace_unescape_percent_encoding(char *data, std::size_t size)
Helper function for unescaping percent-encoded string inplace.
RESTINIO_NODISCARD std::size_t inplace_unescape_percent_encoding(char *data, std::size_t size)
RESTINIO_NODISCARD std::string unescape_percent_encoding(const string_view_t data)
RESTINIO_NODISCARD std::string escape_percent_encoding(const string_view_t data)
Percent encoding.
nonstd::string_view string_view_t
Definition: string_view.hpp:19
nonstd::expected< T, E > expected_t
Definition: expected.hpp:22
STL namespace.
The traits for escaping and unescaping symbols in JavaScript-compatible mode.
static constexpr bool ordinary_char(char c) noexcept
Traits for escaping and unescaping symbols in a query string in very relaxed mode.
static bool ordinary_char(char c) noexcept
The default traits for escaping and unescaping symbols in a query string.
static constexpr bool ordinary_char(char c) noexcept
Type that indicates that unescaping of percent-encoded symbols completed successfully.
Traits for escaping and unescaping symbols in a query string in correspondence with application/x-www...
static constexpr bool ordinary_char(char c) noexcept
An implementation of checker for UTF-8 sequences.
#define const
Definition: zconf.h:230