Ada 3.3.0
Fast spec-compliant URL parser
Loading...
Searching...
No Matches
helpers.cpp
Go to the documentation of this file.
1#include "ada/checkers-inl.h"
2#include "ada/common_defs.h"
3#include "ada/scheme.h"
4
5#include <cstring>
6#include <sstream>
7
8namespace ada::helpers {
9
/**
 * Writes `view` to `out` with the escaping needed inside a JSON string.
 *
 * - A backslash becomes `\\` and a double quote becomes `\"`.
 * - Bytes in the range 0x00..0x1f become `\u00XX` with lowercase hex digits.
 * - Every other byte is copied through unchanged.
 *
 * @param view the bytes to escape.
 * @param out  an output iterator that receives the escaped bytes.
 */
template <typename out_iter>
void encode_json(std::string_view view, out_iter out) {
  constexpr char hex_digits[] = "0123456789abcdef";
  for (const char raw : view) {
    const uint8_t byte = static_cast<uint8_t>(raw);
    if (byte == '\\' || byte == '"') {
      // Both characters are escaped by prefixing a backslash.
      *out++ = '\\';
      *out++ = raw;
    } else if (byte > 0x1f) {
      // Not a control byte: emitted as-is.
      *out++ = byte;
    } else {
      // Control byte: the code point is below 0x20, so the first two hex
      // digits of the \u escape are always zero.
      *out++ = '\\';
      *out++ = 'u';
      *out++ = '0';
      *out++ = '0';
      *out++ = hex_digits[byte >> 4];
      *out++ = hex_digits[byte & 0x0f];
    }
  }
}
34
36 switch (s) {
38 return "Authority";
40 return "Scheme Start";
42 return "Scheme";
44 return "Host";
46 return "No Scheme";
48 return "Fragment";
50 return "Relative Scheme";
52 return "Relative Slash";
54 return "File";
56 return "File Host";
58 return "File Slash";
60 return "Path or Authority";
62 return "Special Authority Ignore Slashes";
64 return "Special Authority Slashes";
66 return "Special Relative or Authority";
68 return "Query";
70 return "Path";
72 return "Path Start";
74 return "Opaque Path";
76 return "Port";
77 default:
78 return "unknown state";
79 }
80}
81
/**
 * Splits `input` at its first '#'.
 *
 * When a '#' is present, `input` is shortened to the part before it and the
 * part after it (possibly empty) is returned. When there is no '#', `input`
 * is left untouched and std::nullopt is returned.
 */
ada_really_inline std::optional<std::string_view> prune_hash(
    std::string_view& input) noexcept {
  // A single character search; the original author notes this boils down to
  // a call to memchr and should be quite fast.
  const size_t hash_position = input.find('#');
  if (hash_position == std::string_view::npos) {
    return std::nullopt;
  }
  // hash_position + 1 <= input.size(), so neither substr call can throw.
  const std::string_view fragment = input.substr(hash_position + 1);
  input = input.substr(0, hash_position);
  return fragment;
}
95
96ada_really_inline bool shorten_path(std::string& path,
97 ada::scheme::type type) noexcept {
98 // Let path be url's path.
99 // If url's scheme is "file", path's size is 1, and path[0] is a normalized
100 // Windows drive letter, then return.
101 if (type == ada::scheme::type::FILE &&
102 path.find('/', 1) == std::string_view::npos && !path.empty()) {
104 helpers::substring(path, 1))) {
105 return false;
106 }
107 }
108
109 // Remove path's last item, if any.
110 size_t last_delimiter = path.rfind('/');
111 if (last_delimiter != std::string::npos) {
112 path.erase(last_delimiter);
113 return true;
114 }
115
116 return false;
117}
118
119ada_really_inline bool shorten_path(std::string_view& path,
120 ada::scheme::type type) noexcept {
121 // Let path be url's path.
122 // If url's scheme is "file", path's size is 1, and path[0] is a normalized
123 // Windows drive letter, then return.
124 if (type == ada::scheme::type::FILE &&
125 path.find('/', 1) == std::string_view::npos && !path.empty()) {
127 helpers::substring(path, 1))) {
128 return false;
129 }
130 }
131
132 // Remove path's last item, if any.
133 if (!path.empty()) {
134 size_t slash_loc = path.rfind('/');
135 if (slash_loc != std::string_view::npos) {
136 path.remove_suffix(path.size() - slash_loc);
137 return true;
138 }
139 }
140
141 return false;
142}
143
144ada_really_inline void remove_ascii_tab_or_newline(
145 std::string& input) noexcept {
146 // if this ever becomes a performance issue, we could use an approach similar
147 // to has_tabs_or_newline
148 std::erase_if(input, ada::unicode::is_ascii_tab_or_newline);
149}
150
/**
 * Returns the suffix of `input` that starts at index `pos`.
 *
 * `pos` must not exceed input.size(); this is checked by ADA_ASSERT_TRUE.
 * Because that precondition is asserted, no additional clamping of `pos` is
 * performed here.
 */
ada_really_inline constexpr std::string_view substring(std::string_view input,
                                                       size_t pos) noexcept {
  ADA_ASSERT_TRUE(pos <= input.size());
  const std::string_view suffix = input.substr(pos);
  return suffix;
}
158
/**
 * Shrinks `input` so that only its first `pos` characters remain.
 *
 * `pos` must not exceed input.size(); this is checked by ADA_ASSERT_TRUE.
 */
ada_really_inline void resize(std::string_view& input, size_t pos) noexcept {
  ADA_ASSERT_TRUE(pos <= input.size());
  // Starting at offset 0 is always in range, so substr cannot throw.
  input = input.substr(0, pos);
}
163
// Computes the number of trailing zero bits of input_num, i.e. the index of
// its least significant set bit.
// This is a private inline function only defined in this source file.
//
// Precondition: input_num must be non-zero. Both underlying primitives leave
// the result undefined for an input of zero; the callers in this file only
// invoke it on masks they have already checked to be non-zero.
ada_really_inline int trailing_zeroes(uint32_t input_num) noexcept {
#ifdef ADA_REGULAR_VISUAL_STUDIO
  unsigned long index;
  // Search the mask data from least significant bit (LSB)
  // to the most significant bit (MSB) for a set bit (1).
  _BitScanForward(&index, input_num);
  return static_cast<int>(index);
#else   // ADA_REGULAR_VISUAL_STUDIO
  // Use the `unsigned int` builtin so the operand width matches the 32-bit
  // argument instead of widening it to `unsigned long`.
  static_assert(sizeof(unsigned int) >= sizeof(uint32_t),
                "__builtin_ctz must be able to hold a 32-bit operand");
  return __builtin_ctz(input_num);
#endif  // ADA_REGULAR_VISUAL_STUDIO
}
177
178// starting at index location, this finds the next location of a character
179// :, /, \\, ? or [. If none is found, view.size() is returned.
180// For use within get_host_delimiter_location.
181#if ADA_NEON
182// The ada_make_uint8x16_t macro is necessary because Visual Studio does not
183// support direct initialization of uint8x16_t. See
184// https://developercommunity.visualstudio.com/t/error-C2078:-too-many-initializers-whe/402911?q=backend+neon
#ifndef ada_make_uint8x16_t
// Builds a uint8x16_t from sixteen byte values by loading them from an array
// with vld1q_u8. The array is `static const` because the lane values are
// never modified after initialization.
#define ada_make_uint8x16_t(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, \
                            x13, x14, x15, x16)                                \
  ([=]() {                                                                     \
    static const uint8_t array[16] = {x1, x2,  x3,  x4,  x5,  x6,  x7,  x8,    \
                                      x9, x10, x11, x12, x13, x14, x15, x16};  \
    return vld1q_u8(array);                                                    \
  }())
#endif
194
196 std::string_view view, size_t location) noexcept {
197 // first check for short strings in which case we do it naively.
198 if (view.size() - location < 16) { // slow path
199 for (size_t i = location; i < view.size(); i++) {
200 if (view[i] == ':' || view[i] == '/' || view[i] == '\\' ||
201 view[i] == '?' || view[i] == '[') {
202 return i;
203 }
204 }
205 return size_t(view.size());
206 }
207 auto to_bitmask = [](uint8x16_t input) -> uint16_t {
208 uint8x16_t bit_mask =
209 ada_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x01,
210 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
211 uint8x16_t minput = vandq_u8(input, bit_mask);
212 uint8x16_t tmp = vpaddq_u8(minput, minput);
213 tmp = vpaddq_u8(tmp, tmp);
214 tmp = vpaddq_u8(tmp, tmp);
215 return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
216 };
217
218 // fast path for long strings (expected to be common)
219 size_t i = location;
220 uint8x16_t low_mask =
221 ada_make_uint8x16_t(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
222 0x00, 0x01, 0x04, 0x04, 0x00, 0x00, 0x03);
223 uint8x16_t high_mask =
224 ada_make_uint8x16_t(0x00, 0x00, 0x02, 0x01, 0x00, 0x04, 0x00, 0x00, 0x00,
225 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
226 uint8x16_t fmask = vmovq_n_u8(0xf);
227 uint8x16_t zero{0};
228 for (; i + 15 < view.size(); i += 16) {
229 uint8x16_t word = vld1q_u8((const uint8_t*)view.data() + i);
230 uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
231 uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
232 uint8x16_t classify = vandq_u8(lowpart, highpart);
233 if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
234 uint8x16_t is_zero = vceqq_u8(classify, zero);
235 uint16_t is_non_zero = ~to_bitmask(is_zero);
236 return i + trailing_zeroes(is_non_zero);
237 }
238 }
239
240 if (i < view.size()) {
241 uint8x16_t word =
242 vld1q_u8((const uint8_t*)view.data() + view.length() - 16);
243 uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
244 uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
245 uint8x16_t classify = vandq_u8(lowpart, highpart);
246 if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
247 uint8x16_t is_zero = vceqq_u8(classify, zero);
248 uint16_t is_non_zero = ~to_bitmask(is_zero);
249 return view.length() - 16 + trailing_zeroes(is_non_zero);
250 }
251 }
252 return size_t(view.size());
253}
254#elif ADA_SSE2
256 std::string_view view, size_t location) noexcept {
257 // first check for short strings in which case we do it naively.
258 if (view.size() - location < 16) { // slow path
259 for (size_t i = location; i < view.size(); i++) {
260 if (view[i] == ':' || view[i] == '/' || view[i] == '\\' ||
261 view[i] == '?' || view[i] == '[') {
262 return i;
263 }
264 }
265 return size_t(view.size());
266 }
267 // fast path for long strings (expected to be common)
268 size_t i = location;
269 const __m128i mask1 = _mm_set1_epi8(':');
270 const __m128i mask2 = _mm_set1_epi8('/');
271 const __m128i mask3 = _mm_set1_epi8('\\');
272 const __m128i mask4 = _mm_set1_epi8('?');
273 const __m128i mask5 = _mm_set1_epi8('[');
274
275 for (; i + 15 < view.size(); i += 16) {
276 __m128i word = _mm_loadu_si128((const __m128i*)(view.data() + i));
277 __m128i m1 = _mm_cmpeq_epi8(word, mask1);
278 __m128i m2 = _mm_cmpeq_epi8(word, mask2);
279 __m128i m3 = _mm_cmpeq_epi8(word, mask3);
280 __m128i m4 = _mm_cmpeq_epi8(word, mask4);
281 __m128i m5 = _mm_cmpeq_epi8(word, mask5);
282 __m128i m = _mm_or_si128(
283 _mm_or_si128(_mm_or_si128(m1, m2), _mm_or_si128(m3, m4)), m5);
284 int mask = _mm_movemask_epi8(m);
285 if (mask != 0) {
286 return i + trailing_zeroes(mask);
287 }
288 }
289 if (i < view.size()) {
290 __m128i word =
291 _mm_loadu_si128((const __m128i*)(view.data() + view.length() - 16));
292 __m128i m1 = _mm_cmpeq_epi8(word, mask1);
293 __m128i m2 = _mm_cmpeq_epi8(word, mask2);
294 __m128i m3 = _mm_cmpeq_epi8(word, mask3);
295 __m128i m4 = _mm_cmpeq_epi8(word, mask4);
296 __m128i m5 = _mm_cmpeq_epi8(word, mask5);
297 __m128i m = _mm_or_si128(
298 _mm_or_si128(_mm_or_si128(m1, m2), _mm_or_si128(m3, m4)), m5);
299 int mask = _mm_movemask_epi8(m);
300 if (mask != 0) {
301 return view.length() - 16 + trailing_zeroes(mask);
302 }
303 }
304 return size_t(view.length());
305}
306#elif ADA_LSX
308 std::string_view view, size_t location) noexcept {
309 // first check for short strings in which case we do it naively.
310 if (view.size() - location < 16) { // slow path
311 for (size_t i = location; i < view.size(); i++) {
312 if (view[i] == ':' || view[i] == '/' || view[i] == '\\' ||
313 view[i] == '?' || view[i] == '[') {
314 return i;
315 }
316 }
317 return size_t(view.size());
318 }
319 // fast path for long strings (expected to be common)
320 size_t i = location;
321 const __m128i mask1 = __lsx_vrepli_b(':');
322 const __m128i mask2 = __lsx_vrepli_b('/');
323 const __m128i mask3 = __lsx_vrepli_b('\\');
324 const __m128i mask4 = __lsx_vrepli_b('?');
325 const __m128i mask5 = __lsx_vrepli_b('[');
326
327 for (; i + 15 < view.size(); i += 16) {
328 __m128i word = __lsx_vld((const __m128i*)(view.data() + i), 0);
329 __m128i m1 = __lsx_vseq_b(word, mask1);
330 __m128i m2 = __lsx_vseq_b(word, mask2);
331 __m128i m3 = __lsx_vseq_b(word, mask3);
332 __m128i m4 = __lsx_vseq_b(word, mask4);
333 __m128i m5 = __lsx_vseq_b(word, mask5);
334 __m128i m =
335 __lsx_vor_v(__lsx_vor_v(__lsx_vor_v(m1, m2), __lsx_vor_v(m3, m4)), m5);
336 int mask = __lsx_vpickve2gr_hu(__lsx_vmsknz_b(m), 0);
337 if (mask != 0) {
338 return i + trailing_zeroes(mask);
339 }
340 }
341 if (i < view.size()) {
342 __m128i word =
343 __lsx_vld((const __m128i*)(view.data() + view.length() - 16), 0);
344 __m128i m1 = __lsx_vseq_b(word, mask1);
345 __m128i m2 = __lsx_vseq_b(word, mask2);
346 __m128i m3 = __lsx_vseq_b(word, mask3);
347 __m128i m4 = __lsx_vseq_b(word, mask4);
348 __m128i m5 = __lsx_vseq_b(word, mask5);
349 __m128i m =
350 __lsx_vor_v(__lsx_vor_v(__lsx_vor_v(m1, m2), __lsx_vor_v(m3, m4)), m5);
351 int mask = __lsx_vpickve2gr_hu(__lsx_vmsknz_b(m), 0);
352 if (mask != 0) {
353 return view.length() - 16 + trailing_zeroes(mask);
354 }
355 }
356 return size_t(view.length());
357}
358#else
// Byte-indexed lookup table: the entry is 1 for the characters
// : / [ \\ ? and 0 for every other byte value.
// The variable is constexpr, so the initializer is evaluated at compile time.
static constexpr std::array<uint8_t, 256> special_host_delimiters = [] {
  std::array<uint8_t, 256> table{};
  constexpr char delimiters[] = {':', '/', '[', '\\', '?'};
  for (const char delimiter : delimiters) {
    table[static_cast<uint8_t>(delimiter)] = 1;
  }
  return table;
}();
368// credit: @the-moisrex recommended a table-based approach
370 std::string_view view, size_t location) noexcept {
371 auto const str = view.substr(location);
372 for (auto pos = str.begin(); pos != str.end(); ++pos) {
373 if (special_host_delimiters[(uint8_t)*pos]) {
374 return pos - str.begin() + location;
375 }
376 }
377 return size_t(view.size());
378}
379#endif
380
381// starting at index location, this finds the next location of a character
382// :, /, ? or [. If none is found, view.size() is returned.
383// For use within get_host_delimiter_location.
384#if ADA_NEON
385ada_really_inline size_t find_next_host_delimiter(std::string_view view,
386 size_t location) noexcept {
387 // first check for short strings in which case we do it naively.
388 if (view.size() - location < 16) { // slow path
389 for (size_t i = location; i < view.size(); i++) {
390 if (view[i] == ':' || view[i] == '/' || view[i] == '?' ||
391 view[i] == '[') {
392 return i;
393 }
394 }
395 return size_t(view.size());
396 }
397 auto to_bitmask = [](uint8x16_t input) -> uint16_t {
398 uint8x16_t bit_mask =
399 ada_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x01,
400 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
401 uint8x16_t minput = vandq_u8(input, bit_mask);
402 uint8x16_t tmp = vpaddq_u8(minput, minput);
403 tmp = vpaddq_u8(tmp, tmp);
404 tmp = vpaddq_u8(tmp, tmp);
405 return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
406 };
407
408 // fast path for long strings (expected to be common)
409 size_t i = location;
410 uint8x16_t low_mask =
411 ada_make_uint8x16_t(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
412 0x00, 0x01, 0x04, 0x00, 0x00, 0x00, 0x03);
413 uint8x16_t high_mask =
414 ada_make_uint8x16_t(0x00, 0x00, 0x02, 0x01, 0x00, 0x04, 0x00, 0x00, 0x00,
415 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
416 uint8x16_t fmask = vmovq_n_u8(0xf);
417 uint8x16_t zero{0};
418 for (; i + 15 < view.size(); i += 16) {
419 uint8x16_t word = vld1q_u8((const uint8_t*)view.data() + i);
420 uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
421 uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
422 uint8x16_t classify = vandq_u8(lowpart, highpart);
423 if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
424 uint8x16_t is_zero = vceqq_u8(classify, zero);
425 uint16_t is_non_zero = ~to_bitmask(is_zero);
426 return i + trailing_zeroes(is_non_zero);
427 }
428 }
429
430 if (i < view.size()) {
431 uint8x16_t word =
432 vld1q_u8((const uint8_t*)view.data() + view.length() - 16);
433 uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
434 uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
435 uint8x16_t classify = vandq_u8(lowpart, highpart);
436 if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
437 uint8x16_t is_zero = vceqq_u8(classify, zero);
438 uint16_t is_non_zero = ~to_bitmask(is_zero);
439 return view.length() - 16 + trailing_zeroes(is_non_zero);
440 }
441 }
442 return size_t(view.size());
443}
444#elif ADA_SSE2
445ada_really_inline size_t find_next_host_delimiter(std::string_view view,
446 size_t location) noexcept {
447 // first check for short strings in which case we do it naively.
448 if (view.size() - location < 16) { // slow path
449 for (size_t i = location; i < view.size(); i++) {
450 if (view[i] == ':' || view[i] == '/' || view[i] == '?' ||
451 view[i] == '[') {
452 return i;
453 }
454 }
455 return size_t(view.size());
456 }
457 // fast path for long strings (expected to be common)
458 size_t i = location;
459 const __m128i mask1 = _mm_set1_epi8(':');
460 const __m128i mask2 = _mm_set1_epi8('/');
461 const __m128i mask4 = _mm_set1_epi8('?');
462 const __m128i mask5 = _mm_set1_epi8('[');
463
464 for (; i + 15 < view.size(); i += 16) {
465 __m128i word = _mm_loadu_si128((const __m128i*)(view.data() + i));
466 __m128i m1 = _mm_cmpeq_epi8(word, mask1);
467 __m128i m2 = _mm_cmpeq_epi8(word, mask2);
468 __m128i m4 = _mm_cmpeq_epi8(word, mask4);
469 __m128i m5 = _mm_cmpeq_epi8(word, mask5);
470 __m128i m = _mm_or_si128(_mm_or_si128(m1, m2), _mm_or_si128(m4, m5));
471 int mask = _mm_movemask_epi8(m);
472 if (mask != 0) {
473 return i + trailing_zeroes(mask);
474 }
475 }
476 if (i < view.size()) {
477 __m128i word =
478 _mm_loadu_si128((const __m128i*)(view.data() + view.length() - 16));
479 __m128i m1 = _mm_cmpeq_epi8(word, mask1);
480 __m128i m2 = _mm_cmpeq_epi8(word, mask2);
481 __m128i m4 = _mm_cmpeq_epi8(word, mask4);
482 __m128i m5 = _mm_cmpeq_epi8(word, mask5);
483 __m128i m = _mm_or_si128(_mm_or_si128(m1, m2), _mm_or_si128(m4, m5));
484 int mask = _mm_movemask_epi8(m);
485 if (mask != 0) {
486 return view.length() - 16 + trailing_zeroes(mask);
487 }
488 }
489 return size_t(view.length());
490}
491#elif ADA_LSX
492ada_really_inline size_t find_next_host_delimiter(std::string_view view,
493 size_t location) noexcept {
494 // first check for short strings in which case we do it naively.
495 if (view.size() - location < 16) { // slow path
496 for (size_t i = location; i < view.size(); i++) {
497 if (view[i] == ':' || view[i] == '/' || view[i] == '?' ||
498 view[i] == '[') {
499 return i;
500 }
501 }
502 return size_t(view.size());
503 }
504 // fast path for long strings (expected to be common)
505 size_t i = location;
506 const __m128i mask1 = __lsx_vrepli_b(':');
507 const __m128i mask2 = __lsx_vrepli_b('/');
508 const __m128i mask4 = __lsx_vrepli_b('?');
509 const __m128i mask5 = __lsx_vrepli_b('[');
510
511 for (; i + 15 < view.size(); i += 16) {
512 __m128i word = __lsx_vld((const __m128i*)(view.data() + i), 0);
513 __m128i m1 = __lsx_vseq_b(word, mask1);
514 __m128i m2 = __lsx_vseq_b(word, mask2);
515 __m128i m4 = __lsx_vseq_b(word, mask4);
516 __m128i m5 = __lsx_vseq_b(word, mask5);
517 __m128i m = __lsx_vor_v(__lsx_vor_v(m1, m2), __lsx_vor_v(m4, m5));
518 int mask = __lsx_vpickve2gr_hu(__lsx_vmsknz_b(m), 0);
519 if (mask != 0) {
520 return i + trailing_zeroes(mask);
521 }
522 }
523 if (i < view.size()) {
524 __m128i word =
525 __lsx_vld((const __m128i*)(view.data() + view.length() - 16), 0);
526 __m128i m1 = __lsx_vseq_b(word, mask1);
527 __m128i m2 = __lsx_vseq_b(word, mask2);
528 __m128i m4 = __lsx_vseq_b(word, mask4);
529 __m128i m5 = __lsx_vseq_b(word, mask5);
530 __m128i m = __lsx_vor_v(__lsx_vor_v(m1, m2), __lsx_vor_v(m4, m5));
531 int mask = __lsx_vpickve2gr_hu(__lsx_vmsknz_b(m), 0);
532 if (mask != 0) {
533 return view.length() - 16 + trailing_zeroes(mask);
534 }
535 }
536 return size_t(view.length());
537}
538#else
// Byte-indexed lookup table: the entry is 1 for the characters
// : / [ ? and 0 for every other byte value.
// The variable is constexpr, so the initializer is evaluated at compile time.
static constexpr std::array<uint8_t, 256> host_delimiters = [] {
  std::array<uint8_t, 256> table{};
  constexpr char delimiters[] = {':', '/', '?', '['};
  for (const char delimiter : delimiters) {
    table[static_cast<uint8_t>(delimiter)] = 1;
  }
  return table;
}();

// Portable implementation. Starting at index `location`, returns the index of
// the next ':', '/', '?' or '[' in `view`, or view.size() when there is none.
// credit: @the-moisrex recommended a table-based approach
ada_really_inline size_t find_next_host_delimiter(std::string_view view,
                                                  size_t location) noexcept {
  const std::string_view remaining = view.substr(location);
  for (size_t offset = 0; offset < remaining.size(); ++offset) {
    if (host_delimiters[static_cast<uint8_t>(remaining[offset])]) {
      // Translate the offset within `remaining` back to an index in `view`.
      return offset + location;
    }
  }
  return size_t(view.size());
}
558#endif
559
560ada_really_inline std::pair<size_t, bool> get_host_delimiter_location(
561 const bool is_special, std::string_view& view) noexcept {
570 const size_t view_size = view.size();
571 size_t location = 0;
572 bool found_colon = false;
592 if (is_special) {
593 // We move to the next delimiter.
594 location = find_next_host_delimiter_special(view, location);
595 // Unless we find '[' then we are going only going to have to call
596 // find_next_host_delimiter_special once.
597 for (; location < view_size;
598 location = find_next_host_delimiter_special(view, location)) {
599 if (view[location] == '[') {
600 location = view.find(']', location);
601 if (location == std::string_view::npos) {
602 // performance: view.find might get translated to a memchr, which
603 // has no notion of std::string_view::npos, so the code does not
604 // reflect the assembly.
605 location = view_size;
606 break;
607 }
608 } else {
609 found_colon = view[location] == ':';
610 break;
611 }
612 }
613 } else {
614 // We move to the next delimiter.
615 location = find_next_host_delimiter(view, location);
616 // Unless we find '[' then we are going only going to have to call
617 // find_next_host_delimiter_special once.
618 for (; location < view_size;
619 location = find_next_host_delimiter(view, location)) {
620 if (view[location] == '[') {
621 location = view.find(']', location);
622 if (location == std::string_view::npos) {
623 // performance: view.find might get translated to a memchr, which
624 // has no notion of std::string_view::npos, so the code does not
625 // reflect the assembly.
626 location = view_size;
627 break;
628 }
629 } else {
630 found_colon = view[location] == ':';
631 break;
632 }
633 }
634 }
635 // performance: remove_suffix may translate into a single instruction.
636 view.remove_suffix(view_size - location);
637 return {location, found_colon};
638}
639
640void trim_c0_whitespace(std::string_view& input) noexcept {
641 while (!input.empty() &&
642 ada::unicode::is_c0_control_or_space(input.front())) {
643 input.remove_prefix(1);
644 }
645 while (!input.empty() && ada::unicode::is_c0_control_or_space(input.back())) {
646 input.remove_suffix(1);
647 }
648}
649
650ada_really_inline void parse_prepared_path(std::string_view input,
652 std::string& path) {
653 ada_log("parse_prepared_path ", input);
654 uint8_t accumulator = checkers::path_signature(input);
655 // Let us first detect a trivial case.
656 // If it is special, we check that we have no dot, no %, no \ and no
657 // character needing percent encoding. Otherwise, we check that we have no %,
658 // no dot, and no character needing percent encoding.
659 constexpr uint8_t need_encoding = 1;
660 constexpr uint8_t backslash_char = 2;
661 constexpr uint8_t dot_char = 4;
662 constexpr uint8_t percent_char = 8;
663 bool special = type != ada::scheme::NOT_SPECIAL;
664 bool may_need_slow_file_handling = (type == ada::scheme::type::FILE &&
666 bool trivial_path =
667 (special ? (accumulator == 0)
668 : ((accumulator & (need_encoding | dot_char | percent_char)) ==
669 0)) &&
670 (!may_need_slow_file_handling);
671 if (accumulator == dot_char && !may_need_slow_file_handling) {
672 // '4' means that we have at least one dot, but nothing that requires
673 // percent encoding or decoding. The only part that is not trivial is
674 // that we may have single dots and double dots path segments.
675 // If we have such segments, then we either have a path that begins
676 // with '.' (easy to check), or we have the sequence './'.
677 // Note: input cannot be empty, it must at least contain one character ('.')
678 // Note: we know that '\' is not present.
679 if (input[0] != '.') {
680 size_t slashdot = 0;
681 bool dot_is_file = true;
682 for (;;) {
683 slashdot = input.find("/.", slashdot);
684 if (slashdot == std::string_view::npos) { // common case
685 break;
686 } else { // uncommon
687 // only three cases matter: /./, /.. or a final /
688 slashdot += 2;
689 dot_is_file &= !(slashdot == input.size() || input[slashdot] == '.' ||
690 input[slashdot] == '/');
691 }
692 }
693 trivial_path = dot_is_file;
694 }
695 }
696 if (trivial_path) {
697 ada_log("parse_path trivial");
698 path += '/';
699 path += input;
700 return;
701 }
702 // We are going to need to look a bit at the path, but let us see if we can
703 // ignore percent encoding *and* backslashes *and* percent characters.
704 // Except for the trivial case, this is likely to capture 99% of paths out
705 // there.
706 bool fast_path =
707 (special &&
708 (accumulator & (need_encoding | backslash_char | percent_char)) == 0) &&
709 (type != ada::scheme::type::FILE);
710 if (fast_path) {
711 ada_log("parse_prepared_path fast");
712 // Here we don't need to worry about \ or percent encoding.
713 // We also do not have a file protocol. We might have dots, however,
714 // but dots must as appear as '.', and they cannot be encoded because
715 // the symbol '%' is not present.
716 size_t previous_location = 0; // We start at 0.
717 do {
718 size_t new_location = input.find('/', previous_location);
719 // std::string_view path_view = input;
720 // We process the last segment separately:
721 if (new_location == std::string_view::npos) {
722 std::string_view path_view = input.substr(previous_location);
723 if (path_view == "..") { // The path ends with ..
724 // e.g., if you receive ".." with an empty path, you go to "/".
725 if (path.empty()) {
726 path = '/';
727 return;
728 }
729 // Fast case where we have nothing to do:
730 if (path.back() == '/') {
731 return;
732 }
733 // If you have the path "/joe/myfriend",
734 // then you delete 'myfriend'.
735 path.resize(path.rfind('/') + 1);
736 return;
737 }
738 path += '/';
739 if (path_view != ".") {
740 path.append(path_view);
741 }
742 return;
743 } else {
744 // This is a non-final segment.
745 std::string_view path_view =
746 input.substr(previous_location, new_location - previous_location);
747 previous_location = new_location + 1;
748 if (path_view == "..") {
749 size_t last_delimiter = path.rfind('/');
750 if (last_delimiter != std::string::npos) {
751 path.erase(last_delimiter);
752 }
753 } else if (path_view != ".") {
754 path += '/';
755 path.append(path_view);
756 }
757 }
758 } while (true);
759 } else {
760 ada_log("parse_path slow");
761 // we have reached the general case
762 bool needs_percent_encoding = (accumulator & 1);
763 std::string path_buffer_tmp;
764 do {
765 size_t location = (special && (accumulator & 2))
766 ? input.find_first_of("/\\")
767 : input.find('/');
768 std::string_view path_view = input;
769 if (location != std::string_view::npos) {
770 path_view.remove_suffix(path_view.size() - location);
771 input.remove_prefix(location + 1);
772 }
773 // path_buffer is either path_view or it might point at a percent encoded
774 // temporary file.
775 std::string_view path_buffer =
776 (needs_percent_encoding &&
777 ada::unicode::percent_encode<false>(
778 path_view, character_sets::PATH_PERCENT_ENCODE, path_buffer_tmp))
779 ? path_buffer_tmp
780 : path_view;
781 if (unicode::is_double_dot_path_segment(path_buffer)) {
782 helpers::shorten_path(path, type);
783 if (location == std::string_view::npos) {
784 path += '/';
785 }
786 } else if (unicode::is_single_dot_path_segment(path_buffer) &&
787 (location == std::string_view::npos)) {
788 path += '/';
789 }
790 // Otherwise, if path_buffer is not a single-dot path segment, then:
791 else if (!unicode::is_single_dot_path_segment(path_buffer)) {
792 // If url's scheme is "file", url's path is empty, and path_buffer is a
793 // Windows drive letter, then replace the second code point in
794 // path_buffer with U+003A (:).
795 if (type == ada::scheme::type::FILE && path.empty() &&
797 path += '/';
798 path += path_buffer[0];
799 path += ':';
800 path_buffer.remove_prefix(2);
801 path.append(path_buffer);
802 } else {
803 // Append path_buffer to url's path.
804 path += '/';
805 path.append(path_buffer);
806 }
807 }
808 if (location == std::string_view::npos) {
809 return;
810 }
811 } while (true);
812 }
813}
814
/**
 * Reports whether the first byte of `input1` lies inside the buffer of
 * `input2`.
 *
 * Returns false when either argument is empty. Only the start of `input1` is
 * examined; its end is not checked against the bounds of `input2`.
 */
bool overlaps(std::string_view input1, const std::string& input2) noexcept {
  ada_log("helpers::overlaps check if string_view '", input1, "' [",
          input1.size(), " bytes] is part of string '", input2, "' [",
          input2.size(), " bytes]");
  if (input1.empty() || input2.empty()) {
    return false;
  }
  const char* const buffer_begin = input2.data();
  const char* const buffer_end = buffer_begin + input2.size();
  const char* const candidate = input1.data();
  return candidate >= buffer_begin && candidate < buffer_end;
}
822
/**
 * Drops trailing U+0020 spaces from the pathname of `url`.
 *
 * Nothing happens unless `url` has an opaque path and has neither a hash nor
 * a search component. Otherwise the pathname is copied, stripped of trailing
 * spaces and written back through url.update_base_pathname (the write-back
 * happens even when no space was removed).
 */
template <class url_type>
ada_really_inline void strip_trailing_spaces_from_opaque_path(
    url_type& url) noexcept {
  ada_log("helpers::strip_trailing_spaces_from_opaque_path");
  if (!url.has_opaque_path || url.has_hash() || url.has_search()) {
    return;
  }

  auto path = std::string(url.get_pathname());
  // find_last_not_of yields npos for an all-space (or empty) path; npos + 1
  // wraps to 0, which erases everything.
  path.erase(path.find_last_not_of(' ') + 1);
  url.update_base_pathname(path);
}
837
// Byte-indexed lookup table: the entry is 1 for the characters
// @ / \\ ? and 0 for every other byte value.
// The variable is constexpr, so the initializer is evaluated at compile time.
static constexpr std::array<uint8_t, 256> authority_delimiter_special = [] {
  std::array<uint8_t, 256> table{};
  constexpr char delimiters[] = {'@', '/', '\\', '?'};
  for (const char delimiter : delimiters) {
    table[static_cast<uint8_t>(delimiter)] = 1;
  }
  return table;
}();
847// credit: @the-moisrex recommended a table-based approach
849find_authority_delimiter_special(std::string_view view) noexcept {
850 // performance note: we might be able to gain further performance
851 // with SIMD intrinsics.
852 for (auto pos = view.begin(); pos != view.end(); ++pos) {
853 if (authority_delimiter_special[(uint8_t)*pos]) {
854 return pos - view.begin();
855 }
856 }
857 return size_t(view.size());
858}
859
// Byte-indexed lookup table: the entry is 1 for the characters
// @ / ? and 0 for every other byte value.
// The variable is constexpr, so the initializer is evaluated at compile time.
static constexpr std::array<uint8_t, 256> authority_delimiter = [] {
  std::array<uint8_t, 256> table{};
  constexpr char delimiters[] = {'@', '/', '?'};
  for (const char delimiter : delimiters) {
    table[static_cast<uint8_t>(delimiter)] = 1;
  }
  return table;
}();
868// credit: @the-moisrex recommended a table-based approach
870find_authority_delimiter(std::string_view view) noexcept {
871 // performance note: we might be able to gain further performance
872 // with SIMD intrinsics.
873 for (auto pos = view.begin(); pos != view.end(); ++pos) {
874 if (authority_delimiter[(uint8_t)*pos]) {
875 return pos - view.begin();
876 }
877 }
878 return size_t(view.size());
879}
880
881} // namespace ada::helpers
882
883namespace ada {
887#undef ada_make_uint8x16_t
888} // namespace ada
Definitions for URL specific checkers used within Ada.
Common definitions for cross-platform compiler support.
#define ADA_ASSERT_TRUE(COND)
#define ada_unused
Definition common_defs.h:84
#define ada_warn_unused
Definition common_defs.h:85
#define ada_really_inline
Definition common_defs.h:81
constexpr uint8_t PATH_PERCENT_ENCODE[32]
constexpr bool is_normalized_windows_drive_letter(std::string_view input) noexcept
constexpr bool is_windows_drive_letter(std::string_view input) noexcept
Includes the definitions for helper functions.
ada_really_inline size_t find_next_host_delimiter(std::string_view view, size_t location) noexcept
Definition helpers.cpp:548
static constexpr std::array< uint8_t, 256 > authority_delimiter_special
Definition helpers.cpp:839
static constexpr std::array< uint8_t, 256 > host_delimiters
Definition helpers.cpp:540
ada_really_inline size_t find_next_host_delimiter_special(std::string_view view, size_t location) noexcept
Definition helpers.cpp:369
ada_unused std::string get_state(ada::state s)
Definition helpers.cpp:35
static constexpr std::array< uint8_t, 256 > authority_delimiter
Definition helpers.cpp:861
static constexpr std::array< uint8_t, 256 > special_host_delimiters
Definition helpers.cpp:360
ada_really_inline int trailing_zeroes(uint32_t input_num) noexcept
Definition helpers.cpp:166
@ NOT_SPECIAL
Definition scheme.h:30
Definition ada_idna.h:13
state
Definition state.h:17
@ SPECIAL_RELATIVE_OR_AUTHORITY
Definition state.h:91
@ FILE_SLASH
Definition state.h:71
@ SCHEME
Definition state.h:31
@ QUERY
Definition state.h:96
@ SPECIAL_AUTHORITY_SLASHES
Definition state.h:86
@ FRAGMENT
Definition state.h:46
@ FILE_HOST
Definition state.h:66
@ OPAQUE_PATH
Definition state.h:111
@ RELATIVE_SLASH
Definition state.h:56
@ NO_SCHEME
Definition state.h:41
@ PATH_START
Definition state.h:106
@ RELATIVE_SCHEME
Definition state.h:51
@ SPECIAL_AUTHORITY_IGNORE_SLASHES
Definition state.h:81
@ SCHEME_START
Definition state.h:26
@ AUTHORITY
Definition state.h:21
@ PATH_OR_AUTHORITY
Definition state.h:76
ada_warn_unused std::string_view to_string(encoding_type type)
tl::expected< result_type, ada::errors > result
Declarations for the URL scheme.