ICU 78.2 78.2
Loading...
Searching...
No Matches
utfiterator.h
Go to the documentation of this file.
1// © 2024 and later: Unicode, Inc. and others.
2// License & terms of use: https://www.unicode.org/copyright.html
3
4// utfiterator.h
5// created: 2024aug12 Markus W. Scherer
6
7#ifndef __UTFITERATOR_H__
8#define __UTFITERATOR_H__
9
10#include "unicode/utypes.h"
11
12#if U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API || !defined(UTYPES_H)
13
14#include <iterator>
15#if defined(__cpp_lib_ranges)
16#include <ranges>
17#endif
18#include <string>
19#include <string_view>
20#include <type_traits>
21#include "unicode/utf16.h"
22#include "unicode/utf8.h"
23#include "unicode/uversion.h"
24
134
135#ifndef U_HIDE_DRAFT_API
136
169
170namespace U_HEADER_ONLY_NAMESPACE {
171
172namespace prv {
173#if U_CPLUSPLUS_VERSION >= 20
174
// C++20 variants of the internal iterator/range helpers: each one simply
// forwards to the corresponding standard library facility.

// Element (code unit) type of Iter, as reported by std::iter_value_t.
176template<typename Iter>
177using iter_value_t = typename std::iter_value_t<Iter>;
178
// Difference type of Iter, as reported by std::iter_difference_t.
180template<typename Iter>
181using iter_difference_t = std::iter_difference_t<Iter>;
182
// True if Iter satisfies the std::forward_iterator concept.
184template<typename Iter>
185constexpr bool forward_iterator = std::forward_iterator<Iter>;
186
// True if Iter satisfies the std::bidirectional_iterator concept.
188template<typename Iter>
189constexpr bool bidirectional_iterator = std::bidirectional_iterator<Iter>;
190
// True if Range satisfies the std::ranges::range concept.
192template<typename Range>
193constexpr bool range = std::ranges::range<Range>;
194
195#else
196
// Pre-C++20 variants of the internal iterator helpers, built on
// std::iterator_traits instead of the C++20 concepts.

// Element (code unit) type of Iter: std::iterator_traits<Iter>::value_type.
198template<typename Iter>
199using iter_value_t = typename std::iterator_traits<Iter>::value_type;
200
// Difference type of Iter: std::iterator_traits<Iter>::difference_type.
202template<typename Iter>
203using iter_difference_t = typename std::iterator_traits<Iter>::difference_type;
204
// True if Iter's iterator_category is std::forward_iterator_tag or a tag
// derived from it (tested with std::is_base_of_v).
206template<typename Iter>
207constexpr bool forward_iterator =
208 std::is_base_of_v<
209 std::forward_iterator_tag,
210 typename std::iterator_traits<Iter>::iterator_category>;
211
213template<typename Iter>
215 std::is_base_of_v<
216 std::bidirectional_iterator_tag,
217 typename std::iterator_traits<Iter>::iterator_category>;
218
// Primary template of the range detection trait: unless a more specialized
// version matches, a type is reported as "not a range" (std::false_type).
// The second, defaulted template parameter exists so that specializations
// can be selected via SFINAE.
220template<typename Range, typename = void>
221struct range_type : std::false_type {};
222
224template<typename Range>
226 Range,
227 std::void_t<decltype(std::declval<Range>().begin()),
228 decltype(std::declval<Range>().end())>> : std::true_type {};
229
231template<typename Range>
233
234#endif
235
// Type trait: is T an instantiation of std::basic_string_view?
// Primary template: any other type yields std::false_type.
237template <typename T> struct is_basic_string_view : std::false_type {};
238
// Specialization: matches std::basic_string_view with any template
// arguments and yields std::true_type.
240template <typename... Args>
241struct is_basic_string_view<std::basic_string_view<Args...>> : std::true_type {};
242
// Convenience variable template for is_basic_string_view<T>::value.
244template <typename T> constexpr bool is_basic_string_view_v = is_basic_string_view<T>::value;
245
247template<typename CP32, bool skipSurrogates>
249 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
250public:
252 using value_type = CP32;
256 using pointer = CP32 *;
258 using difference_type = int32_t;
260 using iterator_category = std::forward_iterator_tag;
261
263 inline CodePointsIterator(CP32 c) : c_(c) {}
265 inline bool operator==(const CodePointsIterator &other) const { return c_ == other.c_; }
267 inline bool operator!=(const CodePointsIterator &other) const { return !operator==(other); }
269 inline CP32 operator*() const { return c_; }
271 inline CodePointsIterator &operator++() { // pre-increment
272 ++c_;
273 if (skipSurrogates && c_ == 0xd800) {
274 c_ = 0xe000;
275 }
276 return *this;
277 }
278
279 inline CodePointsIterator operator++(int) { // post-increment
280 CodePointsIterator result(*this);
281 ++(*this);
282 return result;
283 }
284
285private:
286 CP32 c_;
287};
288
289} // namespace prv
290
301template<typename CP32>
303 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
304public:
312 auto begin() const { return prv::CodePointsIterator<CP32, false>(0); }
317 auto end() const { return prv::CodePointsIterator<CP32, false>(0x110000); }
318};
319
332template<typename CP32>
334 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
335public:
343 auto begin() const { return prv::CodePointsIterator<CP32, true>(0); }
348 auto end() const { return prv::CodePointsIterator<CP32, true>(0x110000); }
349};
350
366template<typename CP32, typename UnitIter, typename = void>
368 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
369 using Unit = typename prv::iter_value_t<UnitIter>;
370public:
372 UnsafeCodeUnits(CP32 codePoint, uint8_t length, UnitIter start, UnitIter limit) :
373 c_(codePoint), len_(length), start_(start), limit_(limit) {}
374
376 UnsafeCodeUnits(const UnsafeCodeUnits &other) = default;
378 UnsafeCodeUnits &operator=(const UnsafeCodeUnits &other) = default;
379
387 CP32 codePoint() const { return c_; }
388
394 UnitIter begin() const { return start_; }
395
401 UnitIter end() const { return limit_; }
402
407 uint8_t length() const { return len_; }
408
409#if U_CPLUSPLUS_VERSION >= 20
415 template<std::contiguous_iterator Iter = UnitIter>
416 std::basic_string_view<Unit> stringView() const {
417 return std::basic_string_view<Unit>(begin(), end());
418 }
419#else
425 template<typename Iter = UnitIter, typename Unit = typename std::iterator_traits<Iter>::value_type>
426 std::enable_if_t<std::is_pointer_v<Iter> ||
427 std::is_same_v<Iter, typename std::basic_string<Unit>::iterator> ||
428 std::is_same_v<Iter, typename std::basic_string<Unit>::const_iterator> ||
429 std::is_same_v<Iter, typename std::basic_string_view<Unit>::iterator> ||
430 std::is_same_v<Iter, typename std::basic_string_view<Unit>::const_iterator>,
431 std::basic_string_view<Unit>>
432 stringView() const {
433 return std::basic_string_view<Unit>(&*start_, len_);
434 }
435#endif
436
437private:
438 // Order of fields with padding and access frequency in mind.
439 CP32 c_;
440 uint8_t len_;
441 UnitIter start_;
442 UnitIter limit_;
443};
444
445#ifndef U_IN_DOXYGEN
446// Partial template specialization for single-pass input iterator.
447// No UnitIter field, no getter for it, no stringView().
// Selected when prv::forward_iterator<UnitIter> is false.
// The object then carries only the code point and its length in code units.
448template<typename CP32, typename UnitIter>
449class UnsafeCodeUnits<
450 CP32,
451 UnitIter,
452 std::enable_if_t<!prv::forward_iterator<UnitIter>>> {
453 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
454public:
 // Stores the given code point and length; no iterators are taken or kept.
455 UnsafeCodeUnits(CP32 codePoint, uint8_t length) : c_(codePoint), len_(length) {}
456
 // Memberwise copy construction and copy assignment.
457 UnsafeCodeUnits(const UnsafeCodeUnits &other) = default;
458 UnsafeCodeUnits &operator=(const UnsafeCodeUnits &other) = default;
459
 // Returns the code point passed to the constructor.
460 CP32 codePoint() const { return c_; }
461
 // Returns the length passed to the constructor.
462 uint8_t length() const { return len_; }
463
464private:
465 // Order of fields with padding and access frequency in mind.
466 CP32 c_;
467 uint8_t len_;
468};
469#endif // U_IN_DOXYGEN
470
// Result of reading one code point from a code unit sequence with validation.
// Extends UnsafeCodeUnits<CP32, UnitIter> (code point, length, and the
// start/limit iterators passed through to the base class constructor)
// with a flag that records whether the sequence was well-formed.
// The third, defaulted template parameter lets partial specializations be
// selected via std::enable_if_t.
486template<typename CP32, typename UnitIter, typename = void>
487class CodeUnits : public UnsafeCodeUnits<CP32, UnitIter> {
488public:
 // Forwards codePoint, length, start and limit to the base class and
 // stores the wellFormed flag.
490 CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed, UnitIter start, UnitIter limit) :
491 UnsafeCodeUnits<CP32, UnitIter>(codePoint, length, start, limit), ok_(wellFormed) {}
492
 // Memberwise copy construction and copy assignment.
494 CodeUnits(const CodeUnits &other) = default;
496 CodeUnits &operator=(const CodeUnits &other) = default;
497
 // Returns the wellFormed flag that was passed to the constructor.
502 bool wellFormed() const { return ok_; }
503
504private:
505 bool ok_;
506};
507
508#ifndef U_IN_DOXYGEN
509// Partial template specialization for single-pass input iterator.
510// No UnitIter field, no getter for it, no stringView().
// Selected when prv::forward_iterator<UnitIter> is false.
// Adds the well-formedness flag to the UnsafeCodeUnits<CP32, UnitIter> base,
// which is constructed here from the code point and length only.
511template<typename CP32, typename UnitIter>
512class CodeUnits<
513 CP32,
514 UnitIter,
515 std::enable_if_t<!prv::forward_iterator<UnitIter>>> :
516 public UnsafeCodeUnits<CP32, UnitIter> {
517public:
 // Forwards codePoint and length to the base class and stores wellFormed.
518 CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed) :
519 UnsafeCodeUnits<CP32, UnitIter>(codePoint, length), ok_(wellFormed) {}
520
 // Memberwise copy construction and copy assignment.
521 CodeUnits(const CodeUnits &other) = default;
522 CodeUnits &operator=(const CodeUnits &other) = default;
523
 // Returns the wellFormed flag that was passed to the constructor.
524 bool wellFormed() const { return ok_; }
525
526private:
527 bool ok_;
528};
529#endif // U_IN_DOXYGEN
530
531// Validating implementations ---------------------------------------------- ***
532
533#ifndef U_IN_DOXYGEN
// Declaration of the validating implementation class template.
// Only declared here, not defined: usable definitions come from partial
// specializations, chosen through the last (defaulted, SFINAE) parameter.
// LimitIter defaults to UnitIter, so the limit may be a different
// (sentinel) type than the code unit iterator.
534template<typename CP32, UTFIllFormedBehavior behavior,
535 typename UnitIter, typename LimitIter = UnitIter, typename = void>
536class UTFImpl;
537
538// Note: readAndInc() functions take both a p0 and a p iterator.
539// They must have the same value.
540// For a multi-pass UnitIter, the caller must copy its p into a local variable p0,
541// and readAndInc() copies p0 and the incremented p into the CodeUnits.
542// For a single-pass UnitIter, which may be neither default-constructible nor copyable,
543// the caller can pass p into both references, and readAndInc() does not use p0
544// and constructs CodeUnits without them.
545// Moving the p0 variable into the call site avoids having to declare it inside readAndInc()
546// which may not be possible for a single-pass iterator.
547
548// UTF-8
549template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
550class UTFImpl<
551 CP32, behavior,
552 UnitIter, LimitIter,
553 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 1>> {
554 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
555 static_assert(behavior != UTF_BEHAVIOR_SURROGATE,
556 "For 8-bit strings, the SURROGATE option does not have an equivalent.");
557public:
558 // Handle ill-formed UTF-8
559 U_FORCE_INLINE static CP32 sub() {
560 switch (behavior) {
562 case UTF_BEHAVIOR_FFFD: return 0xfffd;
563 }
564 }
565
566 U_FORCE_INLINE static void inc(UnitIter &p, const LimitIter &limit) {
567 // Very similar to U8_FWD_1().
568 uint8_t b = *p;
569 ++p;
570 if (U8_IS_LEAD(b) && p != limit) {
571 uint8_t t1 = *p;
572 if ((0xe0 <= b && b < 0xf0)) {
573 if (U8_IS_VALID_LEAD3_AND_T1(b, t1) &&
574 ++p != limit && U8_IS_TRAIL(*p)) {
575 ++p;
576 }
577 } else if (b < 0xe0) {
578 if (U8_IS_TRAIL(t1)) {
579 ++p;
580 }
581 } else /* b >= 0xf0 */ {
582 if (U8_IS_VALID_LEAD4_AND_T1(b, t1) &&
583 ++p != limit && U8_IS_TRAIL(*p) &&
584 ++p != limit && U8_IS_TRAIL(*p)) {
585 ++p;
586 }
587 }
588 }
589 }
590
591 U_FORCE_INLINE static void dec(UnitIter start, UnitIter &p) {
592 // Very similar to U8_BACK_1().
593 uint8_t c = *--p;
594 if (U8_IS_TRAIL(c) && p != start) {
595 UnitIter p1 = p;
596 uint8_t b1 = *--p1;
597 if (U8_IS_LEAD(b1)) {
598 if (b1 < 0xe0 ||
599 (b1 < 0xf0 ?
601 U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
602 p = p1;
603 return;
604 }
605 } else if (U8_IS_TRAIL(b1) && p1 != start) {
606 uint8_t b2 = *--p1;
607 if (0xe0 <= b2 && b2 <= 0xf4) {
608 if (b2 < 0xf0 ?
610 U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
611 p = p1;
612 return;
613 }
614 } else if (U8_IS_TRAIL(b2) && p1 != start) {
615 uint8_t b3 = *--p1;
616 if (0xf0 <= b3 && b3 <= 0xf4 && U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
617 p = p1;
618 return;
619 }
620 }
621 }
622 }
623 }
624
625 U_FORCE_INLINE static CodeUnits<CP32, UnitIter> readAndInc(
626 UnitIter &p0, UnitIter &p, const LimitIter &limit) {
627 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
628 // Very similar to U8_NEXT_OR_FFFD().
629 CP32 c = uint8_t(*p);
630 ++p;
631 if (U8_IS_SINGLE(c)) {
632 if constexpr (isMultiPass) {
633 return {c, 1, true, p0, p};
634 } else {
635 return {c, 1, true};
636 }
637 }
638 uint8_t length = 1;
639 uint8_t t = 0;
640 if (p != limit &&
641 // fetch/validate/assemble all but last trail byte
642 (c >= 0xe0 ?
643 (c < 0xf0 ? // U+0800..U+FFFF except surrogates
644 U8_LEAD3_T1_BITS[c &= 0xf] & (1 << ((t = *p) >> 5)) &&
645 (t &= 0x3f, 1)
646 : // U+10000..U+10FFFF
647 (c -= 0xf0) <= 4 &&
648 U8_LEAD4_T1_BITS[(t = *p) >> 4] & (1 << c) &&
649 (c = (c << 6) | (t & 0x3f), ++length, ++p != limit) &&
650 (t = *p - 0x80) <= 0x3f) &&
651 // valid second-to-last trail byte
652 (c = (c << 6) | t, ++length, ++p != limit)
653 : // U+0080..U+07FF
654 c >= 0xc2 && (c &= 0x1f, 1)) &&
655 // last trail byte
656 (t = *p - 0x80) <= 0x3f) {
657 c = (c << 6) | t;
658 ++length;
659 ++p;
660 if constexpr (isMultiPass) {
661 return {c, length, true, p0, p};
662 } else {
663 return {c, length, true};
664 }
665 }
666 if constexpr (isMultiPass) {
667 return {sub(), length, false, p0, p};
668 } else {
669 return {sub(), length, false};
670 }
671 }
672
673 U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter start, UnitIter &p) {
674 // Very similar to U8_PREV_OR_FFFD().
675 UnitIter p0 = p;
676 CP32 c = uint8_t(*--p);
677 if (U8_IS_SINGLE(c)) {
678 return {c, 1, true, p, p0};
679 }
680 if (U8_IS_TRAIL(c) && p != start) {
681 UnitIter p1 = p;
682 uint8_t b1 = *--p1;
683 if (U8_IS_LEAD(b1)) {
684 if (b1 < 0xe0) {
685 p = p1;
686 c = ((b1 - 0xc0) << 6) | (c & 0x3f);
687 return {c, 2, true, p, p0};
688 } else if (b1 < 0xf0 ?
691 // Truncated 3- or 4-byte sequence.
692 p = p1;
693 return {sub(), 2, false, p, p0};
694 }
695 } else if (U8_IS_TRAIL(b1) && p1 != start) {
696 // Extract the value bits from the last trail byte.
697 c &= 0x3f;
698 uint8_t b2 = *--p1;
699 if (0xe0 <= b2 && b2 <= 0xf4) {
700 if (b2 < 0xf0) {
701 b2 &= 0xf;
702 if (U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
703 p = p1;
704 c = (b2 << 12) | ((b1 & 0x3f) << 6) | c;
705 return {c, 3, true, p, p0};
706 }
707 } else if (U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
708 // Truncated 4-byte sequence.
709 p = p1;
710 return {sub(), 3, false, p, p0};
711 }
712 } else if (U8_IS_TRAIL(b2) && p1 != start) {
713 uint8_t b3 = *--p1;
714 if (0xf0 <= b3 && b3 <= 0xf4) {
715 b3 &= 7;
716 if (U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
717 p = p1;
718 c = (b3 << 18) | ((b2 & 0x3f) << 12) | ((b1 & 0x3f) << 6) | c;
719 return {c, 4, true, p, p0};
720 }
721 }
722 }
723 }
724 }
725 return {sub(), 1, false, p, p0};
726 }
727};
728
729// UTF-16
730template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
731class UTFImpl<
732 CP32, behavior,
733 UnitIter, LimitIter,
734 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 2>> {
735 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
736public:
737 // Handle ill-formed UTF-16: One unpaired surrogate.
738 U_FORCE_INLINE static CP32 sub(CP32 surrogate) {
739 switch (behavior) {
741 case UTF_BEHAVIOR_FFFD: return 0xfffd;
742 case UTF_BEHAVIOR_SURROGATE: return surrogate;
743 }
744 }
745
746 U_FORCE_INLINE static void inc(UnitIter &p, const LimitIter &limit) {
747 // Very similar to U16_FWD_1().
748 auto c = *p;
749 ++p;
750 if (U16_IS_LEAD(c) && p != limit && U16_IS_TRAIL(*p)) {
751 ++p;
752 }
753 }
754
755 U_FORCE_INLINE static void dec(UnitIter start, UnitIter &p) {
756 // Very similar to U16_BACK_1().
757 UnitIter p1;
758 if (U16_IS_TRAIL(*--p) && p != start && (p1 = p, U16_IS_LEAD(*--p1))) {
759 p = p1;
760 }
761 }
762
763 U_FORCE_INLINE static CodeUnits<CP32, UnitIter> readAndInc(
764 UnitIter &p0, UnitIter &p, const LimitIter &limit) {
765 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
766 // Very similar to U16_NEXT_OR_FFFD().
767 CP32 c = static_cast<CP32>(*p);
768 ++p;
769 if (!U16_IS_SURROGATE(c)) {
770 if constexpr (isMultiPass) {
771 return {c, 1, true, p0, p};
772 } else {
773 return {c, 1, true};
774 }
775 } else {
776 uint16_t c2;
777 if (U16_IS_SURROGATE_LEAD(c) && p != limit && U16_IS_TRAIL(c2 = *p)) {
778 ++p;
779 c = U16_GET_SUPPLEMENTARY(c, c2);
780 if constexpr (isMultiPass) {
781 return {c, 2, true, p0, p};
782 } else {
783 return {c, 2, true};
784 }
785 } else {
786 if constexpr (isMultiPass) {
787 return {sub(c), 1, false, p0, p};
788 } else {
789 return {sub(c), 1, false};
790 }
791 }
792 }
793 }
794
795 U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter start, UnitIter &p) {
796 // Very similar to U16_PREV_OR_FFFD().
797 UnitIter p0 = p;
798 CP32 c = static_cast<CP32>(*--p);
799 if (!U16_IS_SURROGATE(c)) {
800 return {c, 1, true, p, p0};
801 } else {
802 UnitIter p1;
803 uint16_t c2;
804 if (U16_IS_SURROGATE_TRAIL(c) && p != start && (p1 = p, U16_IS_LEAD(c2 = *--p1))) {
805 p = p1;
806 c = U16_GET_SUPPLEMENTARY(c2, c);
807 return {c, 2, true, p, p0};
808 } else {
809 return {sub(c), 1, false, p, p0};
810 }
811 }
812 }
813};
814
815// UTF-32: trivial, but still validating
816template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
817class UTFImpl<
818 CP32, behavior,
819 UnitIter, LimitIter,
820 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 4>> {
821 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
822public:
823 // Handle ill-formed UTF-32
824 U_FORCE_INLINE static CP32 sub(bool forSurrogate, CP32 surrogate) {
825 switch (behavior) {
827 case UTF_BEHAVIOR_FFFD: return 0xfffd;
828 case UTF_BEHAVIOR_SURROGATE: return forSurrogate ? surrogate : 0xfffd;
829 }
830 }
831
832 U_FORCE_INLINE static void inc(UnitIter &p, const LimitIter &/*limit*/) {
833 ++p;
834 }
835
836 U_FORCE_INLINE static void dec(UnitIter /*start*/, UnitIter &p) {
837 --p;
838 }
839
840 U_FORCE_INLINE static CodeUnits<CP32, UnitIter> readAndInc(
841 UnitIter &p0, UnitIter &p, const LimitIter &/*limit*/) {
842 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
843 uint32_t uc = *p;
844 CP32 c = uc;
845 ++p;
846 if (uc < 0xd800 || (0xe000 <= uc && uc <= 0x10ffff)) {
847 if constexpr (isMultiPass) {
848 return {c, 1, true, p0, p};
849 } else {
850 return {c, 1, true};
851 }
852 } else {
853 if constexpr (isMultiPass) {
854 return {sub(uc < 0xe000, c), 1, false, p0, p};
855 } else {
856 return {sub(uc < 0xe000, c), 1, false};
857 }
858 }
859 }
860
861 U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter /*start*/, UnitIter &p) {
862 UnitIter p0 = p;
863 uint32_t uc = *--p;
864 CP32 c = uc;
865 if (uc < 0xd800 || (0xe000 <= uc && uc <= 0x10ffff)) {
866 return {c, 1, true, p, p0};
867 } else {
868 return {sub(uc < 0xe000, c), 1, false, p, p0};
869 }
870 }
871};
872
873// Non-validating implementations ------------------------------------------ ***
874
// Declaration of the non-validating implementation class template.
// Only declared here, not defined: usable definitions come from partial
// specializations, chosen through the last (defaulted, SFINAE) parameter.
875template<typename CP32, typename UnitIter, typename = void>
876class UnsafeUTFImpl;
877
878// UTF-8
// Non-validating implementation for iterators whose value type is one byte
// wide. None of these functions takes a start or limit: they read as many
// code units as the byte values call for, without any bounds or
// well-formedness checks.
879template<typename CP32, typename UnitIter>
880class UnsafeUTFImpl<
881 CP32,
882 UnitIter,
883 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 1>> {
884 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
885public:
 // Moves p forward past one sequence: one unit for the first byte plus
 // the count that U8_COUNT_TRAIL_BYTES_UNSAFE() yields for that byte.
886 U_FORCE_INLINE static void inc(UnitIter &p) {
887 // Very similar to U8_FWD_1_UNSAFE().
888 uint8_t b = *p;
889 std::advance(p, 1 + U8_COUNT_TRAIL_BYTES_UNSAFE(b));
890 }
891
 // Moves p backward: decrements at least once, and keeps decrementing
 // while the byte just reached satisfies U8_IS_TRAIL().
892 U_FORCE_INLINE static void dec(UnitIter &p) {
893 // Very similar to U8_BACK_1_UNSAFE().
894 while (U8_IS_TRAIL(*--p)) {}
895 }
896
 // Reads one code point starting at p and leaves p after the units read.
 // For a multi-pass UnitIter the result also records the range [p0, p);
 // for a single-pass UnitIter p0 is not used and the result holds only
 // the code point and the length.
 // The sequence length (1..4) is decided from the first byte alone.
897 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) {
898 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
899 // Very similar to U8_NEXT_UNSAFE().
900 CP32 c = uint8_t(*p);
901 ++p;
902 if (U8_IS_SINGLE(c)) {
903 if constexpr (isMultiPass) {
904 return {c, 1, p0, p};
905 } else {
906 return {c, 1};
907 }
908 } else if (c < 0xe0) {
 // Two units: 5 bits from the first byte, 6 bits from the second.
909 c = ((c & 0x1f) << 6) | (*p & 0x3f);
910 ++p;
911 if constexpr (isMultiPass) {
912 return {c, 2, p0, p};
913 } else {
914 return {c, 2};
915 }
916 } else if (c < 0xf0) {
 // Three units.
917 // No need for (c&0xf) because the upper bits are truncated
918 // after <<12 in the cast to uint16_t.
919 c = uint16_t(c << 12) | ((*p & 0x3f) << 6);
920 ++p;
921 c |= *p & 0x3f;
922 ++p;
923 if constexpr (isMultiPass) {
924 return {c, 3, p0, p};
925 } else {
926 return {c, 3};
927 }
928 } else {
 // Four units: 3 bits from the first byte, 6 bits from each of the
 // following three bytes.
929 c = ((c & 7) << 18) | ((*p & 0x3f) << 12);
930 ++p;
931 c |= (*p & 0x3f) << 6;
932 ++p;
933 c |= *p & 0x3f;
934 ++p;
935 if constexpr (isMultiPass) {
936 return {c, 4, p0, p};
937 } else {
938 return {c, 4};
939 }
940 }
941 }
942
 // Reads the code point that ends just before p and leaves p at its first
 // unit. The result records the range [p, p0), where p0 is the original p.
 // Copies p, so UnitIter must be copyable here.
943 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) {
944 // Very similar to U8_PREV_UNSAFE().
945 UnitIter p0 = p;
946 CP32 c = uint8_t(*--p);
947 if (U8_IS_SINGLE(c)) {
948 return {c, 1, p, p0};
949 }
950 // U8_IS_TRAIL(c) if well-formed
951 c &= 0x3f;
 // count = number of bytes consumed so far that were not the first byte
 // of the sequence; shift = bit position for the next byte's payload.
952 uint8_t count = 1;
953 for (uint8_t shift = 6;;) {
954 uint8_t b = *--p;
955 if (b >= 0xc0) {
 // Reached the first byte of the sequence: mask it according to
 // the number of following bytes, merge it in, and stop.
956 U8_MASK_LEAD_BYTE(b, count);
957 c |= uint32_t{b} << shift;
958 break;
959 } else {
 // Another continuation byte: merge its low 6 bits and continue.
960 c |= (uint32_t{b} & 0x3f) << shift;
961 ++count;
962 shift += 6;
963 }
964 }
 // Account for the first byte so that count is the total length.
965 ++count;
966 return {c, count, p, p0};
967 }
968};
969
970// UTF-16
// Non-validating implementation for iterators whose value type is two bytes
// wide. None of these functions takes a start or limit: a second unit is
// read or skipped based only on the value of the first one, without any
// bounds or pairing checks.
971template<typename CP32, typename UnitIter>
972class UnsafeUTFImpl<
973 CP32,
974 UnitIter,
975 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 2>> {
976 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
977public:
 // Moves p forward by one unit, or by two if the first unit satisfies
 // U16_IS_LEAD().
978 U_FORCE_INLINE static void inc(UnitIter &p) {
979 // Very similar to U16_FWD_1_UNSAFE().
980 auto c = *p;
981 ++p;
982 if (U16_IS_LEAD(c)) {
983 ++p;
984 }
985 }
986
 // Moves p backward by one unit, or by two if the unit just reached
 // satisfies U16_IS_TRAIL().
987 U_FORCE_INLINE static void dec(UnitIter &p) {
988 // Very similar to U16_BACK_1_UNSAFE().
989 if (U16_IS_TRAIL(*--p)) {
990 --p;
991 }
992 }
993
 // Reads one code point starting at p and leaves p after the units read.
 // For a multi-pass UnitIter the result also records the range [p0, p);
 // for a single-pass UnitIter p0 is not used and the result holds only
 // the code point and the length.
994 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) {
995 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
996 // Very similar to U16_NEXT_UNSAFE().
997 CP32 c = static_cast<CP32>(*p);
998 ++p;
999 if (!U16_IS_LEAD(c)) {
 // Single unit: returned as is.
1000 if constexpr (isMultiPass) {
1001 return {c, 1, p0, p};
1002 } else {
1003 return {c, 1};
1004 }
1005 } else {
 // Lead unit: the next unit is read unconditionally and combined with
 // it via U16_GET_SUPPLEMENTARY().
1006 uint16_t c2 = *p;
1007 ++p;
1008 c = U16_GET_SUPPLEMENTARY(c, c2);
1009 if constexpr (isMultiPass) {
1010 return {c, 2, p0, p};
1011 } else {
1012 return {c, 2};
1013 }
1014 }
1015 }
1016
 // Reads the code point that ends just before p and leaves p at its first
 // unit. The result records the range [p, p0), where p0 is the original p.
 // Copies p, so UnitIter must be copyable here.
1017 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) {
1018 // Very similar to U16_PREV_UNSAFE().
1019 UnitIter p0 = p;
1020 CP32 c = static_cast<CP32>(*--p);
1021 if (!U16_IS_TRAIL(c)) {
1022 return {c, 1, p, p0};
1023 } else {
 // Trail unit: the preceding unit is read unconditionally and used as
 // the first argument of U16_GET_SUPPLEMENTARY().
1024 uint16_t c2 = *--p;
1025 c = U16_GET_SUPPLEMENTARY(c2, c);
1026 return {c, 2, p, p0};
1027 }
1028 }
1029};
1030
1031// UTF-32: trivial
// Non-validating implementation for iterators whose value type is four bytes
// wide. Every code point is exactly one unit, and the unit value is returned
// unchanged and unchecked.
1032template<typename CP32, typename UnitIter>
1033class UnsafeUTFImpl<
1034 CP32,
1035 UnitIter,
1036 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 4>> {
1037 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1038public:
 // Moves p forward by one unit.
1039 U_FORCE_INLINE static void inc(UnitIter &p) {
1040 ++p;
1041 }
1042
 // Moves p backward by one unit.
1043 U_FORCE_INLINE static void dec(UnitIter &p) {
1044 --p;
1045 }
1046
 // Reads the unit at p as the code point and advances p by one.
 // For a multi-pass UnitIter the result also records the range [p0, p);
 // for a single-pass UnitIter p0 is not used.
1047 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) {
1048 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
1049 CP32 c = *p;
1050 ++p;
1051 if constexpr (isMultiPass) {
1052 return {c, 1, p0, p};
1053 } else {
1054 return {c, 1};
1055 }
1056 }
1057
 // Moves p back by one unit and reads it as the code point.
 // The result records the range [p, p0), where p0 is the original p.
1058 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) {
1059 UnitIter p0 = p;
1060 CP32 c = *--p;
1061 return {c, 1, p, p0};
1062 }
1063};
1064
1065#endif
1066
1067// Validating iterators ---------------------------------------------------- ***
1068
1092template<typename CP32, UTFIllFormedBehavior behavior,
1093 typename UnitIter, typename LimitIter = UnitIter, typename = void>
1095 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1096 using Impl = UTFImpl<CP32, behavior, UnitIter, LimitIter>;
1097
1098 // Proxy type for operator->() (required by LegacyInputIterator)
1099 // so that we don't promise always returning CodeUnits.
1100 class Proxy {
1101 public:
1102 explicit Proxy(CodeUnits<CP32, UnitIter> &units) : units_(units) {}
1103 CodeUnits<CP32, UnitIter> &operator*() { return units_; }
1104 CodeUnits<CP32, UnitIter> *operator->() { return &units_; }
1105 private:
1107 };
1108
1109public:
1115 using pointer = Proxy;
1119 using iterator_category = std::conditional_t<
1121 std::bidirectional_iterator_tag,
1122 std::forward_iterator_tag>;
1123
1137 U_FORCE_INLINE UTFIterator(UnitIter start, UnitIter p, LimitIter limit) :
1138 p_(p), start_(start), limit_(limit), units_(0, 0, false, p, p) {}
1139
1150 U_FORCE_INLINE UTFIterator(UnitIter p, LimitIter limit) :
1151 p_(p), start_(p), limit_(limit), units_(0, 0, false, p, p) {}
1152
1163 U_FORCE_INLINE explicit UTFIterator(UnitIter p) : p_(p), start_(p), limit_(p), units_(0, 0, false, p, p) {}
1169 U_FORCE_INLINE UTFIterator() : p_{}, start_{}, limit_{}, units_(0, 0, false, p_, p_) {}
1170
1172 U_FORCE_INLINE UTFIterator(UTFIterator &&src) noexcept = default;
1175
1177 U_FORCE_INLINE UTFIterator(const UTFIterator &other) = default;
1180
1186 U_FORCE_INLINE bool operator==(const UTFIterator &other) const {
1187 return getLogicalPosition() == other.getLogicalPosition();
1188 }
1189
1194 U_FORCE_INLINE bool operator!=(const UTFIterator &other) const { return !operator==(other); }
1195
1196 // Asymmetric equality & nonequality with a sentinel type.
1197
1204 template<typename Sentinel> U_FORCE_INLINE friend
1205 std::enable_if_t<
1206 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1207 bool>
1208 operator==(const UTFIterator &iter, const Sentinel &s) {
1209 return iter.getLogicalPosition() == s;
1210 }
1211
1212#if U_CPLUSPLUS_VERSION < 20
1213 // C++17: Need to define all four combinations of == / != vs. parameter order.
1214 // Once we require C++20, we could remove all but the first == because
1215 // the compiler would generate the rest.
1216
1223 template<typename Sentinel> U_FORCE_INLINE friend
1224 std::enable_if_t<
1225 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1226 bool>
1227 operator==(const Sentinel &s, const UTFIterator &iter) {
1228 return iter.getLogicalPosition() == s;
1229 }
1230
1236 template<typename Sentinel> U_FORCE_INLINE friend
1237 std::enable_if_t<
1238 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1239 bool>
1240 operator!=(const UTFIterator &iter, const Sentinel &s) { return !(iter == s); }
1247 template<typename Sentinel> U_FORCE_INLINE friend
1248 std::enable_if_t<
1249 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1250 bool>
1251 operator!=(const Sentinel &s, const UTFIterator &iter) { return !(iter == s); }
1252#endif // C++17
1253
1261 if (state_ == 0) {
1262 UnitIter p0 = p_;
1263 units_ = Impl::readAndInc(p0, p_, limit_);
1264 state_ = 1;
1265 }
1266 return units_;
1267 }
1268
1278 if (state_ == 0) {
1279 UnitIter p0 = p_;
1280 units_ = Impl::readAndInc(p0, p_, limit_);
1281 state_ = 1;
1282 }
1283 return Proxy(units_);
1284 }
1285
1293 if (state_ > 0) {
1294 // operator*() called readAndInc() so p_ is already ahead.
1295 state_ = 0;
1296 } else if (state_ == 0) {
1297 Impl::inc(p_, limit_);
1298 } else /* state_ < 0 */ {
1299 // operator--() called decAndRead() so we know how far to skip.
1300 p_ = units_.end();
1301 state_ = 0;
1302 }
1303 return *this;
1304 }
1305
1314 U_FORCE_INLINE UTFIterator operator++(int) { // post-increment
1315 if (state_ > 0) {
1316 // operator*() called readAndInc() so p_ is already ahead.
1317 UTFIterator result(*this);
1318 state_ = 0;
1319 return result;
1320 } else if (state_ == 0) {
1321 UnitIter p0 = p_;
1322 units_ = Impl::readAndInc(p0, p_, limit_);
1323 UTFIterator result(*this);
1324 result.state_ = 1;
1325 // keep this->state_ == 0
1326 return result;
1327 } else /* state_ < 0 */ {
1328 UTFIterator result(*this);
1329 // operator--() called decAndRead() so we know how far to skip.
1330 p_ = units_.end();
1331 state_ = 0;
1332 return result;
1333 }
1334 }
1335
1343 template<typename Iter = UnitIter>
1345 std::enable_if_t<prv::bidirectional_iterator<Iter>, UTFIterator &>
1346 operator--() { // pre-decrement
1347 if (state_ > 0) {
1348 // operator*() called readAndInc() so p_ is ahead of the logical position.
1349 p_ = units_.begin();
1350 }
1351 units_ = Impl::decAndRead(start_, p_);
1352 state_ = -1;
1353 return *this;
1354 }
1355
1363 template<typename Iter = UnitIter>
1365 std::enable_if_t<prv::bidirectional_iterator<Iter>, UTFIterator>
1366 operator--(int) { // post-decrement
1367 UTFIterator result(*this);
1368 operator--();
1369 return result;
1370 }
1371
1372private:
1373 friend class std::reverse_iterator<UTFIterator<CP32, behavior, UnitIter>>;
1374
1375 U_FORCE_INLINE UnitIter getLogicalPosition() const {
1376 return state_ <= 0 ? p_ : units_.begin();
1377 }
1378
1379 // operator*() etc. are logically const.
1380 mutable UnitIter p_;
1381 // In a validating iterator, we need start_ & limit_ so that when we read a code point
1382 // (forward or backward) we can test if there are enough code units.
1383 UnitIter start_;
1384 LimitIter limit_;
1385 // Keep state so that we call readAndInc() only once for both operator*() and ++
1386 // to make it easy for the compiler to optimize.
1387 mutable CodeUnits<CP32, UnitIter> units_;
1388 // >0: units_ = readAndInc(), p_ = units limit
1389 // which means that p_ is ahead of its logical position
1390 // 0: initial state
1391 // <0: units_ = decAndRead(), p_ = units start
1392 mutable int8_t state_ = 0;
1393};
1394
1395#ifndef U_IN_DOXYGEN
1396// Partial template specialization for single-pass input iterator.
// Selected when prv::forward_iterator<UnitIter> is false.
// This version is an input iterator only (see iterator_category below).
// It keeps the current code unit iterator, the limit, the most recently
// read CodeUnits, and a flag telling whether p_ has already been moved past
// the code point that the iterator logically points to.
1397template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
1398class UTFIterator<
1399 CP32, behavior,
1400 UnitIter, LimitIter,
1401 std::enable_if_t<!prv::forward_iterator<UnitIter>>> {
1402 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1403 using Impl = UTFImpl<CP32, behavior, UnitIter, LimitIter>;
1404
1405 // Proxy type for post-increment return value, to make *iter++ work.
1406 // Also for operator->() (required by LegacyInputIterator)
1407 // so that we don't promise always returning CodeUnits.
 // The proxy stores its own copy of the CodeUnits it is constructed from.
1408 class Proxy {
1409 public:
1410 explicit Proxy(CodeUnits<CP32, UnitIter> &units) : units_(units) {}
1411 CodeUnits<CP32, UnitIter> &operator*() { return units_; }
1412 CodeUnits<CP32, UnitIter> *operator->() { return &units_; }
1413 private:
1414 CodeUnits<CP32, UnitIter> units_;
1415 };
1416
1417public:
 // Standard iterator member types. "reference" is the value type itself
 // because operator*() returns CodeUnits by value.
1418 using value_type = CodeUnits<CP32, UnitIter>;
1419 using reference = value_type;
1420 using pointer = Proxy;
1421 using difference_type = prv::iter_difference_t<UnitIter>;
1422 using iterator_category = std::input_iterator_tag;
1423
 // Constructs an iterator at p that reads up to limit.
 // Both arguments are moved into the corresponding members.
1424 U_FORCE_INLINE UTFIterator(UnitIter p, LimitIter limit) : p_(std::move(p)), limit_(std::move(limit)) {}
1425
1426 // Constructs an iterator start or limit sentinel.
1427 // Requires p to be copyable.
 // limit_ is initialized from the stored p_.
1428 U_FORCE_INLINE explicit UTFIterator(UnitIter p) : p_(std::move(p)), limit_(p_) {}
1429
 // Defaulted move and copy operations.
1430 U_FORCE_INLINE UTFIterator(UTFIterator &&src) noexcept = default;
1431 U_FORCE_INLINE UTFIterator &operator=(UTFIterator &&src) noexcept = default;
1432
1433 U_FORCE_INLINE UTFIterator(const UTFIterator &other) = default;
1434 U_FORCE_INLINE UTFIterator &operator=(const UTFIterator &other) = default;
1435
 // Two iterators compare equal if both their code unit iterators and
 // their read-ahead flags are equal.
1436 U_FORCE_INLINE bool operator==(const UTFIterator &other) const {
1437 return p_ == other.p_ && ahead_ == other.ahead_;
1438 // Strictly speaking, we should check if the logical position is the same.
1439 // However, we cannot advance, or do arithmetic with, a single-pass UnitIter.
1440 }
1441 U_FORCE_INLINE bool operator!=(const UTFIterator &other) const { return !operator==(other); }
1442
 // Comparison with a sentinel of another type (enabled only if Sentinel is
 // neither UTFIterator nor UnitIter). An iterator that has read ahead never
 // equals the sentinel, because it still has a code point to deliver.
1443 template<typename Sentinel> U_FORCE_INLINE friend
1444 std::enable_if_t<
1445 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1446 bool>
1447 operator==(const UTFIterator &iter, const Sentinel &s) {
1448 return !iter.ahead_ && iter.p_ == s;
1449 }
1450
 // Before C++20 the reversed-argument == and both != forms are spelled out
 // explicitly.
1451#if U_CPLUSPLUS_VERSION < 20
1452 template<typename Sentinel> U_FORCE_INLINE friend
1453 std::enable_if_t<
1454 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1455 bool>
1456 operator==(const Sentinel &s, const UTFIterator &iter) {
1457 return !iter.ahead_ && iter.p_ == s;
1458 }
1459
1460 template<typename Sentinel> U_FORCE_INLINE friend
1461 std::enable_if_t<
1462 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1463 bool>
1464 operator!=(const UTFIterator &iter, const Sentinel &s) { return !(iter == s); }
1465
1466 template<typename Sentinel> U_FORCE_INLINE friend
1467 std::enable_if_t<
1468 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1469 bool>
1470 operator!=(const Sentinel &s, const UTFIterator &iter) { return !(iter == s); }
1471#endif // C++17
1472
 // Returns the current code point's CodeUnits by value.
 // The first call at a position reads via Impl::readAndInc(), which advances
 // p_, and caches the result; later calls at the same position return the
 // cached value. p_ is passed for both readAndInc() iterator parameters.
1473 U_FORCE_INLINE CodeUnits<CP32, UnitIter> operator*() const {
1474 if (!ahead_) {
1475 units_ = Impl::readAndInc(p_, p_, limit_);
1476 ahead_ = true;
1477 }
1478 return units_;
1479 }
1480
 // Same lazy read as operator*(), but returns a Proxy holding a copy of the
 // CodeUnits so that iter->member works.
1481 U_FORCE_INLINE Proxy operator->() const {
1482 if (!ahead_) {
1483 units_ = Impl::readAndInc(p_, p_, limit_);
1484 ahead_ = true;
1485 }
1486 return Proxy(units_);
1487 }
1488
 // Advances to the next code point. If the current one was already read,
 // only the flag is cleared; otherwise the code units are skipped via
 // Impl::inc().
1489 U_FORCE_INLINE UTFIterator &operator++() { // pre-increment
1490 if (ahead_) {
1491 // operator*() called readAndInc() so p_ is already ahead.
1492 ahead_ = false;
1493 } else {
1494 Impl::inc(p_, limit_);
1495 }
1496 return *this;
1497 }
1498
 // Advances to the next code point and returns a Proxy holding the CodeUnits
 // of the code point that was current before the call (reading it now if it
 // had not been read yet).
1499 U_FORCE_INLINE Proxy operator++(int) { // post-increment
1500 if (ahead_) {
1501 // operator*() called readAndInc() so p_ is already ahead.
1502 ahead_ = false;
1503 } else {
1504 units_ = Impl::readAndInc(p_, p_, limit_);
1505 // keep this->ahead_ == false
1506 }
1507 return Proxy(units_);
1508 }
1509
1510private:
1511 // operator*() etc. are logically const.
1512 mutable UnitIter p_;
1513 // In a validating iterator, we need limit_ so that when we read a code point
1514 // we can test if there are enough code units.
1515 LimitIter limit_;
1516 // Keep state so that we call readAndInc() only once for both operator*() and ++
1517 // so that we can use a single-pass input iterator for UnitIter.
1518 mutable CodeUnits<CP32, UnitIter> units_ = {0, 0, false};
1519 // true: units_ = readAndInc(), p_ = units limit
1520 // which means that p_ is ahead of its logical position
1521 // false: initial state
1522 mutable bool ahead_ = false;
1523};
1524#endif // U_IN_DOXYGEN
1525
1526} // namespace U_HEADER_ONLY_NAMESPACE
1527
1528#ifndef U_IN_DOXYGEN
1529// Bespoke specialization of reverse_iterator.
1530// The default implementation implements reverse operator*() and ++ in a way
1531// that does most of the same work twice for reading variable-length sequences.
1532template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter>
1533class std::reverse_iterator<U_HEADER_ONLY_NAMESPACE::UTFIterator<CP32, behavior, UnitIter>> {
1534 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1535 using Impl = U_HEADER_ONLY_NAMESPACE::UTFImpl<CP32, behavior, UnitIter>;
1536 using CodeUnits_ = U_HEADER_ONLY_NAMESPACE::CodeUnits<CP32, UnitIter>;
1537
1538 // Proxy type for operator->() (required by LegacyInputIterator)
1539 // so that we don't promise always returning CodeUnits.
1540 class Proxy {
1541 public:
1542 explicit Proxy(CodeUnits_ units) : units_(units) {}
1543 CodeUnits_ &operator*() { return units_; }
1544 CodeUnits_ *operator->() { return &units_; }
1545 private:
1546 CodeUnits_ units_;
1547 };
1548
1549public:
1550 using value_type = CodeUnits_;
1551 using reference = value_type;
1552 using pointer = Proxy;
1553 using difference_type = U_HEADER_ONLY_NAMESPACE::prv::iter_difference_t<UnitIter>;
1554 using iterator_category = std::bidirectional_iterator_tag;
1555
1556 U_FORCE_INLINE explicit reverse_iterator(U_HEADER_ONLY_NAMESPACE::UTFIterator<CP32, behavior, UnitIter> iter) :
1557 p_(iter.getLogicalPosition()), start_(iter.start_), limit_(iter.limit_),
1558 units_(0, 0, false, p_, p_) {}
1559 U_FORCE_INLINE reverse_iterator() : p_{}, start_{}, limit_{}, units_(0, 0, false, p_, p_) {}
1560
1561 U_FORCE_INLINE reverse_iterator(reverse_iterator &&src) noexcept = default;
1562 U_FORCE_INLINE reverse_iterator &operator=(reverse_iterator &&src) noexcept = default;
1563
1564 U_FORCE_INLINE reverse_iterator(const reverse_iterator &other) = default;
1565 U_FORCE_INLINE reverse_iterator &operator=(const reverse_iterator &other) = default;
1566
1567 U_FORCE_INLINE bool operator==(const reverse_iterator &other) const {
1568 return getLogicalPosition() == other.getLogicalPosition();
1569 }
1570 U_FORCE_INLINE bool operator!=(const reverse_iterator &other) const { return !operator==(other); }
1571
1572 U_FORCE_INLINE CodeUnits_ operator*() const {
1573 if (state_ == 0) {
1574 units_ = Impl::decAndRead(start_, p_);
1575 state_ = -1;
1576 }
1577 return units_;
1578 }
1579
1580 U_FORCE_INLINE Proxy operator->() const {
1581 if (state_ == 0) {
1582 units_ = Impl::decAndRead(start_, p_);
1583 state_ = -1;
1584 }
1585 return Proxy(units_);
1586 }
1587
1588 U_FORCE_INLINE reverse_iterator &operator++() { // pre-increment
1589 if (state_ < 0) {
1590 // operator*() called decAndRead() so p_ is already behind.
1591 state_ = 0;
1592 } else if (state_ == 0) {
1593 Impl::dec(start_, p_);
1594 } else /* state_ > 0 */ {
1595 // operator--() called readAndInc() so we know how far to skip.
1596 p_ = units_.begin();
1597 state_ = 0;
1598 }
1599 return *this;
1600 }
1601
1602 U_FORCE_INLINE reverse_iterator operator++(int) { // post-increment
1603 if (state_ < 0) {
1604 // operator*() called decAndRead() so p_ is already behind.
1605 reverse_iterator result(*this);
1606 state_ = 0;
1607 return result;
1608 } else if (state_ == 0) {
1609 units_ = Impl::decAndRead(start_, p_);
1610 reverse_iterator result(*this);
1611 result.state_ = -1;
1612 // keep this->state_ == 0
1613 return result;
1614 } else /* state_ > 0 */ {
1615 reverse_iterator result(*this);
1616 // operator--() called readAndInc() so we know how far to skip.
1617 p_ = units_.begin();
1618 state_ = 0;
1619 return result;
1620 }
1621 }
1622
1623 U_FORCE_INLINE reverse_iterator &operator--() { // pre-decrement
1624 if (state_ < 0) {
1625 // operator*() called decAndRead() so p_ is behind the logical position.
1626 p_ = units_.end();
1627 }
1628 UnitIter p0 = p_;
1629 units_ = Impl::readAndInc(p0, p_, limit_);
1630 state_ = 1;
1631 return *this;
1632 }
1633
1634 U_FORCE_INLINE reverse_iterator operator--(int) { // post-decrement
1635 reverse_iterator result(*this);
1636 operator--();
1637 return result;
1638 }
1639
1640private:
1641 U_FORCE_INLINE UnitIter getLogicalPosition() const {
1642 return state_ >= 0 ? p_ : units_.end();
1643 }
1644
1645 // operator*() etc. are logically const.
1646 mutable UnitIter p_;
1647 // In a validating iterator, we need start_ & limit_ so that when we read a code point
1648 // (forward or backward) we can test if there are enough code units.
1649 UnitIter start_;
1650 UnitIter limit_;
1651 // Keep state so that we call decAndRead() only once for both operator*() and ++
1652 // to make it easy for the compiler to optimize.
1653 mutable CodeUnits_ units_;
1654 // >0: units_ = readAndInc(), p_ = units limit
1655 // 0: initial state
1656 // <0: units_ = decAndRead(), p_ = units start
1657 // which means that p_ is behind its logical position
1658 mutable int8_t state_ = 0;
1659};
1660#endif // U_IN_DOXYGEN
1661
1662namespace U_HEADER_ONLY_NAMESPACE {
1663
// Factory function taking the start of the code units, the current position p,
// and the limit; each argument is moved into the call that builds the result.
// NOTE(review): the line that opens the return expression (between the
// function header and the argument list below) is not present in this listing;
// presumably it names the UTFIterator instantiation -- confirm against the header.
1686template<typename CP32, UTFIllFormedBehavior behavior,
1687 typename UnitIter, typename LimitIter = UnitIter>
1688auto utfIterator(UnitIter start, UnitIter p, LimitIter limit) {
1690 std::move(start), std::move(p), std::move(limit));
1691}
1692
// Factory function taking the current position p and the limit;
// both arguments are moved into the call that builds the result.
// NOTE(review): the line that opens the return expression is not present in
// this listing; presumably it names the UTFIterator instantiation -- confirm
// against the header.
1713template<typename CP32, UTFIllFormedBehavior behavior,
1714 typename UnitIter, typename LimitIter = UnitIter>
1715auto utfIterator(UnitIter p, LimitIter limit) {
1717 std::move(p), std::move(limit));
1718}
1719
1720// Note: We should only enable the following factory function for a copyable UnitIter.
1721// In C++17, we would have to partially specialize with enable_if_t testing for forward_iterator,
1722// but a function template partial specialization is not allowed.
1723// In C++20, we might be able to require the std::copyable concept.
1724
1744template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter>
1745auto utfIterator(UnitIter p) {
1746 return UTFIterator<CP32, behavior, UnitIter>(std::move(p));
1747}
1748
1776template<typename CP32, UTFIllFormedBehavior behavior, typename Range>
1778 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1779public:
1785
// Constructor enabled when Range is not a reference type:
// takes the code unit range by value and moves it into the member.
1791 template<typename R = Range, typename = std::enable_if_t<!std::is_reference_v<R>>>
1792 explicit UTFStringCodePoints(Range unitRange) : unitRange(std::move(unitRange)) {}
// Constructor enabled when Range is a reference type:
// the member reference is bound to the caller's range, without std::move.
// The extra defaulted "typename = void" keeps this template distinct from the one above.
1801 template<typename R = Range, typename = std::enable_if_t<std::is_reference_v<R>>, typename = void>
1802 explicit UTFStringCodePoints(Range unitRange) : unitRange(unitRange) {}
1803
1806
1809
// Returns a code point iterator created by utfIterator()
// from the code unit range's begin() (current position) and end() (limit).
1814 auto begin() {
1815 return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end());
1816 }
1817
// Const overload of begin(); participates in overload resolution
// only if prv::range<const R> is true (R defaults to Range).
1822 template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
1823 auto begin() const {
1824 return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end());
1825 }
1826
1831 auto end() {
1832 using UnitIter = decltype(unitRange.begin());
1833 using LimitIter = decltype(unitRange.end());
1834 if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
1835 // Return the code unit sentinel.
1836 return unitRange.end();
1837 } else if constexpr (prv::bidirectional_iterator<UnitIter>) {
1838 return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end(), unitRange.end());
1839 } else {
1840 // The input iterator specialization has no three-argument constructor.
1841 return utfIterator<CP32, behavior>(unitRange.end(), unitRange.end());
1842 }
1843 }
1844
1849 template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
1850 auto end() const {
1851 using UnitIter = decltype(unitRange.begin());
1852 using LimitIter = decltype(unitRange.end());
1853 if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
1854 // Return the code unit sentinel.
1855 return unitRange.end();
1856 } else if constexpr (prv::bidirectional_iterator<UnitIter>) {
1857 return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end(), unitRange.end());
1858 } else {
1859 // The input iterator specialization has no three-argument constructor.
1860 return utfIterator<CP32, behavior>(unitRange.end(), unitRange.end());
1861 }
1862 }
1863
// Returns the start of reverse iteration: std::make_reverse_iterator() applied to end().
// This function is const, so the const overload of end() is the one called.
1868 auto rbegin() const {
1869 return std::make_reverse_iterator(end());
1870 }
1871
// Returns the end of reverse iteration: std::make_reverse_iterator() applied to begin().
// This function is const, so the const overload of begin() is the one called.
1876 auto rend() const {
1877 return std::make_reverse_iterator(begin());
1878 }
1879
1880private:
1881 Range unitRange;
1882};
1883
1885template<typename CP32, UTFIllFormedBehavior behavior>
1887#if U_CPLUSPLUS_VERSION >= 23 && __cpp_lib_ranges >= 2022'02 && \
1888 __cpp_lib_bind_back >= 2022'02 // http://wg21.link/P2387R3.
1889 : std::ranges::range_adaptor_closure<UTFStringCodePointsAdaptor<CP32, behavior>>
1890#endif
1891{
1893 template<typename Range>
1894 auto operator()(Range &&unitRange) const {
1895#if defined(__cpp_lib_ranges) && __cpp_lib_ranges >= 2021'10 // We need https://wg21.link/P2415R2.
1897 std::forward<Range>(unitRange));
1898#else
1900 // Take basic_string_view by copy, not by reference. In C++20 this is handled by
1901 // all_t<Range>, which is Range if Range is a view.
1903 std::forward<Range>(unitRange));
1904 } else {
1905 return UTFStringCodePoints<CP32, behavior, Range>(std::forward<Range>(unitRange));
1906 }
1907#endif
1908 }
1909};
1910
1925template<typename CP32, UTFIllFormedBehavior behavior>
1927
1928// Non-validating iterators ------------------------------------------------ ***
1929
1951template<typename CP32, typename UnitIter, typename = void>
1953 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1954 using Impl = UnsafeUTFImpl<CP32, UnitIter>;
1955
1956 // Proxy type for operator->() (required by LegacyInputIterator)
1957 // so that we don't promise always returning UnsafeCodeUnits.
1958 class Proxy {
1959 public:
1960 explicit Proxy(UnsafeCodeUnits<CP32, UnitIter> &units) : units_(units) {}
1961 UnsafeCodeUnits<CP32, UnitIter> &operator*() { return units_; }
1962 UnsafeCodeUnits<CP32, UnitIter> *operator->() { return &units_; }
1963 private:
1965 };
1966
1967public:
1973 using pointer = Proxy;
1977 using iterator_category = std::conditional_t<
1979 std::bidirectional_iterator_tag,
1980 std::forward_iterator_tag>;
1981
// Constructs an iterator at code unit position p.
// units_ starts out as a placeholder: code point 0, length 0, start and limit both p.
1991 U_FORCE_INLINE explicit UnsafeUTFIterator(UnitIter p) : p_(p), units_(0, 0, p, p) {}
// Default constructor: p_ is value-initialized and units_ is the same
// kind of placeholder, with start and limit both set to p_.
1997 U_FORCE_INLINE UnsafeUTFIterator() : p_{}, units_(0, 0, p_, p_) {}
1998
2003
2008
2015 return getLogicalPosition() == other.getLogicalPosition();
2016 }
2017
2022 U_FORCE_INLINE bool operator!=(const UnsafeUTFIterator &other) const { return !operator==(other); }
2023
2030 template<typename Sentinel> U_FORCE_INLINE friend
2031 std::enable_if_t<
2032 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2033 bool>
2034 operator==(const UnsafeUTFIterator &iter, const Sentinel &s) {
2035 return iter.getLogicalPosition() == s;
2036 }
2037
2038#if U_CPLUSPLUS_VERSION < 20
2045 template<typename Sentinel> U_FORCE_INLINE friend
2046 std::enable_if_t<
2047 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2048 bool>
2049 operator==(const Sentinel &s, const UnsafeUTFIterator &iter) {
2050 return iter.getLogicalPosition() == s;
2051 }
2052
2058 template<typename Sentinel> U_FORCE_INLINE friend
2059 std::enable_if_t<
2060 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2061 bool>
2062 operator!=(const UnsafeUTFIterator &iter, const Sentinel &s) { return !(iter == s); }
2069 template<typename Sentinel> U_FORCE_INLINE friend
2070 std::enable_if_t<
2071 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2072 bool>
2073 operator!=(const Sentinel &s, const UnsafeUTFIterator &iter) { return !(iter == s); }
2074#endif // C++17
2075
2083 if (state_ == 0) {
2084 UnitIter p0 = p_;
2085 units_ = Impl::readAndInc(p0, p_);
2086 state_ = 1;
2087 }
2088 return units_;
2089 }
2090
2100 if (state_ == 0) {
2101 UnitIter p0 = p_;
2102 units_ = Impl::readAndInc(p0, p_);
2103 state_ = 1;
2104 }
2105 return Proxy(units_);
2106 }
2107
2115 if (state_ > 0) {
2116 // operator*() called readAndInc() so p_ is already ahead.
2117 state_ = 0;
2118 } else if (state_ == 0) {
2119 Impl::inc(p_);
2120 } else /* state_ < 0 */ {
2121 // operator--() called decAndRead() so we know how far to skip.
2122 p_ = units_.end();
2123 state_ = 0;
2124 }
2125 return *this;
2126 }
2127
2137 if (state_ > 0) {
2138 // operator*() called readAndInc() so p_ is already ahead.
2139 UnsafeUTFIterator result(*this);
2140 state_ = 0;
2141 return result;
2142 } else if (state_ == 0) {
2143 UnitIter p0 = p_;
2144 units_ = Impl::readAndInc(p0, p_);
2145 UnsafeUTFIterator result(*this);
2146 result.state_ = 1;
2147 // keep this->state_ == 0
2148 return result;
2149 } else /* state_ < 0 */ {
2150 UnsafeUTFIterator result(*this);
2151 // operator--() called decAndRead() so we know how far to skip.
2152 p_ = units_.end();
2153 state_ = 0;
2154 return result;
2155 }
2156 }
2157
// Pre-decrement; available only when the code unit iterator is bidirectional
// (checked through the defaulted template parameter Iter).
// Decodes the sequence before the logical position and caches it in units_,
// recording that with state_ = -1.
2165 template<typename Iter = UnitIter>
2167 std::enable_if_t<prv::bidirectional_iterator<Iter>, UnsafeUTFIterator &>
2168 operator--() { // pre-decrement
2169 if (state_ > 0) {
2170 // operator*() called readAndInc() so p_ is ahead of the logical position.
2171 p_ = units_.begin();
2172 }
2173 units_ = Impl::decAndRead(p_);
2174 state_ = -1;
2175 return *this;
2176 }
2177
// Post-decrement; available only when the code unit iterator is bidirectional.
// Copies the iterator, applies the pre-decrement to *this, and returns the copy.
2185 template<typename Iter = UnitIter>
2187 std::enable_if_t<prv::bidirectional_iterator<Iter>, UnsafeUTFIterator>
2188 operator--(int) { // post-decrement
2189 UnsafeUTFIterator result(*this);
2190 operator--();
2191 return result;
2192 }
2193
2194private:
2195 friend class std::reverse_iterator<UnsafeUTFIterator<CP32, UnitIter>>;
2196
2197 U_FORCE_INLINE UnitIter getLogicalPosition() const {
2198 return state_ <= 0 ? p_ : units_.begin();
2199 }
2200
2201 // operator*() etc. are logically const.
2202 mutable UnitIter p_;
2203 // Keep state so that we call readAndInc() only once for both operator*() and ++
2204 // to make it easy for the compiler to optimize.
2205 mutable UnsafeCodeUnits<CP32, UnitIter> units_;
2206 // >0: units_ = readAndInc(), p_ = units limit
2207 // which means that p_ is ahead of its logical position
2208 // 0: initial state
2209 // <0: units_ = decAndRead(), p_ = units start
2210 mutable int8_t state_ = 0;
2211};
2212
2213#ifndef U_IN_DOXYGEN
2214// Partial template specialization for single-pass input iterator.
2215template<typename CP32, typename UnitIter>
2216class UnsafeUTFIterator<
2217 CP32,
2218 UnitIter,
2219 std::enable_if_t<!prv::forward_iterator<UnitIter>>> {
2220 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
2221 using Impl = UnsafeUTFImpl<CP32, UnitIter>;
2222
2223 // Proxy type for post-increment return value, to make *iter++ work.
2224 // Also for operator->() (required by LegacyInputIterator)
2225 // so that we don't promise always returning UnsafeCodeUnits.
2226 class Proxy {
2227 public:
2228 explicit Proxy(UnsafeCodeUnits<CP32, UnitIter> &units) : units_(units) {}
2229 UnsafeCodeUnits<CP32, UnitIter> &operator*() { return units_; }
2230 UnsafeCodeUnits<CP32, UnitIter> *operator->() { return &units_; }
2231 private:
2232 UnsafeCodeUnits<CP32, UnitIter> units_;
2233 };
2234
2235public:
2236 using value_type = UnsafeCodeUnits<CP32, UnitIter>;
2237 using reference = value_type;
2238 using pointer = Proxy;
2239 using difference_type = prv::iter_difference_t<UnitIter>;
2240 using iterator_category = std::input_iterator_tag;
2241
2242 U_FORCE_INLINE explicit UnsafeUTFIterator(UnitIter p) : p_(std::move(p)) {}
2243
2244 U_FORCE_INLINE UnsafeUTFIterator(UnsafeUTFIterator &&src) noexcept = default;
2246
2247 U_FORCE_INLINE UnsafeUTFIterator(const UnsafeUTFIterator &other) = default;
2249
2250 U_FORCE_INLINE bool operator==(const UnsafeUTFIterator &other) const {
2251 return p_ == other.p_ && ahead_ == other.ahead_;
2252 // Strictly speaking, we should check if the logical position is the same.
2253 // However, we cannot advance, or do arithmetic with, a single-pass UnitIter.
2254 }
2255 U_FORCE_INLINE bool operator!=(const UnsafeUTFIterator &other) const { return !operator==(other); }
2256
2257 template<typename Sentinel> U_FORCE_INLINE friend
2258 std::enable_if_t<
2259 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2260 bool>
2261 operator==(const UnsafeUTFIterator &iter, const Sentinel &s) {
2262 return !iter.ahead_ && iter.p_ == s;
2263 }
2264
2265#if U_CPLUSPLUS_VERSION < 20
2266 template<typename Sentinel> U_FORCE_INLINE friend
2267 std::enable_if_t<
2268 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2269 bool>
2270 operator==(const Sentinel &s, const UnsafeUTFIterator &iter) {
2271 return !iter.ahead_ && iter.p_ == s;
2272 }
2273
2274 template<typename Sentinel> U_FORCE_INLINE friend
2275 std::enable_if_t<
2276 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2277 bool>
2278 operator!=(const UnsafeUTFIterator &iter, const Sentinel &s) { return !(iter == s); }
2279
2280 template<typename Sentinel> U_FORCE_INLINE friend
2281 std::enable_if_t<
2282 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2283 bool>
2284 operator!=(const Sentinel &s, const UnsafeUTFIterator &iter) { return !(iter == s); }
2285#endif // C++17
2286
2287 U_FORCE_INLINE UnsafeCodeUnits<CP32, UnitIter> operator*() const {
2288 if (!ahead_) {
2289 units_ = Impl::readAndInc(p_, p_);
2290 ahead_ = true;
2291 }
2292 return units_;
2293 }
2294
2295 U_FORCE_INLINE Proxy operator->() const {
2296 if (!ahead_) {
2297 units_ = Impl::readAndInc(p_, p_);
2298 ahead_ = true;
2299 }
2300 return Proxy(units_);
2301 }
2302
2303 U_FORCE_INLINE UnsafeUTFIterator &operator++() { // pre-increment
2304 if (ahead_) {
2305 // operator*() called readAndInc() so p_ is already ahead.
2306 ahead_ = false;
2307 } else {
2308 Impl::inc(p_);
2309 }
2310 return *this;
2311 }
2312
2313 U_FORCE_INLINE Proxy operator++(int) { // post-increment
2314 if (ahead_) {
2315 // operator*() called readAndInc() so p_ is already ahead.
2316 ahead_ = false;
2317 } else {
2318 units_ = Impl::readAndInc(p_, p_);
2319 // keep this->ahead_ == false
2320 }
2321 return Proxy(units_);
2322 }
2323
2324private:
2325 // operator*() etc. are logically const.
2326 mutable UnitIter p_;
2327 // Keep state so that we call readAndInc() only once for both operator*() and ++
2328 // so that we can use a single-pass input iterator for UnitIter.
2329 mutable UnsafeCodeUnits<CP32, UnitIter> units_ = {0, 0};
2330 // true: units_ = readAndInc(), p_ = units limit
2331 // which means that p_ is ahead of its logical position
2332 // false: initial state
2333 mutable bool ahead_ = false;
2334};
2335#endif // U_IN_DOXYGEN
2336
2337} // namespace U_HEADER_ONLY_NAMESPACE
2338
2339#ifndef U_IN_DOXYGEN
2340// Bespoke specialization of reverse_iterator.
2341// The default implementation implements reverse operator*() and ++ in a way
2342// that does most of the same work twice for reading variable-length sequences.
2343template<typename CP32, typename UnitIter>
2344class std::reverse_iterator<U_HEADER_ONLY_NAMESPACE::UnsafeUTFIterator<CP32, UnitIter>> {
2345 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
2346 using Impl = U_HEADER_ONLY_NAMESPACE::UnsafeUTFImpl<CP32, UnitIter>;
2347 using UnsafeCodeUnits_ = U_HEADER_ONLY_NAMESPACE::UnsafeCodeUnits<CP32, UnitIter>;
2348
2349 // Proxy type for operator->() (required by LegacyInputIterator)
2350 // so that we don't promise always returning UnsafeCodeUnits.
2351 class Proxy {
2352 public:
2353 explicit Proxy(UnsafeCodeUnits_ units) : units_(units) {}
2354 UnsafeCodeUnits_ &operator*() { return units_; }
2355 UnsafeCodeUnits_ *operator->() { return &units_; }
2356 private:
2357 UnsafeCodeUnits_ units_;
2358 };
2359
2360public:
2361 using value_type = UnsafeCodeUnits_;
2362 using reference = value_type;
2363 using pointer = Proxy;
2364 using difference_type = U_HEADER_ONLY_NAMESPACE::prv::iter_difference_t<UnitIter>;
2365 using iterator_category = std::bidirectional_iterator_tag;
2366
2367 U_FORCE_INLINE explicit reverse_iterator(U_HEADER_ONLY_NAMESPACE::UnsafeUTFIterator<CP32, UnitIter> iter) :
2368 p_(iter.getLogicalPosition()), units_(0, 0, p_, p_) {}
2369 U_FORCE_INLINE reverse_iterator() : p_{}, units_(0, 0, p_, p_) {}
2370
2371 U_FORCE_INLINE reverse_iterator(reverse_iterator &&src) noexcept = default;
2372 U_FORCE_INLINE reverse_iterator &operator=(reverse_iterator &&src) noexcept = default;
2373
2374 U_FORCE_INLINE reverse_iterator(const reverse_iterator &other) = default;
2375 U_FORCE_INLINE reverse_iterator &operator=(const reverse_iterator &other) = default;
2376
2377 U_FORCE_INLINE bool operator==(const reverse_iterator &other) const {
2378 return getLogicalPosition() == other.getLogicalPosition();
2379 }
2380 U_FORCE_INLINE bool operator!=(const reverse_iterator &other) const { return !operator==(other); }
2381
2382 U_FORCE_INLINE UnsafeCodeUnits_ operator*() const {
2383 if (state_ == 0) {
2384 units_ = Impl::decAndRead(p_);
2385 state_ = -1;
2386 }
2387 return units_;
2388 }
2389
2390 U_FORCE_INLINE Proxy operator->() const {
2391 if (state_ == 0) {
2392 units_ = Impl::decAndRead(p_);
2393 state_ = -1;
2394 }
2395 return Proxy(units_);
2396 }
2397
2398 U_FORCE_INLINE reverse_iterator &operator++() { // pre-increment
2399 if (state_ < 0) {
2400 // operator*() called decAndRead() so p_ is already behind.
2401 state_ = 0;
2402 } else if (state_ == 0) {
2403 Impl::dec(p_);
2404 } else /* state_ > 0 */ {
2405 // operator--() called readAndInc() so we know how far to skip.
2406 p_ = units_.begin();
2407 state_ = 0;
2408 }
2409 return *this;
2410 }
2411
2412 U_FORCE_INLINE reverse_iterator operator++(int) { // post-increment
2413 if (state_ < 0) {
2414 // operator*() called decAndRead() so p_ is already behind.
2415 reverse_iterator result(*this);
2416 state_ = 0;
2417 return result;
2418 } else if (state_ == 0) {
2419 units_ = Impl::decAndRead(p_);
2420 reverse_iterator result(*this);
2421 result.state_ = -1;
2422 // keep this->state_ == 0
2423 return result;
2424 } else /* state_ > 0 */ {
2425 reverse_iterator result(*this);
2426 // operator--() called readAndInc() so we know how far to skip.
2427 p_ = units_.begin();
2428 state_ = 0;
2429 return result;
2430 }
2431 }
2432
2433 U_FORCE_INLINE reverse_iterator &operator--() { // pre-decrement
2434 if (state_ < 0) {
2435 // operator*() called decAndRead() so p_ is behind the logical position.
2436 p_ = units_.end();
2437 }
2438 UnitIter p0 = p_;
2439 units_ = Impl::readAndInc(p0, p_);
2440 state_ = 1;
2441 return *this;
2442 }
2443
2444 U_FORCE_INLINE reverse_iterator operator--(int) { // post-decrement
2445 reverse_iterator result(*this);
2446 operator--();
2447 return result;
2448 }
2449
2450private:
2451 U_FORCE_INLINE UnitIter getLogicalPosition() const {
2452 return state_ >= 0 ? p_ : units_.end();
2453 }
2454
2455 // operator*() etc. are logically const.
2456 mutable UnitIter p_;
2457 // Keep state so that we call decAndRead() only once for both operator*() and ++
2458 // to make it easy for the compiler to optimize.
2459 mutable UnsafeCodeUnits_ units_;
2460 // >0: units_ = readAndInc(), p_ = units limit
2461 // 0: initial state
2462 // <0: units_ = decAndRead(), p_ = units start
2463 // which means that p_ is behind its logical position
2464 mutable int8_t state_ = 0;
2465};
2466#endif // U_IN_DOXYGEN
2467
2468namespace U_HEADER_ONLY_NAMESPACE {
2469
2485template<typename CP32, typename UnitIter>
2486auto unsafeUTFIterator(UnitIter iter) {
2487 return UnsafeUTFIterator<CP32, UnitIter>(std::move(iter));
2488}
2489
2517template<typename CP32, typename Range>
2519 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
2520public:
2526
// Constructor enabled when Range is not a reference type:
// takes the code unit range by value and moves it into the member.
2532 template<typename R = Range, typename = std::enable_if_t<!std::is_reference_v<R>>>
2533 explicit UnsafeUTFStringCodePoints(Range unitRange) : unitRange(std::move(unitRange)) {}
// Constructor enabled when Range is a reference type:
// the member reference is bound to the caller's range, without std::move.
// The extra defaulted "typename = void" keeps this template distinct from the one above.
2542 template<typename R = Range, typename = std::enable_if_t<std::is_reference_v<R>>, typename = void>
2543 explicit UnsafeUTFStringCodePoints(Range unitRange) : unitRange(unitRange) {}
2544
2547
2550
// Returns a non-validating code point iterator created by unsafeUTFIterator()
// from the code unit range's begin(); no limit is passed.
2555 auto begin() {
2556 return unsafeUTFIterator<CP32>(unitRange.begin());
2557 }
2558
// Const overload of begin(); participates in overload resolution
// only if prv::range<const R> is true (R defaults to Range).
2563 template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
2564 auto begin() const {
2565 return unsafeUTFIterator<CP32>(unitRange.begin());
2566 }
2567
2572 auto end() {
2573 using UnitIter = decltype(unitRange.begin());
2574 using LimitIter = decltype(unitRange.end());
2575 if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
2576 // Return the code unit sentinel.
2577 return unitRange.end();
2578 } else {
2579 return unsafeUTFIterator<CP32>(unitRange.end());
2580 }
2581 }
2582
2587 template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
2588 auto end() const {
2589 using UnitIter = decltype(unitRange.begin());
2590 using LimitIter = decltype(unitRange.end());
2591 if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
2592 // Return the code unit sentinel.
2593 return unitRange.end();
2594 } else {
2595 return unsafeUTFIterator<CP32>(unitRange.end());
2596 }
2597 }
2598
// Returns the start of reverse iteration: std::make_reverse_iterator() applied to end().
// This function is const, so the const overload of end() is the one called.
2603 auto rbegin() const {
2604 return std::make_reverse_iterator(end());
2605 }
2606
// Returns the end of reverse iteration: std::make_reverse_iterator() applied to begin().
// This function is const, so the const overload of begin() is the one called.
2611 auto rend() const {
2612 return std::make_reverse_iterator(begin());
2613 }
2614
2615private:
2616 Range unitRange;
2617};
2618
2620template<typename CP32>
2622#if U_CPLUSPLUS_VERSION >= 23 && __cpp_lib_ranges >= 2022'02 && \
2623 __cpp_lib_bind_back >= 2022'02 // http://wg21.link/P2387R3.
2624 : std::ranges::range_adaptor_closure<UnsafeUTFStringCodePointsAdaptor<CP32>>
2625#endif
2626{
2628 template<typename Range>
2629 auto operator()(Range &&unitRange) const {
2630#if defined(__cpp_lib_ranges) && __cpp_lib_ranges >= 2021'10 // We need https://wg21.link/P2415R2.
2631 return UnsafeUTFStringCodePoints<CP32, std::ranges::views::all_t<Range>>(std::forward<Range>(unitRange));
2632#else
2634 // Take basic_string_view by copy, not by reference. In C++20 this is handled by
2635 // all_t<Range>, which is Range if Range is a view.
2636 return UnsafeUTFStringCodePoints<CP32, std::decay_t<Range>>(std::forward<Range>(unitRange));
2637 } else {
2638 return UnsafeUTFStringCodePoints<CP32, Range>(std::forward<Range>(unitRange));
2639 }
2640#endif
2641 }
2642};
2643
2644
2657template<typename CP32>
2659
2660} // namespace U_HEADER_ONLY_NAMESPACE
2661
2662
2663#if defined(__cpp_lib_ranges)
2664template <typename CP32, UTFIllFormedBehavior behavior, typename Range>
2665constexpr bool std::ranges::enable_borrowed_range<
2667 std::ranges::enable_borrowed_range<Range>;
2668
2669template <typename CP32, typename Range>
2670constexpr bool std::ranges::enable_borrowed_range<
2672 std::ranges::enable_borrowed_range<Range>;
2673#endif
2674
2675#endif // U_HIDE_DRAFT_API
2676#endif // U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API
2677#endif // __UTFITERATOR_H__
Result of validating and decoding a code unit sequence for one code point.
CodeUnits & operator=(const CodeUnits &other)=default
Copy assignment operator.
CodeUnits(const CodeUnits &other)=default
Copy constructor.
CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed, UnitIter start, UnitIter limit)
Validating iterator over the code points in a Unicode string.
U_FORCE_INLINE UTFIterator()
Default constructor.
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator!=(const UTFIterator &iter, const Sentinel &s)
U_FORCE_INLINE std::enable_if_t< prv::bidirectional_iterator< Iter >, UTFIterator & > operator--()
Pre-decrement operator.
U_FORCE_INLINE Proxy operator->() const
Decodes the code unit sequence at the current position.
value_type reference
C++ iterator boilerplate.
CodeUnits< CP32, UnitIter > value_type
C++ iterator boilerplate.
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator==(const UTFIterator &iter, const Sentinel &s)
U_FORCE_INLINE UTFIterator(UTFIterator &&src) noexcept=default
Move constructor.
std::conditional_t< prv::bidirectional_iterator< UnitIter >, std::bidirectional_iterator_tag, std::forward_iterator_tag > iterator_category
C++ iterator boilerplate.
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator!=(const Sentinel &s, const UTFIterator &iter)
U_FORCE_INLINE std::enable_if_t< prv::bidirectional_iterator< Iter >, UTFIterator > operator--(int)
Post-decrement operator.
U_FORCE_INLINE UTFIterator & operator++()
Pre-increment operator.
U_FORCE_INLINE UTFIterator & operator=(UTFIterator &&src) noexcept=default
Move assignment operator.
U_FORCE_INLINE UTFIterator(UnitIter p)
Constructs an iterator start or limit sentinel.
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator==(const Sentinel &s, const UTFIterator &iter)
U_FORCE_INLINE CodeUnits< CP32, UnitIter > operator*() const
Decodes the code unit sequence at the current position.
U_FORCE_INLINE bool operator!=(const UTFIterator &other) const
Proxy pointer
C++ iterator boilerplate.
U_FORCE_INLINE UTFIterator(UnitIter start, UnitIter p, LimitIter limit)
Constructor with start <= p < limit.
U_FORCE_INLINE UTFIterator(const UTFIterator &other)=default
Copy constructor.
U_FORCE_INLINE UTFIterator operator++(int)
Post-increment operator.
U_FORCE_INLINE UTFIterator & operator=(const UTFIterator &other)=default
Copy assignment operator.
U_FORCE_INLINE bool operator==(const UTFIterator &other) const
U_FORCE_INLINE UTFIterator(UnitIter p, LimitIter limit)
Constructor with start == p < limit.
prv::iter_difference_t< UnitIter > difference_type
C++ iterator boilerplate.
A C++ "range" for validating iteration over all of the code points of a code unit range.
UTFStringCodePoints()=default
Constructs an empty C++ "range" object.
UTFStringCodePoints & operator=(const UTFStringCodePoints &other)=default
Copy assignment operator.
UTFStringCodePoints(Range unitRange)
Constructs a C++ "range" object over the code points in the string.
UTFStringCodePoints(const UTFStringCodePoints &other)=default
Copy constructor.
UTFStringCodePoints(Range unitRange)
Constructs a C++ "range" object over the code points in the string, keeping a reference to the code unit range.
Result of decoding a code unit sequence for one code point.
std::enable_if_t< std::is_pointer_v< Iter >||std::is_same_v< Iter, typename std::basic_string< Unit >::iterator >||std::is_same_v< Iter, typename std::basic_string< Unit >::const_iterator >||std::is_same_v< Iter, typename std::basic_string_view< Unit >::iterator >||std::is_same_v< Iter, typename std::basic_string_view< Unit >::const_iterator >, std::basic_string_view< Unit > > stringView() const
UnsafeCodeUnits & operator=(const UnsafeCodeUnits &other)=default
Copy assignment operator.
UnsafeCodeUnits(CP32 codePoint, uint8_t length, UnitIter start, UnitIter limit)
UnsafeCodeUnits(const UnsafeCodeUnits &other)=default
Copy constructor.
Non-validating iterator over the code points in a Unicode string.
U_FORCE_INLINE UnsafeUTFIterator()
Default constructor.
UnsafeCodeUnits< CP32, UnitIter > value_type
C++ iterator boilerplate.
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UnsafeUTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator!=(const UnsafeUTFIterator &iter, const Sentinel &s)
U_FORCE_INLINE bool operator!=(const UnsafeUTFIterator &other) const
U_FORCE_INLINE std::enable_if_t< prv::bidirectional_iterator< Iter >, UnsafeUTFIterator > operator--(int)
Post-decrement operator.
value_type reference
C++ iterator boilerplate.
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UnsafeUTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator==(const Sentinel &s, const UnsafeUTFIterator &iter)
U_FORCE_INLINE UnsafeUTFIterator & operator++()
Pre-increment operator.
Proxy pointer
C++ iterator boilerplate.
prv::iter_difference_t< UnitIter > difference_type
C++ iterator boilerplate.
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UnsafeUTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator!=(const Sentinel &s, const UnsafeUTFIterator &iter)
std::conditional_t< prv::bidirectional_iterator< UnitIter >, std::bidirectional_iterator_tag, std::forward_iterator_tag > iterator_category
C++ iterator boilerplate.
U_FORCE_INLINE Proxy operator->() const
Decodes the code unit sequence at the current position.
U_FORCE_INLINE UnsafeUTFIterator(UnsafeUTFIterator &&src) noexcept=default
Move constructor.
U_FORCE_INLINE UnsafeUTFIterator operator++(int)
Post-increment operator.
U_FORCE_INLINE UnsafeUTFIterator & operator=(UnsafeUTFIterator &&src) noexcept=default
Move assignment operator.
U_FORCE_INLINE UnsafeCodeUnits< CP32, UnitIter > operator*() const
Decodes the code unit sequence at the current position.
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UnsafeUTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator==(const UnsafeUTFIterator &iter, const Sentinel &s)
U_FORCE_INLINE UnsafeUTFIterator(const UnsafeUTFIterator &other)=default
Copy constructor.
U_FORCE_INLINE UnsafeUTFIterator & operator=(const UnsafeUTFIterator &other)=default
Copy assignment operator.
U_FORCE_INLINE UnsafeUTFIterator(UnitIter p)
Constructor; the iterator/pointer should be at a code point boundary.
U_FORCE_INLINE std::enable_if_t< prv::bidirectional_iterator< Iter >, UnsafeUTFIterator & > operator--()
Pre-decrement operator.
U_FORCE_INLINE bool operator==(const UnsafeUTFIterator &other) const
A C++ "range" for non-validating iteration over all of the code points of a code unit range.
UnsafeUTFStringCodePoints(Range unitRange)
Constructs a C++ "range" object over the code points in the string, keeping a reference to the code unit range.
UnsafeUTFStringCodePoints(const UnsafeUTFStringCodePoints &other)=default
Copy constructor.
UnsafeUTFStringCodePoints()=default
Constructs an empty C++ "range" object.
UnsafeUTFStringCodePoints & operator=(const UnsafeUTFStringCodePoints &other)=default
Copy assignment operator.
UnsafeUTFStringCodePoints(Range unitRange)
Constructs a C++ "range" object over the code points in the string.
int32_t difference_type
C++ iterator boilerplate.
bool operator==(const CodePointsIterator &other) const
bool operator!=(const CodePointsIterator &other) const
value_type reference
C++ iterator boilerplate.
std::forward_iterator_tag iterator_category
C++ iterator boilerplate.
CP32 * pointer
C++ iterator boilerplate.
U_COMMON_API UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
bool operator!=(const StringPiece &x, const StringPiece &y)
Global operator != for StringPiece.
#define U_CPLUSPLUS_VERSION
0 if no C++; 1, 11, 14, ... if C++.
Definition platform.h:464
#define U_SENTINEL
This value is intended for sentinel values for APIs that (take or) return single code points (UChar32).
Definition umachine.h:469
#define U_FORCE_INLINE
Forces function inlining on compilers that are known to support it.
Definition umachine.h:135
C API: 16-bit Unicode handling macros.
#define U16_IS_SURROGATE_TRAIL(c)
Assuming c is a surrogate code point (U16_IS_SURROGATE(c)), is it a trail surrogate?
Definition utf16.h:93
#define U16_IS_SURROGATE_LEAD(c)
Assuming c is a surrogate code point (U16_IS_SURROGATE(c)), is it a lead surrogate?
Definition utf16.h:84
#define U16_GET_SUPPLEMENTARY(lead, trail)
Get a supplementary code point value (U+10000..U+10ffff) from its lead and trail surrogates.
Definition utf16.h:112
#define U16_IS_SURROGATE(c)
Is this code unit a surrogate (U+d800..U+dfff)?
Definition utf16.h:75
#define U16_IS_LEAD(c)
Is this code unit a lead surrogate (U+d800..U+dbff)?
Definition utf16.h:59
#define U16_IS_TRAIL(c)
Is this code unit a trail surrogate (U+dc00..U+dfff)?
Definition utf16.h:67
C API: 8-bit Unicode handling macros.
#define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte)
Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence.
Definition utf8.h:71
#define U8_IS_VALID_LEAD3_AND_T1(lead, t1)
Internal 3-byte UTF-8 validity check.
Definition utf8.h:98
#define U8_IS_VALID_LEAD4_AND_T1(lead, t1)
Internal 4-byte UTF-8 validity check.
Definition utf8.h:115
#define U8_IS_SINGLE(c)
Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
Definition utf8.h:173
#define U8_LEAD3_T1_BITS
Internal bit vector for 3-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD3_AND_T1.
Definition utf8.h:91
#define U8_LEAD4_T1_BITS
Internal bit vector for 4-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD4_AND_T1.
Definition utf8.h:108
#define U8_IS_LEAD(c)
Is this code unit (byte) a UTF-8 lead byte?
Definition utf8.h:181
#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes)
Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.
Definition utf8.h:81
#define U8_IS_TRAIL(c)
Is this code unit (byte) a UTF-8 trail byte?
Definition utf8.h:190
auto unsafeUTFIterator(UnitIter iter)
UnsafeUTFIterator factory function.
typename std::iterator_traits< Iter >::value_type iter_value_t
constexpr bool is_basic_string_view_v
constexpr bool forward_iterator
auto utfIterator(UnitIter start, UnitIter p, LimitIter limit)
UTFIterator factory function for start <= p < limit.
constexpr UTFStringCodePointsAdaptor< CP32, behavior > utfStringCodePoints
Range adaptor function object returning a UTFStringCodePoints object that represents a "range" of code points in a code unit range.
typename std::iterator_traits< Iter >::difference_type iter_difference_t
constexpr bool bidirectional_iterator
constexpr UnsafeUTFStringCodePointsAdaptor< CP32 > unsafeUTFStringCodePoints
Range adaptor function object returning an UnsafeUTFStringCodePoints object that represents a "range" of code points in a code unit range.
UTFIllFormedBehavior
Some defined behaviors for handling ill-formed Unicode strings.
@ UTF_BEHAVIOR_FFFD
Returns U+FFFD Replacement Character.
@ UTF_BEHAVIOR_SURROGATE
UTF-8: Not allowed; UTF-16: returns the unpaired surrogate; UTF-32: returns the surrogate code point, or U+FFFD if out of range.
@ UTF_BEHAVIOR_NEGATIVE
Returns a negative value (-1=U_SENTINEL) instead of a code point.
Basic definitions for ICU, for both C and C++ APIs.
C API: API for accessing ICU version numbers.