ICU 78.3 78.3
Loading...
Searching...
No Matches
utfiterator.h
Go to the documentation of this file.
1// © 2024 and later: Unicode, Inc. and others.
2// License & terms of use: https://www.unicode.org/copyright.html
3
4// utfiterator.h
5// created: 2024aug12 Markus W. Scherer
6
7#ifndef __UTFITERATOR_H__
8#define __UTFITERATOR_H__
9
10#include "unicode/utypes.h"
11
12#if U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API || !defined(UTYPES_H)
13
14#include <iterator>
15#if defined(__cpp_lib_ranges)
16#include <ranges>
17#endif
18#include <string>
19#include <string_view>
20#include <type_traits>
21#include "unicode/utf16.h"
22#include "unicode/utf8.h"
23#include "unicode/uversion.h"
24
134
135#ifndef U_HIDE_DRAFT_API
136
169
170namespace U_HEADER_ONLY_NAMESPACE {
171
172namespace prv {
173#if U_CPLUSPLUS_VERSION >= 20
174
// C++20 branch (selected by the enclosing U_CPLUSPLUS_VERSION >= 20 check):
// thin aliases that forward to the standard library's iterator/range facilities.

// Value type of an iterator, as reported by std::iter_value_t.
176template<typename Iter>
177using iter_value_t = typename std::iter_value_t<Iter>;
178
// Difference type of an iterator, as reported by std::iter_difference_t.
180template<typename Iter>
181using iter_difference_t = std::iter_difference_t<Iter>;
182
// True if Iter satisfies the std::forward_iterator concept.
184template<typename Iter>
185constexpr bool forward_iterator = std::forward_iterator<Iter>;
186
// True if Iter satisfies the std::bidirectional_iterator concept.
188template<typename Iter>
189constexpr bool bidirectional_iterator = std::bidirectional_iterator<Iter>;
190
// True if Range satisfies the std::ranges::range concept.
192template<typename Range>
193constexpr bool range = std::ranges::range<Range>;
194
195#else
196
// Pre-C++20 fallbacks for the iterator/range helpers, built on std::iterator_traits
// and the detection idiom instead of C++20 concepts.

// Value type of an iterator, taken from std::iterator_traits.
template<typename Iter>
using iter_value_t = typename std::iterator_traits<Iter>::value_type;

// Difference type of an iterator, taken from std::iterator_traits.
template<typename Iter>
using iter_difference_t = typename std::iterator_traits<Iter>::difference_type;

// True if the iterator category of Iter is forward_iterator_tag or derived from it.
template<typename Iter>
constexpr bool forward_iterator =
    std::is_base_of_v<
        std::forward_iterator_tag,
        typename std::iterator_traits<Iter>::iterator_category>;

// True if the iterator category of Iter is bidirectional_iterator_tag or derived from it.
template<typename Iter>
constexpr bool bidirectional_iterator =
    std::is_base_of_v<
        std::bidirectional_iterator_tag,
        typename std::iterator_traits<Iter>::iterator_category>;

// Detection helper: the primary template says "not a range".
template<typename Range, typename = void>
struct range_type : std::false_type {};

// Selected when both Range::begin() and Range::end() are well-formed member calls.
template<typename Range>
struct range_type<
    Range,
    std::void_t<decltype(std::declval<Range>().begin()),
                decltype(std::declval<Range>().end())>> : std::true_type {};

// True if Range has member begin() and end() functions.
template<typename Range>
constexpr bool range = range_type<Range>::value;
234#endif
235
// Type trait: is T an instantiation of std::basic_string_view?
// Primary template: any other type is not. cv-qualified or reference types do not
// match the specialization below and therefore also report false.
template <typename T> struct is_basic_string_view : std::false_type {};

// Partial specialization: matches every std::basic_string_view<...> instantiation.
template <typename... Args>
struct is_basic_string_view<std::basic_string_view<Args...>> : std::true_type {};

// Convenience variable template for is_basic_string_view<T>::value.
template <typename T> constexpr bool is_basic_string_view_v = is_basic_string_view<T>::value;
245
// Forward iterator over consecutive code point values.
// Dereferencing yields the current value; incrementing moves to the next value.
// When skipSurrogates is true, stepping onto 0xd800 jumps directly to 0xe000,
// so the surrogate block is never produced by increments.
template<typename CP32, bool skipSurrogates>
class CodePointsIterator {
    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
public:
    // Iterator boilerplate; all five member types are needed for std::iterator_traits in C++17.
    using value_type = CP32;
    // operator*() returns by value, so the "reference" type is the value type itself.
    using reference = CP32;
    using pointer = CP32 *;
    using difference_type = int32_t;
    using iterator_category = std::forward_iterator_tag;

    // Starts iterating at code point value c.
    inline CodePointsIterator(CP32 c) : c_(c) {}
    // Two iterators are equal when they are at the same value.
    inline bool operator==(const CodePointsIterator &other) const { return c_ == other.c_; }
    inline bool operator!=(const CodePointsIterator &other) const { return !operator==(other); }
    // Returns the current value.
    inline CP32 operator*() const { return c_; }
    inline CodePointsIterator &operator++() {  // pre-increment
        ++c_;
        if (skipSurrogates && c_ == 0xd800) {
            // Skip over the surrogates D800..DFFF.
            c_ = 0xe000;
        }
        return *this;
    }

    inline CodePointsIterator operator++(int) {  // post-increment
        CodePointsIterator result(*this);
        ++(*this);
        return result;
    }

private:
    CP32 c_;
};
288
289} // namespace prv
290
// Range-like type whose begin()/end() iterate over the values 0..0x10ffff
// using prv::CodePointsIterator with skipSurrogates=false (surrogate values are included).
// NOTE(review): the class-head line (numbered 302) is absent from this listing.
301template<typename CP32>
303 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
304public:
// Iterator positioned at value 0.
312 auto begin() const { return prv::CodePointsIterator<CP32, false>(0); }
// Exclusive end iterator at 0x110000.
317 auto end() const { return prv::CodePointsIterator<CP32, false>(0x110000); }
318};
319
// Range-like type whose begin()/end() iterate over the values 0..0x10ffff
// using prv::CodePointsIterator with skipSurrogates=true, so incrementing
// jumps from 0xd7ff directly to 0xe000.
// NOTE(review): the class-head line (numbered 333) is absent from this listing.
332template<typename CP32>
334 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
335public:
// Iterator positioned at value 0.
343 auto begin() const { return prv::CodePointsIterator<CP32, true>(0); }
// Exclusive end iterator at 0x110000.
348 auto end() const { return prv::CodePointsIterator<CP32, true>(0x110000); }
349};
350
// Result of decoding one code point: the code point value, the number of code units,
// and the [start, limit) code unit iterators of the sequence it was read from.
// The third template parameter exists only so that a partial specialization can be
// selected via std::enable_if_t.
// NOTE(review): the class-head line (numbered 367) is absent from this listing;
// the class name UnsafeCodeUnits is taken from the constructors below.
366template<typename CP32, typename UnitIter, typename = void>
368 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
// Code unit type, derived from the unit iterator.
369 using Unit = typename prv::iter_value_t<UnitIter>;
370public:
// Stores the code point, its length in code units, and the sequence boundaries.
372 UnsafeCodeUnits(CP32 codePoint, uint8_t length, UnitIter start, UnitIter limit) :
373 c_(codePoint), len_(length), start_(start), limit_(limit) {}
374
// Defaulted copy construction and copy assignment.
376 UnsafeCodeUnits(const UnsafeCodeUnits &other) = default;
378 UnsafeCodeUnits &operator=(const UnsafeCodeUnits &other) = default;
379
// Returns the stored code point.
387 CP32 codePoint() const { return c_; }
388
// Returns the iterator to the first code unit of the sequence.
394 UnitIter begin() const { return start_; }
395
// Returns the iterator just past the last code unit of the sequence.
401 UnitIter end() const { return limit_; }
402
// Returns the stored number of code units.
407 uint8_t length() const { return len_; }
408
409#if U_CPLUSPLUS_VERSION >= 20
// C++20: available only when UnitIter is a contiguous iterator.
// Builds the view from the [begin(), end()) iterator pair.
415 template<std::contiguous_iterator Iter = UnitIter>
416 std::basic_string_view<Unit> stringView() const {
417 return std::basic_string_view<Unit>(begin(), end());
418 }
419#else
// Pre-C++20: enabled only for pointers and for the iterators of
// std::basic_string / std::basic_string_view.
// Builds the view from the address of the first unit and the stored length.
425 template<typename Iter = UnitIter, typename Unit = typename std::iterator_traits<Iter>::value_type>
426 std::enable_if_t<std::is_pointer_v<Iter> ||
427 std::is_same_v<Iter, typename std::basic_string<Unit>::iterator> ||
428 std::is_same_v<Iter, typename std::basic_string<Unit>::const_iterator> ||
429 std::is_same_v<Iter, typename std::basic_string_view<Unit>::iterator> ||
430 std::is_same_v<Iter, typename std::basic_string_view<Unit>::const_iterator>,
431 std::basic_string_view<Unit>>
432 stringView() const {
433 return std::basic_string_view<Unit>(&*start_, len_);
434 }
435#endif
436
437private:
438 // Order of fields with padding and access frequency in mind.
439 CP32 c_;
440 uint8_t len_;
441 UnitIter start_;
442 UnitIter limit_;
443};
444
445#ifndef U_IN_DOXYGEN
446// Partial template specialization for single-pass input iterator.
447// No UnitIter field, no getter for it, no stringView().
// Selected when UnitIter is not a forward iterator (prv::forward_iterator is false).
// Stores only the code point and its length; there are no iterator members,
// so begin()/end()/stringView() are not provided.
448template<typename CP32, typename UnitIter>
449class UnsafeCodeUnits<
450 CP32,
451 UnitIter,
452 std::enable_if_t<!prv::forward_iterator<UnitIter>>> {
453 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
454public:
// Stores the code point and its length in code units.
455 UnsafeCodeUnits(CP32 codePoint, uint8_t length) : c_(codePoint), len_(length) {}
456
// Defaulted copy construction and copy assignment.
457 UnsafeCodeUnits(const UnsafeCodeUnits &other) = default;
458 UnsafeCodeUnits &operator=(const UnsafeCodeUnits &other) = default;
459
// Returns the stored code point.
460 CP32 codePoint() const { return c_; }
461
// Returns the stored number of code units.
462 uint8_t length() const { return len_; }
463
464private:
465 // Order of fields with padding and access frequency in mind.
466 CP32 c_;
467 uint8_t len_;
468};
469#endif // U_IN_DOXYGEN
470
// Extends UnsafeCodeUnits with a flag telling whether the decoded sequence was well-formed.
// Used as the return type of the validating UTFImpl functions.
486template<typename CP32, typename UnitIter, typename = void>
487class CodeUnits : public UnsafeCodeUnits<CP32, UnitIter> {
488public:
// Forwards code point, length and sequence boundaries to the base class
// and records the well-formedness flag.
490 CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed, UnitIter start, UnitIter limit) :
491 UnsafeCodeUnits<CP32, UnitIter>(codePoint, length, start, limit), ok_(wellFormed) {}
492
// Defaulted copy construction and copy assignment.
494 CodeUnits(const CodeUnits &other) = default;
496 CodeUnits &operator=(const CodeUnits &other) = default;
497
// Returns the well-formedness flag passed to the constructor.
502 bool wellFormed() const { return ok_; }
503
504private:
505 bool ok_;
506};
507
508#ifndef U_IN_DOXYGEN
509// Partial template specialization for single-pass input iterator.
510// No UnitIter field, no getter for it, no stringView().
// Selected when UnitIter is not a forward iterator (prv::forward_iterator is false).
// Same as the primary template, but the constructor takes no iterators because
// the matching UnsafeCodeUnits specialization does not store any.
511template<typename CP32, typename UnitIter>
512class CodeUnits<
513 CP32,
514 UnitIter,
515 std::enable_if_t<!prv::forward_iterator<UnitIter>>> :
516 public UnsafeCodeUnits<CP32, UnitIter> {
517public:
// Forwards code point and length to the base class and records the well-formedness flag.
518 CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed) :
519 UnsafeCodeUnits<CP32, UnitIter>(codePoint, length), ok_(wellFormed) {}
520
// Defaulted copy construction and copy assignment.
521 CodeUnits(const CodeUnits &other) = default;
522 CodeUnits &operator=(const CodeUnits &other) = default;
523
// Returns the well-formedness flag passed to the constructor.
524 bool wellFormed() const { return ok_; }
525
526private:
527 bool ok_;
528};
529#endif // U_IN_DOXYGEN
530
531// Validating implementations ---------------------------------------------- ***
532
533#ifndef U_IN_DOXYGEN
534template<typename CP32, UTFIllFormedBehavior behavior,
535 typename UnitIter, typename LimitIter = UnitIter, typename = void>
536class UTFImpl;
537
538// Note: readAndInc() functions take both a p0 and a p iterator.
539// They must have the same value.
540// For a multi-pass UnitIter, the caller must copy its p into a local variable p0,
541// and readAndInc() copies p0 and the incremented p into the CodeUnits.
542// For a single-pass UnitIter, which may be neither default-constructible nor copyable,
543// the caller can pass p into both references, and readAndInc() does not use p0
544// and constructs CodeUnits without them.
545// Moving the p0 variable into the call site avoids having to declare it inside readAndInc()
546// which may not be possible for a single-pass iterator.
547
548// UTF-8
549template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
550class UTFImpl<
551 CP32, behavior,
552 UnitIter, LimitIter,
553 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 1>> {
554 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
555 static_assert(behavior != UTF_BEHAVIOR_SURROGATE,
556 "For 8-bit strings, the SURROGATE option does not have an equivalent.");
557public:
// Returns the substitute value reported for an ill-formed UTF-8 sequence:
// U_SENTINEL for UTF_BEHAVIOR_NEGATIVE, otherwise 0xfffd.
// The static_assert in the else branch rejects any third behavior at compile time.
558 // Handle ill-formed UTF-8
559 U_FORCE_INLINE static CP32 sub() {
560 if constexpr (behavior == UTF_BEHAVIOR_NEGATIVE) {
561 return U_SENTINEL;
562 } else {
563 static_assert(behavior == UTF_BEHAVIOR_FFFD);
564 return 0xfffd;
565 }
566 }
567
// Advances p past one UTF-8 sequence without computing the code point.
// Always consumes the first byte. Trail bytes are consumed only while they pass the
// U8_IS_VALID_LEAD3_AND_T1 / U8_IS_VALID_LEAD4_AND_T1 / U8_IS_TRAIL checks
// and p has not reached limit.
// The caller must ensure p != limit on entry (the first byte is read unconditionally).
568 U_FORCE_INLINE static void inc(UnitIter &p, const LimitIter &limit) {
569 // Very similar to U8_FWD_1().
570 uint8_t b = *p;
571 ++p;
572 if (U8_IS_LEAD(b) && p != limit) {
573 uint8_t t1 = *p;
// Lead byte of a 3-byte sequence: validate lead+first trail together, then one more trail.
574 if ((0xe0 <= b && b < 0xf0)) {
575 if (U8_IS_VALID_LEAD3_AND_T1(b, t1) &&
576 ++p != limit && U8_IS_TRAIL(*p)) {
577 ++p;
578 }
// Lead byte of a 2-byte sequence: a single trail byte.
579 } else if (b < 0xe0) {
580 if (U8_IS_TRAIL(t1)) {
581 ++p;
582 }
// Lead byte of a 4-byte sequence: validate lead+first trail together, then two more trails.
583 } else /* b >= 0xf0 */ {
584 if (U8_IS_VALID_LEAD4_AND_T1(b, t1) &&
585 ++p != limit && U8_IS_TRAIL(*p) &&
586 ++p != limit && U8_IS_TRAIL(*p)) {
587 ++p;
588 }
589 }
590 }
591 }
592
// Moves p backward over one UTF-8 sequence without computing the code point.
// Always steps back one byte. If that byte is a trail byte, looks back up to three
// more bytes (never before start) for a matching lead byte, and moves p there only
// when the lead/trail combination passes the validity checks.
// Otherwise p stays on the single byte it stepped back to.
// The caller must ensure p != start on entry (the first byte is read unconditionally).
// NOTE(review): the lines numbered 602 and 611 are absent from this listing, so the two
// conditional expressions below are incomplete as shown; restore them from the upstream file.
593 U_FORCE_INLINE static void dec(UnitIter start, UnitIter &p) {
594 // Very similar to U8_BACK_1().
595 uint8_t c = *--p;
596 if (U8_IS_TRAIL(c) && p != start) {
597 UnitIter p1 = p;
598 uint8_t b1 = *--p1;
// One byte back is a lead byte: complete 2-byte sequence or a truncated longer one.
599 if (U8_IS_LEAD(b1)) {
600 if (b1 < 0xe0 ||
601 (b1 < 0xf0 ?
603 U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
604 p = p1;
605 return;
606 }
// Two trail bytes so far: look for a 3- or 4-byte lead.
607 } else if (U8_IS_TRAIL(b1) && p1 != start) {
608 uint8_t b2 = *--p1;
609 if (0xe0 <= b2 && b2 <= 0xf4) {
610 if (b2 < 0xf0 ?
612 U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
613 p = p1;
614 return;
615 }
// Three trail bytes so far: only a 4-byte lead can match.
616 } else if (U8_IS_TRAIL(b2) && p1 != start) {
617 uint8_t b3 = *--p1;
618 if (0xf0 <= b3 && b3 <= 0xf4 && U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
619 p = p1;
620 return;
621 }
622 }
623 }
624 }
625 }
626
// Decodes one UTF-8 sequence starting at p and advances p past the bytes consumed.
// p0 is the caller's copy of the starting position; it is only used (as the start of
// the returned CodeUnits) when UnitIter is a forward iterator.
// Returns {code point, length, true, ...} for a well-formed sequence, or
// {sub(), number of bytes consumed, false, ...} for an ill-formed one.
// The caller must ensure p != limit on entry (the first byte is read unconditionally).
627 U_FORCE_INLINE static CodeUnits<CP32, UnitIter> readAndInc(
628 UnitIter &p0, UnitIter &p, const LimitIter &limit) {
629 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
630 // Very similar to U8_NEXT_OR_FFFD().
631 CP32 c = uint8_t(*p);
632 ++p;
// Fast path: a single-byte sequence.
633 if (U8_IS_SINGLE(c)) {
634 if constexpr (isMultiPass) {
635 return {c, 1, true, p0, p};
636 } else {
637 return {c, 1, true};
638 }
639 }
// length counts the bytes consumed so far; t holds the current trail byte's value bits.
640 uint8_t length = 1;
641 uint8_t t = 0;
// One large condition that fetches, validates and assembles the sequence as it goes;
// side effects (c, t, length, p) happen inside the sub-expressions, so the evaluation
// order of && and ?: is essential.
642 if (p != limit &&
643 // fetch/validate/assemble all but last trail byte
644 (c >= 0xe0 ?
645 (c < 0xf0 ? // U+0800..U+FFFF except surrogates
646 U8_LEAD3_T1_BITS[c &= 0xf] & (1 << ((t = *p) >> 5)) &&
647 (t &= 0x3f, 1)
648 : // U+10000..U+10FFFF
649 (c -= 0xf0) <= 4 &&
650 U8_LEAD4_T1_BITS[(t = *p) >> 4] & (1 << c) &&
651 (c = (c << 6) | (t & 0x3f), ++length, ++p != limit) &&
652 (t = *p - 0x80) <= 0x3f) &&
653 // valid second-to-last trail byte
654 (c = (c << 6) | t, ++length, ++p != limit)
655 : // U+0080..U+07FF
656 c >= 0xc2 && (c &= 0x1f, 1)) &&
657 // last trail byte
658 (t = *p - 0x80) <= 0x3f) {
// Well-formed: fold in the last trail byte and consume it.
659 c = (c << 6) | t;
660 ++length;
661 ++p;
662 if constexpr (isMultiPass) {
663 return {c, length, true, p0, p};
664 } else {
665 return {c, length, true};
666 }
667 }
// Ill-formed: report the substitute value with the number of bytes consumed so far.
668 if constexpr (isMultiPass) {
669 return {sub(), length, false, p0, p};
670 } else {
671 return {sub(), length, false};
672 }
673 }
674
// Moves p backward over one UTF-8 sequence and decodes it.
// On return p is at the start of the sequence; the returned CodeUnits spans [p, p0)
// where p0 is the position on entry.
// Well-formed sequences yield {code point, length, true, ...}; truncated or otherwise
// ill-formed input yields {sub(), length, false, ...}, with length 1 as the final fallback.
// The caller must ensure p != start on entry (the first byte is read unconditionally).
// NOTE(review): the lines numbered 691-692 are absent from this listing, so the
// `else if` condition below is incomplete as shown; restore it from the upstream file.
675 U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter start, UnitIter &p) {
676 // Very similar to U8_PREV_OR_FFFD().
677 UnitIter p0 = p;
678 CP32 c = uint8_t(*--p);
// Fast path: a single-byte sequence.
679 if (U8_IS_SINGLE(c)) {
680 return {c, 1, true, p, p0};
681 }
682 if (U8_IS_TRAIL(c) && p != start) {
683 UnitIter p1 = p;
684 uint8_t b1 = *--p1;
685 if (U8_IS_LEAD(b1)) {
// Lead below 0xe0: complete 2-byte sequence.
686 if (b1 < 0xe0) {
687 p = p1;
688 c = ((b1 - 0xc0) << 6) | (c & 0x3f);
689 return {c, 2, true, p, p0};
690 } else if (b1 < 0xf0 ?
693 // Truncated 3- or 4-byte sequence.
694 p = p1;
695 return {sub(), 2, false, p, p0};
696 }
697 } else if (U8_IS_TRAIL(b1) && p1 != start) {
698 // Extract the value bits from the last trail byte.
699 c &= 0x3f;
700 uint8_t b2 = *--p1;
701 if (0xe0 <= b2 && b2 <= 0xf4) {
702 if (b2 < 0xf0) {
// Keep only the low 4 bits of the 3-byte lead before validating and assembling.
703 b2 &= 0xf;
704 if (U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
705 p = p1;
706 c = (b2 << 12) | ((b1 & 0x3f) << 6) | c;
707 return {c, 3, true, p, p0};
708 }
709 } else if (U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
710 // Truncated 4-byte sequence.
711 p = p1;
712 return {sub(), 3, false, p, p0};
713 }
714 } else if (U8_IS_TRAIL(b2) && p1 != start) {
715 uint8_t b3 = *--p1;
716 if (0xf0 <= b3 && b3 <= 0xf4) {
// Keep only the low 3 bits of the 4-byte lead before validating and assembling.
717 b3 &= 7;
718 if (U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
719 p = p1;
720 c = (b3 << 18) | ((b2 & 0x3f) << 12) | ((b1 & 0x3f) << 6) | c;
721 return {c, 4, true, p, p0};
722 }
723 }
724 }
725 }
726 }
// Nothing matched: report the single byte that was stepped over as ill-formed.
727 return {sub(), 1, false, p, p0};
728 }
729};
730
731// UTF-16
// Validating UTF-16 implementation: selected when the code unit type of UnitIter
// is 2 bytes wide. All functions are static; the class holds no state.
732template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
733class UTFImpl<
734 CP32, behavior,
735 UnitIter, LimitIter,
736 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 2>> {
737 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
738public:
// Returns the value reported for an unpaired surrogate, depending on behavior:
// U_SENTINEL, 0xfffd, or the surrogate code unit itself.
739 // Handle ill-formed UTF-16: One unpaired surrogate.
740 U_FORCE_INLINE static CP32 sub(CP32 surrogate) {
741 if constexpr (behavior == UTF_BEHAVIOR_NEGATIVE) {
742 return U_SENTINEL;
743 } else if constexpr (behavior == UTF_BEHAVIOR_FFFD) {
744 return 0xfffd;
745 } else {
746 static_assert(behavior == UTF_BEHAVIOR_SURROGATE);
747 return surrogate;
748 }
749 }
750
// Advances p past one code point: one unit, or two when a lead unit is
// followed (before limit) by a trail unit. Requires p != limit on entry.
751 U_FORCE_INLINE static void inc(UnitIter &p, const LimitIter &limit) {
752 // Very similar to U16_FWD_1().
753 auto c = *p;
754 ++p;
755 if (U16_IS_LEAD(c) && p != limit && U16_IS_TRAIL(*p)) {
756 ++p;
757 }
758 }
759
// Moves p back over one code point: one unit, or two when the unit stepped over is a
// trail unit preceded (not before start) by a lead unit. Requires p != start on entry.
760 U_FORCE_INLINE static void dec(UnitIter start, UnitIter &p) {
761 // Very similar to U16_BACK_1().
762 UnitIter p1;
763 if (U16_IS_TRAIL(*--p) && p != start && (p1 = p, U16_IS_LEAD(*--p1))) {
764 p = p1;
765 }
766 }
767
// Decodes one code point at p and advances p past it.
// p0 (the caller's copy of the start position) is only used when UnitIter is multi-pass.
// An unpaired surrogate yields {sub(c), 1, false, ...}.
768 U_FORCE_INLINE static CodeUnits<CP32, UnitIter> readAndInc(
769 UnitIter &p0, UnitIter &p, const LimitIter &limit) {
770 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
771 // Very similar to U16_NEXT_OR_FFFD().
772 CP32 c = static_cast<CP32>(*p);
773 ++p;
// Not a surrogate: the unit is the code point.
774 if (!U16_IS_SURROGATE(c)) {
775 if constexpr (isMultiPass) {
776 return {c, 1, true, p0, p};
777 } else {
778 return {c, 1, true};
779 }
780 } else {
781 uint16_t c2;
// Lead surrogate followed by a trail surrogate: combine into a supplementary code point.
782 if (U16_IS_SURROGATE_LEAD(c) && p != limit && U16_IS_TRAIL(c2 = *p)) {
783 ++p;
784 c = U16_GET_SUPPLEMENTARY(c, c2);
785 if constexpr (isMultiPass) {
786 return {c, 2, true, p0, p};
787 } else {
788 return {c, 2, true};
789 }
790 } else {
791 if constexpr (isMultiPass) {
792 return {sub(c), 1, false, p0, p};
793 } else {
794 return {sub(c), 1, false};
795 }
796 }
797 }
798 }
799
// Moves p back over one code point and decodes it; the result spans [p, p0)
// where p0 is the position on entry. An unpaired surrogate yields {sub(c), 1, false, ...}.
800 U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter start, UnitIter &p) {
801 // Very similar to U16_PREV_OR_FFFD().
802 UnitIter p0 = p;
803 CP32 c = static_cast<CP32>(*--p);
804 if (!U16_IS_SURROGATE(c)) {
805 return {c, 1, true, p, p0};
806 } else {
807 UnitIter p1;
808 uint16_t c2;
// Trail surrogate preceded by a lead surrogate: combine (lead first, then trail).
809 if (U16_IS_SURROGATE_TRAIL(c) && p != start && (p1 = p, U16_IS_LEAD(c2 = *--p1))) {
810 p = p1;
811 c = U16_GET_SUPPLEMENTARY(c2, c);
812 return {c, 2, true, p, p0};
813 } else {
814 return {sub(c), 1, false, p, p0};
815 }
816 }
817 }
818};
819
820// UTF-32: trivial, but still validating
// Validating UTF-32 implementation: selected when the code unit type of UnitIter
// is 4 bytes wide. Each code point is exactly one unit; validation only checks the value.
821template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
822class UTFImpl<
823 CP32, behavior,
824 UnitIter, LimitIter,
825 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 4>> {
826 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
827public:
// Returns the value reported for an invalid unit. With UTF_BEHAVIOR_SURROGATE,
// the surrogate itself is returned when forSurrogate is true, otherwise 0xfffd.
828 // Handle ill-formed UTF-32
829 U_FORCE_INLINE static CP32 sub(bool forSurrogate, CP32 surrogate) {
830 if constexpr (behavior == UTF_BEHAVIOR_NEGATIVE) {
831 return U_SENTINEL;
832 } else if constexpr (behavior == UTF_BEHAVIOR_FFFD) {
833 return 0xfffd;
834 } else {
835 static_assert(behavior == UTF_BEHAVIOR_SURROGATE);
836 return forSurrogate ? surrogate : 0xfffd;
837 }
838 }
839
// Advances p by one unit; limit is unused.
840 U_FORCE_INLINE static void inc(UnitIter &p, const LimitIter &/*limit*/) {
841 ++p;
842 }
843
// Moves p back by one unit; start is unused.
844 U_FORCE_INLINE static void dec(UnitIter /*start*/, UnitIter &p) {
845 --p;
846 }
847
// Reads one unit at p and advances p.
// Values below 0xd800 or in 0xe000..0x10ffff are well-formed; anything else is
// reported via sub(), with forSurrogate set when the value is below 0xe000.
// p0 is only used when UnitIter is multi-pass.
848 U_FORCE_INLINE static CodeUnits<CP32, UnitIter> readAndInc(
849 UnitIter &p0, UnitIter &p, const LimitIter &/*limit*/) {
850 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
// Compare as unsigned so that the range test works regardless of the signedness of CP32.
851 uint32_t uc = *p;
852 CP32 c = uc;
853 ++p;
854 if (uc < 0xd800 || (0xe000 <= uc && uc <= 0x10ffff)) {
855 if constexpr (isMultiPass) {
856 return {c, 1, true, p0, p};
857 } else {
858 return {c, 1, true};
859 }
860 } else {
861 if constexpr (isMultiPass) {
862 return {sub(uc < 0xe000, c), 1, false, p0, p};
863 } else {
864 return {sub(uc < 0xe000, c), 1, false};
865 }
866 }
867 }
868
// Moves p back by one unit and reads it, with the same validation as readAndInc().
// The result spans [p, p0) where p0 is the position on entry.
869 U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter /*start*/, UnitIter &p) {
870 UnitIter p0 = p;
871 uint32_t uc = *--p;
872 CP32 c = uc;
873 if (uc < 0xd800 || (0xe000 <= uc && uc <= 0x10ffff)) {
874 return {c, 1, true, p, p0};
875 } else {
876 return {sub(uc < 0xe000, c), 1, false, p, p0};
877 }
878 }
879};
880
881// Non-validating implementations ------------------------------------------ ***
882
883template<typename CP32, typename UnitIter, typename = void>
884class UnsafeUTFImpl;
885
886// UTF-8
// Non-validating UTF-8 implementation: selected when the code unit type of UnitIter
// is 1 byte wide. No function here takes a start or limit; none checks bounds or
// validates trail bytes, so the input must already be well-formed.
887template<typename CP32, typename UnitIter>
888class UnsafeUTFImpl<
889 CP32,
890 UnitIter,
891 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 1>> {
892 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
893public:
// Advances p past one sequence: the first byte plus the trail byte count
// that U8_COUNT_TRAIL_BYTES_UNSAFE derives from it.
894 U_FORCE_INLINE static void inc(UnitIter &p) {
895 // Very similar to U8_FWD_1_UNSAFE().
896 uint8_t b = *p;
897 std::advance(p, 1 + U8_COUNT_TRAIL_BYTES_UNSAFE(b));
898 }
899
// Moves p back until it points at a byte that is not a trail byte.
900 U_FORCE_INLINE static void dec(UnitIter &p) {
901 // Very similar to U8_BACK_1_UNSAFE().
902 while (U8_IS_TRAIL(*--p)) {}
903 }
904
// Decodes one sequence at p and advances p past it. The sequence length is chosen
// from the first byte alone; trail bytes are masked with 0x3f but not checked.
// p0 (the caller's copy of the start position) is only used when UnitIter is multi-pass.
905 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) {
906 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
907 // Very similar to U8_NEXT_UNSAFE().
908 CP32 c = uint8_t(*p);
909 ++p;
910 if (U8_IS_SINGLE(c)) {
911 if constexpr (isMultiPass) {
912 return {c, 1, p0, p};
913 } else {
914 return {c, 1};
915 }
// First byte below 0xe0: 2-byte sequence.
916 } else if (c < 0xe0) {
917 c = ((c & 0x1f) << 6) | (*p & 0x3f);
918 ++p;
919 if constexpr (isMultiPass) {
920 return {c, 2, p0, p};
921 } else {
922 return {c, 2};
923 }
// First byte below 0xf0: 3-byte sequence.
924 } else if (c < 0xf0) {
925 // No need for (c&0xf) because the upper bits are truncated
926 // after <<12 in the cast to uint16_t.
927 c = uint16_t(c << 12) | ((*p & 0x3f) << 6);
928 ++p;
929 c |= *p & 0x3f;
930 ++p;
931 if constexpr (isMultiPass) {
932 return {c, 3, p0, p};
933 } else {
934 return {c, 3};
935 }
// Otherwise: 4-byte sequence.
936 } else {
937 c = ((c & 7) << 18) | ((*p & 0x3f) << 12);
938 ++p;
939 c |= (*p & 0x3f) << 6;
940 ++p;
941 c |= *p & 0x3f;
942 ++p;
943 if constexpr (isMultiPass) {
944 return {c, 4, p0, p};
945 } else {
946 return {c, 4};
947 }
948 }
949 }
950
// Moves p back over one sequence and decodes it; the result spans [p, p0)
// where p0 is the position on entry.
951 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) {
952 // Very similar to U8_PREV_UNSAFE().
953 UnitIter p0 = p;
954 CP32 c = uint8_t(*--p);
955 if (U8_IS_SINGLE(c)) {
956 return {c, 1, p, p0};
957 }
958 // U8_IS_TRAIL(c) if well-formed
959 c &= 0x3f;
// count = number of trail bytes seen so far; shift = bit position for the next byte's bits.
960 uint8_t count = 1;
961 for (uint8_t shift = 6;;) {
962 uint8_t b = *--p;
// A byte >= 0xc0 is treated as the lead byte: mask it according to the
// trail byte count, merge its bits, and stop.
963 if (b >= 0xc0) {
964 U8_MASK_LEAD_BYTE(b, count);
965 c |= uint32_t{b} << shift;
966 break;
967 } else {
968 c |= (uint32_t{b} & 0x3f) << shift;
969 ++count;
970 shift += 6;
971 }
972 }
// Add one for the lead byte so that count becomes the total sequence length.
973 ++count;
974 return {c, count, p, p0};
975 }
976};
977
978// UTF-16
// Non-validating UTF-16 implementation: selected when the code unit type of UnitIter
// is 2 bytes wide. No function here takes a start or limit, and none checks that a
// surrogate is actually paired, so the input must already be well-formed.
979template<typename CP32, typename UnitIter>
980class UnsafeUTFImpl<
981 CP32,
982 UnitIter,
983 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 2>> {
984 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
985public:
// Advances p by one unit, or by two when the first unit passes U16_IS_LEAD.
986 U_FORCE_INLINE static void inc(UnitIter &p) {
987 // Very similar to U16_FWD_1_UNSAFE().
988 auto c = *p;
989 ++p;
990 if (U16_IS_LEAD(c)) {
991 ++p;
992 }
993 }
994
// Moves p back by one unit, or by two when the unit stepped over passes U16_IS_TRAIL.
995 U_FORCE_INLINE static void dec(UnitIter &p) {
996 // Very similar to U16_BACK_1_UNSAFE().
997 if (U16_IS_TRAIL(*--p)) {
998 --p;
999 }
1000 }
1001
// Decodes one code point at p and advances p past it.
// A lead unit is combined with the following unit without checking that unit.
// p0 (the caller's copy of the start position) is only used when UnitIter is multi-pass.
1002 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) {
1003 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
1004 // Very similar to U16_NEXT_UNSAFE().
1005 CP32 c = static_cast<CP32>(*p);
1006 ++p;
1007 if (!U16_IS_LEAD(c)) {
1008 if constexpr (isMultiPass) {
1009 return {c, 1, p0, p};
1010 } else {
1011 return {c, 1};
1012 }
1013 } else {
1014 uint16_t c2 = *p;
1015 ++p;
1016 c = U16_GET_SUPPLEMENTARY(c, c2);
1017 if constexpr (isMultiPass) {
1018 return {c, 2, p0, p};
1019 } else {
1020 return {c, 2};
1021 }
1022 }
1023 }
1024
// Moves p back over one code point and decodes it; the result spans [p, p0)
// where p0 is the position on entry. A trail unit is combined with the preceding
// unit without checking that unit.
1025 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) {
1026 // Very similar to U16_PREV_UNSAFE().
1027 UnitIter p0 = p;
1028 CP32 c = static_cast<CP32>(*--p);
1029 if (!U16_IS_TRAIL(c)) {
1030 return {c, 1, p, p0};
1031 } else {
1032 uint16_t c2 = *--p;
1033 c = U16_GET_SUPPLEMENTARY(c2, c);
1034 return {c, 2, p, p0};
1035 }
1036 }
1037};
1038
1039// UTF-32: trivial
// Non-validating UTF-32 implementation: selected when the code unit type of UnitIter
// is 4 bytes wide. Every unit is returned as a code point without any value check.
1040template<typename CP32, typename UnitIter>
1041class UnsafeUTFImpl<
1042 CP32,
1043 UnitIter,
1044 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 4>> {
1045 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1046public:
// Advances p by one unit.
1047 U_FORCE_INLINE static void inc(UnitIter &p) {
1048 ++p;
1049 }
1050
// Moves p back by one unit.
1051 U_FORCE_INLINE static void dec(UnitIter &p) {
1052 --p;
1053 }
1054
// Reads one unit at p and advances p.
// p0 (the caller's copy of the start position) is only used when UnitIter is multi-pass.
1055 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) {
1056 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
1057 CP32 c = *p;
1058 ++p;
1059 if constexpr (isMultiPass) {
1060 return {c, 1, p0, p};
1061 } else {
1062 return {c, 1};
1063 }
1064 }
1065
// Moves p back by one unit and reads it; the result spans [p, p0)
// where p0 is the position on entry.
1066 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) {
1067 UnitIter p0 = p;
1068 CP32 c = *--p;
1069 return {c, 1, p, p0};
1070 }
1071};
1072
1073#endif
1074
1075// Validating iterators ---------------------------------------------------- ***
1076
1100template<typename CP32, UTFIllFormedBehavior behavior,
1101 typename UnitIter, typename LimitIter = UnitIter, typename = void>
1103 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1104 using Impl = UTFImpl<CP32, behavior, UnitIter, LimitIter>;
1105
1106 // Proxy type for operator->() (required by LegacyInputIterator)
1107 // so that we don't promise always returning CodeUnits.
1108 class Proxy {
1109 public:
1110 explicit Proxy(CodeUnits<CP32, UnitIter> &units) : units_(units) {}
1111 CodeUnits<CP32, UnitIter> &operator*() { return units_; }
1112 CodeUnits<CP32, UnitIter> *operator->() { return &units_; }
1113 private:
1115 };
1116
1117public:
1123 using pointer = Proxy;
1127 using iterator_category = std::conditional_t<
1129 std::bidirectional_iterator_tag,
1130 std::forward_iterator_tag>;
1131
// Constructors. Each one initializes units_ with a placeholder
// (code point 0, length 0, not well-formed, empty range at p); state_ keeps its
// in-class default of 0, which makes the first dereference call readAndInc().

// Iterator at position p inside the range [start, limit).
1145 U_FORCE_INLINE UTFIterator(UnitIter start, UnitIter p, LimitIter limit) :
1146 p_(p), start_(start), limit_(limit), units_(0, 0, false, p, p) {}
1147
// Iterator at p, which is also used as the start of the range; limit ends the range.
1158 U_FORCE_INLINE UTFIterator(UnitIter p, LimitIter limit) :
1159 p_(p), start_(p), limit_(limit), units_(0, 0, false, p, p) {}
1160
// Iterator whose start, position and limit are all p.
1171 U_FORCE_INLINE explicit UTFIterator(UnitIter p) : p_(p), start_(p), limit_(p), units_(0, 0, false, p, p) {}
// Default constructor: value-initializes all iterators.
1177 U_FORCE_INLINE UTFIterator() : p_{}, start_{}, limit_{}, units_(0, 0, false, p_, p_) {}
1178
// Defaulted move constructor.
1180 U_FORCE_INLINE UTFIterator(UTFIterator &&src) noexcept = default;
1183
// Defaulted copy constructor.
1185 U_FORCE_INLINE UTFIterator(const UTFIterator &other) = default;
1188
// Two iterators are equal when their logical positions are equal.
// The logical position compensates for p_ having been advanced by a dereference.
1194 U_FORCE_INLINE bool operator==(const UTFIterator &other) const {
1195 return getLogicalPosition() == other.getLogicalPosition();
1196 }
1197
// Negation of operator==.
1202 U_FORCE_INLINE bool operator!=(const UTFIterator &other) const { return !operator==(other); }
1203
1204 // Asymmetric equality & nonequality with a sentinel type.
1205
// Compares the iterator's logical position with a sentinel.
// Disabled via enable_if when Sentinel is UTFIterator itself or UnitIter.
1212 template<typename Sentinel> U_FORCE_INLINE friend
1213 std::enable_if_t<
1214 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1215 bool>
1216 operator==(const UTFIterator &iter, const Sentinel &s) {
1217 return iter.getLogicalPosition() == s;
1218 }
1219
1220#if U_CPLUSPLUS_VERSION < 20
1221 // C++17: Need to define all four combinations of == / != vs. parameter order.
1222 // Once we require C++20, we could remove all but the first == because
1223 // the compiler would generate the rest.
1224
// Pre-C++20 only: the remaining three sentinel comparisons, each with the same
// enable_if restriction (Sentinel is neither UTFIterator nor UnitIter).

// sentinel == iterator: compares the sentinel with the iterator's logical position.
1231 template<typename Sentinel> U_FORCE_INLINE friend
1232 std::enable_if_t<
1233 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1234 bool>
1235 operator==(const Sentinel &s, const UTFIterator &iter) {
1236 return iter.getLogicalPosition() == s;
1237 }
1238
// iterator != sentinel: negation of iterator == sentinel.
1244 template<typename Sentinel> U_FORCE_INLINE friend
1245 std::enable_if_t<
1246 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1247 bool>
1248 operator!=(const UTFIterator &iter, const Sentinel &s) { return !(iter == s); }
// sentinel != iterator: negation of iterator == sentinel.
1255 template<typename Sentinel> U_FORCE_INLINE friend
1256 std::enable_if_t<
1257 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1258 bool>
1259 operator!=(const Sentinel &s, const UTFIterator &iter) { return !(iter == s); }
1260#endif // C++17
1261
1269 if (state_ == 0) {
1270 UnitIter p0 = p_;
1271 units_ = Impl::readAndInc(p0, p_, limit_);
1272 state_ = 1;
1273 }
1274 return units_;
1275 }
1276
1286 if (state_ == 0) {
1287 UnitIter p0 = p_;
1288 units_ = Impl::readAndInc(p0, p_, limit_);
1289 state_ = 1;
1290 }
1291 return Proxy(units_);
1292 }
1293
1301 if (state_ > 0) {
1302 // operator*() called readAndInc() so p_ is already ahead.
1303 state_ = 0;
1304 } else if (state_ == 0) {
1305 Impl::inc(p_, limit_);
1306 } else /* state_ < 0 */ {
1307 // operator--() called decAndRead() so we know how far to skip.
1308 p_ = units_.end();
1309 state_ = 0;
1310 }
1311 return *this;
1312 }
1313
// Post-increment: returns a copy that refers to the current code point and
// advances this iterator past it. The three branches follow the meaning of state_:
// >0 already read ahead, 0 nothing read yet, <0 last operation was a decrement.
1322 U_FORCE_INLINE UTFIterator operator++(int) { // post-increment
1323 if (state_ > 0) {
1324 // operator*() called readAndInc() so p_ is already ahead.
1325 UTFIterator result(*this);
1326 state_ = 0;
1327 return result;
1328 } else if (state_ == 0) {
// Read the code point now so that the returned copy carries it in units_.
1329 UnitIter p0 = p_;
1330 units_ = Impl::readAndInc(p0, p_, limit_);
1331 UTFIterator result(*this);
// Mark the copy as "read ahead" so that its logical position is units_.begin().
1332 result.state_ = 1;
1333 // keep this->state_ == 0
1334 return result;
1335 } else /* state_ < 0 */ {
1336 UTFIterator result(*this);
1337 // operator--() called decAndRead() so we know how far to skip.
1338 p_ = units_.end();
1339 state_ = 0;
1340 return result;
1341 }
1342 }
1343
// Pre-decrement; only available when the unit iterator is bidirectional.
// Moves to the previous code point and caches it in units_ (state_ becomes -1).
// NOTE(review): the line numbered 1352 is absent from this listing.
1351 template<typename Iter = UnitIter>
1353 std::enable_if_t<prv::bidirectional_iterator<Iter>, UTFIterator &>
1354 operator--() { // pre-decrement
1355 if (state_ > 0) {
1356 // operator*() called readAndInc() so p_ is ahead of the logical position.
1357 p_ = units_.begin();
1358 }
1359 units_ = Impl::decAndRead(start_, p_);
1360 state_ = -1;
1361 return *this;
1362 }
1363
// Post-decrement; only available when the unit iterator is bidirectional.
// Returns a copy of the iterator as it was, then applies the pre-decrement.
// NOTE(review): the line numbered 1372 is absent from this listing.
1371 template<typename Iter = UnitIter>
1373 std::enable_if_t<prv::bidirectional_iterator<Iter>, UTFIterator>
1374 operator--(int) { // post-decrement
1375 UTFIterator result(*this);
1376 operator--();
1377 return result;
1378 }
1379
1380private:
1381 friend class std::reverse_iterator<UTFIterator<CP32, behavior, UnitIter>>;
1382
// Returns the position this iterator logically refers to.
// When state_ > 0, p_ has already been advanced past the cached code point,
// so the start of units_ is the logical position; otherwise it is p_ itself.
1383 U_FORCE_INLINE UnitIter getLogicalPosition() const {
1384 return state_ <= 0 ? p_ : units_.begin();
1385 }
1386
1387 // operator*() etc. are logically const.
1388 mutable UnitIter p_;
1389 // In a validating iterator, we need start_ & limit_ so that when we read a code point
1390 // (forward or backward) we can test if there are enough code units.
1391 UnitIter start_;
1392 LimitIter limit_;
1393 // Keep state so that we call readAndInc() only once for both operator*() and ++
1394 // to make it easy for the compiler to optimize.
1395 mutable CodeUnits<CP32, UnitIter> units_;
1396 // >0: units_ = readAndInc(), p_ = units limit
1397 // which means that p_ is ahead of its logical position
1398 // 0: initial state
1399 // <0: units_ = decAndRead(), p_ = units start
1400 mutable int8_t state_ = 0;
1401};
1402
1403#ifndef U_IN_DOXYGEN
1404// Partial template specialization for single-pass input iterator.
1405template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
1406class UTFIterator<
1407 CP32, behavior,
1408 UnitIter, LimitIter,
1409 std::enable_if_t<!prv::forward_iterator<UnitIter>>> {
1410 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1411 using Impl = UTFImpl<CP32, behavior, UnitIter, LimitIter>;
1412
1413 // Proxy type for post-increment return value, to make *iter++ work.
1414 // Also for operator->() (required by LegacyInputIterator)
1415 // so that we don't promise always returning CodeUnits.
// Holds its own copy of a CodeUnits value (the constructor copies from the reference)
// and exposes it through operator*() and operator->().
1416 class Proxy {
1417 public:
1418 explicit Proxy(CodeUnits<CP32, UnitIter> &units) : units_(units) {}
1419 CodeUnits<CP32, UnitIter> &operator*() { return units_; }
1420 CodeUnits<CP32, UnitIter> *operator->() { return &units_; }
1421 private:
1422 CodeUnits<CP32, UnitIter> units_;
1423 };
1424
1425public:
1426 using value_type = CodeUnits<CP32, UnitIter>;
1427 using reference = value_type;
1428 using pointer = Proxy;
1429 using difference_type = prv::iter_difference_t<UnitIter>;
1430 using iterator_category = std::input_iterator_tag;
1431
// Iterator at p with the given limit; both arguments are moved into the members.
1432 U_FORCE_INLINE UTFIterator(UnitIter p, LimitIter limit) : p_(std::move(p)), limit_(std::move(limit)) {}
1433
1434 // Constructs an iterator start or limit sentinel.
1435 // Requires p to be copyable.
// limit_ is copied from the already-initialized member p_, not from the moved-from parameter.
1436 U_FORCE_INLINE explicit UTFIterator(UnitIter p) : p_(std::move(p)), limit_(p_) {}
1437
// Defaulted move and copy operations.
1438 U_FORCE_INLINE UTFIterator(UTFIterator &&src) noexcept = default;
1439 U_FORCE_INLINE UTFIterator &operator=(UTFIterator &&src) noexcept = default;
1440
1441 U_FORCE_INLINE UTFIterator(const UTFIterator &other) = default;
1442 U_FORCE_INLINE UTFIterator &operator=(const UTFIterator &other) = default;
1443
// Equal when both the underlying unit iterators and the read-ahead flags match.
1444 U_FORCE_INLINE bool operator==(const UTFIterator &other) const {
1445 return p_ == other.p_ && ahead_ == other.ahead_;
1446 // Strictly speaking, we should check if the logical position is the same.
1447 // However, we cannot advance, or do arithmetic with, a single-pass UnitIter.
1448 }
// Negation of operator==.
1449 U_FORCE_INLINE bool operator!=(const UTFIterator &other) const { return !operator==(other); }
1450
// Compares with a sentinel. An iterator that has read ahead (ahead_ is true) never
// compares equal: its cached code point has not been consumed yet.
// Disabled via enable_if when Sentinel is UTFIterator itself or UnitIter.
1451 template<typename Sentinel> U_FORCE_INLINE friend
1452 std::enable_if_t<
1453 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1454 bool>
1455 operator==(const UTFIterator &iter, const Sentinel &s) {
1456 return !iter.ahead_ && iter.p_ == s;
1457 }
1458
1459#if U_CPLUSPLUS_VERSION < 20
// Pre-C++20 only: the remaining three sentinel comparisons, each with the same
// enable_if restriction (Sentinel is neither UTFIterator nor UnitIter).

// sentinel == iterator: same test as iterator == sentinel.
1460 template<typename Sentinel> U_FORCE_INLINE friend
1461 std::enable_if_t<
1462 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1463 bool>
1464 operator==(const Sentinel &s, const UTFIterator &iter) {
1465 return !iter.ahead_ && iter.p_ == s;
1466 }
1467
// iterator != sentinel: negation of iterator == sentinel.
1468 template<typename Sentinel> U_FORCE_INLINE friend
1469 std::enable_if_t<
1470 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1471 bool>
1472 operator!=(const UTFIterator &iter, const Sentinel &s) { return !(iter == s); }
1473
// sentinel != iterator: negation of iterator == sentinel.
1474 template<typename Sentinel> U_FORCE_INLINE friend
1475 std::enable_if_t<
1476 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1477 bool>
1478 operator!=(const Sentinel &s, const UTFIterator &iter) { return !(iter == s); }
1479#endif // C++17
1480
// Returns the CodeUnits for the code point at the current position.
// If nothing has been read ahead yet, reads via Impl::readAndInc(), which leaves p_
// at the limit of the units just read; the result is cached in units_ and ahead_ is set,
// so a repeated dereference returns the cached value without reading again.
// Declared const: p_, units_ and ahead_ are mutable members.
1481 U_FORCE_INLINE CodeUnits<CP32, UnitIter> operator*() const {
1482 if (!ahead_) {
1483 units_ = Impl::readAndInc(p_, p_, limit_);
1484 ahead_ = true;
1485 }
1486 return units_;
1487 }
1488
// Member access: performs the same lazy read and caching as operator*(),
// then returns a Proxy constructed from the cached units_.
1489 U_FORCE_INLINE Proxy operator->() const {
1490 if (!ahead_) {
1491 units_ = Impl::readAndInc(p_, p_, limit_);
1492 ahead_ = true;
1493 }
1494 return Proxy(units_);
1495 }
1496
// Pre-increment: moves to the next code point and returns *this.
// If a dereference already read ahead, only the flag is cleared;
// otherwise Impl::inc() advances p_ and units_ is left untouched.
1497 U_FORCE_INLINE UTFIterator &operator++() { // pre-increment
1498 if (ahead_) {
1499 // operator*() called readAndInc() so p_ is already ahead.
1500 ahead_ = false;
1501 } else {
1502 Impl::inc(p_, limit_);
1503 }
1504 return *this;
1505 }
1506
// Post-increment: moves to the next code point and returns a Proxy constructed from the
// CodeUnits of the code point that was current before the increment.
// If nothing was read ahead, the code point is read here; ahead_ stays false because
// after the read p_ is already at the new logical position.
1507 U_FORCE_INLINE Proxy operator++(int) { // post-increment
1508 if (ahead_) {
1509 // operator*() called readAndInc() so p_ is already ahead.
1510 ahead_ = false;
1511 } else {
1512 units_ = Impl::readAndInc(p_, p_, limit_);
1513 // keep this->ahead_ == false
1514 }
1515 return Proxy(units_);
1516 }
1517
1518private:
1519 // operator*() etc. are logically const.
1520 mutable UnitIter p_;
1521 // In a validating iterator, we need limit_ so that when we read a code point
1522 // we can test if there are enough code units.
1523 LimitIter limit_;
1524 // Keep state so that we call readAndInc() only once for both operator*() and ++
1525 // so that we can use a single-pass input iterator for UnitIter.
1526 mutable CodeUnits<CP32, UnitIter> units_ = {0, 0, false};
1527 // true: units_ = readAndInc(), p_ = units limit
1528 // which means that p_ is ahead of its logical position
1529 // false: initial state
1530 mutable bool ahead_ = false;
1531};
1532#endif // U_IN_DOXYGEN
1533
1534} // namespace U_HEADER_ONLY_NAMESPACE
1535
1536#ifndef U_IN_DOXYGEN
1537// Bespoke specialization of reverse_iterator.
1538// The default implementation implements reverse operator*() and ++ in a way
1539// that does most of the same work twice for reading variable-length sequences.
// Reverse iterator over code points for a validating UTFIterator.
// Incrementing reads code points backward (Impl::decAndRead() / Impl::dec()),
// decrementing reads forward (Impl::readAndInc()).
// state_ records whether units_ currently caches a backward or a forward read.
1540template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter>
1541class std::reverse_iterator<U_HEADER_ONLY_NAMESPACE::UTFIterator<CP32, behavior, UnitIter>> {
1542 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1543 using Impl = U_HEADER_ONLY_NAMESPACE::UTFImpl<CP32, behavior, UnitIter>;
1544 using CodeUnits_ = U_HEADER_ONLY_NAMESPACE::CodeUnits<CP32, UnitIter>;
1545
1546 // Proxy type for operator->() (required by LegacyInputIterator)
1547 // so that we don't promise always returning CodeUnits.
// Holds its own copy of the CodeUnits and exposes it through * and ->.
1548 class Proxy {
1549 public:
1550 explicit Proxy(CodeUnits_ units) : units_(units) {}
1551 CodeUnits_ &operator*() { return units_; }
1552 CodeUnits_ *operator->() { return &units_; }
1553 private:
1554 CodeUnits_ units_;
1555 };
1556
1557public:
// Iterator traits. reference is a value type: dereferencing returns CodeUnits_ by value.
1558 using value_type = CodeUnits_;
1559 using reference = value_type;
1560 using pointer = Proxy;
1561 using difference_type = U_HEADER_ONLY_NAMESPACE::prv::iter_difference_t<UnitIter>;
1562 using iterator_category = std::bidirectional_iterator_tag;
1563
// Starts at the logical position of iter and copies its start_ and limit_ bounds.
// units_ begins as a placeholder: code point 0, length 0, not well-formed, empty span at p_.
1564 U_FORCE_INLINE explicit reverse_iterator(U_HEADER_ONLY_NAMESPACE::UTFIterator<CP32, behavior, UnitIter> iter) :
1565 p_(iter.getLogicalPosition()), start_(iter.start_), limit_(iter.limit_),
1566 units_(0, 0, false, p_, p_) {}
// Default constructor: value-initializes all three code unit iterators.
1567 U_FORCE_INLINE reverse_iterator() : p_{}, start_{}, limit_{}, units_(0, 0, false, p_, p_) {}
1568
1569 U_FORCE_INLINE reverse_iterator(reverse_iterator &&src) noexcept = default;
1570 U_FORCE_INLINE reverse_iterator &operator=(reverse_iterator &&src) noexcept = default;
1571
1572 U_FORCE_INLINE reverse_iterator(const reverse_iterator &other) = default;
1573 U_FORCE_INLINE reverse_iterator &operator=(const reverse_iterator &other) = default;
1574
// Two reverse iterators are equal if their logical positions are equal,
// regardless of what each one has cached.
1575 U_FORCE_INLINE bool operator==(const reverse_iterator &other) const {
1576 return getLogicalPosition() == other.getLogicalPosition();
1577 }
1578 U_FORCE_INLINE bool operator!=(const reverse_iterator &other) const { return !operator==(other); }
1579
// Returns the CodeUnits of the code point that ends at the logical position.
// In the initial state this reads backward once and caches the result (state_ = -1);
// in any other state the cached units_ is returned as is.
1580 U_FORCE_INLINE CodeUnits_ operator*() const {
1581 if (state_ == 0) {
1582 units_ = Impl::decAndRead(start_, p_);
1583 state_ = -1;
1584 }
1585 return units_;
1586 }
1587
// Same lazy read as operator*(), returned wrapped in a Proxy.
1588 U_FORCE_INLINE Proxy operator->() const {
1589 if (state_ == 0) {
1590 units_ = Impl::decAndRead(start_, p_);
1591 state_ = -1;
1592 }
1593 return Proxy(units_);
1594 }
1595
// Pre-increment: moves one code point toward start_ and returns *this.
1596 U_FORCE_INLINE reverse_iterator &operator++() { // pre-increment
1597 if (state_ < 0) {
1598 // operator*() called decAndRead() so p_ is already behind.
1599 state_ = 0;
1600 } else if (state_ == 0) {
1601 Impl::dec(start_, p_);
1602 } else /* state_ > 0 */ {
1603 // operator--() called readAndInc() so we know how far to skip.
1604 p_ = units_.begin();
1605 state_ = 0;
1606 }
1607 return *this;
1608 }
1609
// Post-increment: moves one code point toward start_ and returns a copy that
// represents the position before the move.
1610 U_FORCE_INLINE reverse_iterator operator++(int) { // post-increment
1611 if (state_ < 0) {
1612 // operator*() called decAndRead() so p_ is already behind.
1613 reverse_iterator result(*this);
1614 state_ = 0;
1615 return result;
1616 } else if (state_ == 0) {
// Read now so that the returned copy already carries the decoded units;
// the copy is marked as having read backward.
1617 units_ = Impl::decAndRead(start_, p_);
1618 reverse_iterator result(*this);
1619 result.state_ = -1;
1620 // keep this->state_ == 0
1621 return result;
1622 } else /* state_ > 0 */ {
1623 reverse_iterator result(*this);
1624 // operator--() called readAndInc() so we know how far to skip.
1625 p_ = units_.begin();
1626 state_ = 0;
1627 return result;
1628 }
1629 }
1630
// Pre-decrement: moves one code point toward limit_ by reading forward.
// Afterwards units_ holds that code point and p_ is at its limit (state_ = 1).
1631 U_FORCE_INLINE reverse_iterator &operator--() { // pre-decrement
1632 if (state_ < 0) {
1633 // operator*() called decAndRead() so p_ is behind the logical position.
1634 p_ = units_.end();
1635 }
1636 UnitIter p0 = p_;
1637 units_ = Impl::readAndInc(p0, p_, limit_);
1638 state_ = 1;
1639 return *this;
1640 }
1641
// Post-decrement: returns a copy of the iterator as it was, then applies operator--().
1642 U_FORCE_INLINE reverse_iterator operator--(int) { // post-decrement
1643 reverse_iterator result(*this);
1644 operator--();
1645 return result;
1646 }
1647
1648private:
// Code unit position this iterator logically refers to: p_, unless a backward read
// has been cached (state_ < 0), in which case it is the end of the cached units.
1649 U_FORCE_INLINE UnitIter getLogicalPosition() const {
1650 return state_ >= 0 ? p_ : units_.end();
1651 }
1652
1653 // operator*() etc. are logically const.
1654 mutable UnitIter p_;
1655 // In a validating iterator, we need start_ & limit_ so that when we read a code point
1656 // (forward or backward) we can test if there are enough code units.
1657 UnitIter start_;
1658 UnitIter limit_;
1659 // Keep state so that we call decAndRead() only once for both operator*() and ++
1660 // to make it easy for the compiler to optimize.
1661 mutable CodeUnits_ units_;
1662 // >0: units_ = readAndInc(), p_ = units limit
1663 // 0: initial state
1664 // <0: units_ = decAndRead(), p_ = units start
1665 // which means that p_ is behind its logical position
1666 mutable int8_t state_ = 0;
1667};
1668#endif // U_IN_DOXYGEN
1669
1670namespace U_HEADER_ONLY_NAMESPACE {
1671
// Factory function for a validating code point iterator.
// Deduces UnitIter and LimitIter from the arguments and forwards them (by move) to the
// three-argument UTFIterator constructor: start and limit bound the code units,
// p is the current position.
// CP32 and behavior must be specified explicitly by the caller.
1694template<typename CP32, UTFIllFormedBehavior behavior,
1695 typename UnitIter, typename LimitIter = UnitIter>
1696auto utfIterator(UnitIter start, UnitIter p, LimitIter limit) {
1697 return UTFIterator<CP32, behavior, UnitIter, LimitIter>(
1698 std::move(start), std::move(p), std::move(limit));
1699}
1700
// Factory function for a validating code point iterator that starts at p.
// Deduces UnitIter and LimitIter from the arguments and forwards them (by move) to the
// two-argument UTFIterator constructor, for which p is both the start and the current position.
// CP32 and behavior must be specified explicitly by the caller.
1721template<typename CP32, UTFIllFormedBehavior behavior,
1722 typename UnitIter, typename LimitIter = UnitIter>
1723auto utfIterator(UnitIter p, LimitIter limit) {
1724 return UTFIterator<CP32, behavior, UnitIter, LimitIter>(
1725 std::move(p), std::move(limit));
1726}
1727
1728// Note: We should only enable the following factory function for a copyable UnitIter.
1729// In C++17, we would have to partially specialize with enable_if_t testing for forward_iterator,
1730// but a function template partial specialization is not allowed.
1731// In C++20, we might be able to require the std::copyable concept.
1732
// Factory function taking a single code unit iterator.
// Deduces UnitIter and passes p (moved) to the one-argument UTFIterator constructor.
// As noted in the comment before this function, this is meant only for a copyable UnitIter,
// but that requirement is not enforced here.
1752template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter>
1753auto utfIterator(UnitIter p) {
1754 return UTFIterator<CP32, behavior, UnitIter>(std::move(p));
1755}
1756
1784template<typename CP32, UTFIllFormedBehavior behavior, typename Range>
1786 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1787public:
1793
// Enabled only when Range is not a reference type:
// takes the code unit range by value and moves it into the unitRange member.
1799 template<typename R = Range, typename = std::enable_if_t<!std::is_reference_v<R>>>
1800 explicit UTFStringCodePoints(Range unitRange) : unitRange(std::move(unitRange)) {}
// Enabled only when Range is a reference type: the unitRange member (itself of type Range)
// is bound to the caller's code unit range, which is neither copied nor moved.
// The extra "typename = void" parameter gives this constructor template a signature
// distinct from the by-value constructor template.
1809 template<typename R = Range, typename = std::enable_if_t<std::is_reference_v<R>>, typename = void>
1810 explicit UTFStringCodePoints(Range unitRange) : unitRange(unitRange) {}
1811
1814
1817
// Returns a validating code point iterator positioned at unitRange.begin(),
// with unitRange.end() as its limit.
1822 auto begin() {
1823 return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end());
1824 }
1825
// Const overload of begin(); participates only if const Range is itself a range
// (prv::range<const R>). Same construction as the non-const overload.
1830 template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
1831 auto begin() const {
1832 return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end());
1833 }
1834
// Returns the end of the code point range. The return type depends on the code unit range:
// - begin() and end() of unitRange have different types: the code unit sentinel is returned as is;
// - same type and bidirectional: a UTFIterator built from (begin, end, end),
//   i.e. start at the range's begin, position and limit at its end;
// - otherwise: a UTFIterator built from (end, end).
1839 auto end() {
1840 using UnitIter = decltype(unitRange.begin());
1841 using LimitIter = decltype(unitRange.end());
1842 if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
1843 // Return the code unit sentinel.
1844 return unitRange.end();
1845 } else if constexpr (prv::bidirectional_iterator<UnitIter>) {
1846 return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end(), unitRange.end());
1847 } else {
1848 // The input iterator specialization has no three-argument constructor.
1849 return utfIterator<CP32, behavior>(unitRange.end(), unitRange.end());
1850 }
1851 }
1852
// Const overload of end(); participates only if const Range is itself a range
// (prv::range<const R>). Selects among the same three cases as the non-const overload:
// the bare code unit sentinel, a (begin, end, end) iterator, or an (end, end) iterator.
1857 template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
1858 auto end() const {
1859 using UnitIter = decltype(unitRange.begin());
1860 using LimitIter = decltype(unitRange.end());
1861 if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
1862 // Return the code unit sentinel.
1863 return unitRange.end();
1864 } else if constexpr (prv::bidirectional_iterator<UnitIter>) {
1865 return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end(), unitRange.end());
1866 } else {
1867 // The input iterator specialization has no three-argument constructor.
1868 return utfIterator<CP32, behavior>(unitRange.end(), unitRange.end());
1869 }
1870 }
1871
// Returns std::make_reverse_iterator(end()): a reverse iterator that starts at the
// end of the code point range.
// NOTE(review): presumably only meaningful when end() yields a UTFIterator over
// bidirectional code units rather than a bare sentinel — confirm against callers.
1876 auto rbegin() const {
1877 return std::make_reverse_iterator(end());
1878 }
1879
// Returns std::make_reverse_iterator(begin()): the reverse iterator that marks the
// end of reverse iteration, located at the start of the code point range.
1884 auto rend() const {
1885 return std::make_reverse_iterator(begin());
1886 }
1887
1888private:
1889 Range unitRange;
1890};
1891
1893template<typename CP32, UTFIllFormedBehavior behavior>
1895#if U_CPLUSPLUS_VERSION >= 23 && __cpp_lib_ranges >= 2022'02 && \
1896 __cpp_lib_bind_back >= 2022'02 // http://wg21.link/P2387R3.
1897 : std::ranges::range_adaptor_closure<UTFStringCodePointsAdaptor<CP32, behavior>>
1898#endif
1899{
1901 template<typename Range>
1902 auto operator()(Range &&unitRange) const {
1903#if defined(__cpp_lib_ranges) && __cpp_lib_ranges >= 2021'10 // We need https://wg21.link/P2415R2.
1905 std::forward<Range>(unitRange));
1906#else
1908 // Take basic_string_view by copy, not by reference. In C++20 this is handled by
1909 // all_t<Range>, which is Range if Range is a view.
1911 std::forward<Range>(unitRange));
1912 } else {
1913 return UTFStringCodePoints<CP32, behavior, Range>(std::forward<Range>(unitRange));
1914 }
1915#endif
1916 }
1917};
1918
1933template<typename CP32, UTFIllFormedBehavior behavior>
1935
1936// Non-validating iterators ------------------------------------------------ ***
1937
1959template<typename CP32, typename UnitIter, typename = void>
1961 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1962 using Impl = UnsafeUTFImpl<CP32, UnitIter>;
1963
1964 // Proxy type for operator->() (required by LegacyInputIterator)
1965 // so that we don't promise always returning UnsafeCodeUnits.
1966 class Proxy {
1967 public:
1968 explicit Proxy(UnsafeCodeUnits<CP32, UnitIter> &units) : units_(units) {}
1969 UnsafeCodeUnits<CP32, UnitIter> &operator*() { return units_; }
1970 UnsafeCodeUnits<CP32, UnitIter> *operator->() { return &units_; }
1971 private:
1973 };
1974
1975public:
1981 using pointer = Proxy;
1985 using iterator_category = std::conditional_t<
1987 std::bidirectional_iterator_tag,
1988 std::forward_iterator_tag>;
1989
// Constructs an iterator at code unit position p.
// units_ starts as a placeholder (code point 0, length 0, empty span at p) until the first read.
1999 U_FORCE_INLINE explicit UnsafeUTFIterator(UnitIter p) : p_(p), units_(0, 0, p, p) {}
// Default constructor: value-initializes p_ and sets units_ to the same kind of
// placeholder (code point 0, length 0, empty span at p_).
2005 U_FORCE_INLINE UnsafeUTFIterator() : p_{}, units_(0, 0, p_, p_) {}
2006
2011
2016
2023 return getLogicalPosition() == other.getLogicalPosition();
2024 }
2025
// Inequality is the negation of operator==().
2030 U_FORCE_INLINE bool operator!=(const UnsafeUTFIterator &other) const { return !operator==(other); }
2031
// Compares the iterator with a sentinel whose type is neither UnsafeUTFIterator nor UnitIter
// (other types are excluded via enable_if_t).
// The comparison uses the logical position, so a cached read-ahead does not affect the result.
2038 template<typename Sentinel> U_FORCE_INLINE friend
2039 std::enable_if_t<
2040 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2041 bool>
2042 operator==(const UnsafeUTFIterator &iter, const Sentinel &s) {
2043 return iter.getLogicalPosition() == s;
2044 }
2045
// Before C++20 the language does not derive the reversed (sentinel, iterator) comparison
// or operator!= from operator==, so these overloads are spelled out for C++17 builds.
// Each uses the same enable_if_t restriction on Sentinel as the sentinel operator==.
2046#if U_CPLUSPLUS_VERSION < 20
// Sentinel on the left-hand side; same test as (iterator == sentinel).
2053 template<typename Sentinel> U_FORCE_INLINE friend
2054 std::enable_if_t<
2055 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2056 bool>
2057 operator==(const Sentinel &s, const UnsafeUTFIterator &iter) {
2058 return iter.getLogicalPosition() == s;
2059 }
2060
// iterator != sentinel: negation of (iterator == sentinel).
2066 template<typename Sentinel> U_FORCE_INLINE friend
2067 std::enable_if_t<
2068 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2069 bool>
2070 operator!=(const UnsafeUTFIterator &iter, const Sentinel &s) { return !(iter == s); }
// sentinel != iterator: negation of (iterator == sentinel).
2077 template<typename Sentinel> U_FORCE_INLINE friend
2078 std::enable_if_t<
2079 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2080 bool>
2081 operator!=(const Sentinel &s, const UnsafeUTFIterator &iter) { return !(iter == s); }
2082#endif // C++17
2083
2091 if (state_ == 0) {
2092 UnitIter p0 = p_;
2093 units_ = Impl::readAndInc(p0, p_);
2094 state_ = 1;
2095 }
2096 return units_;
2097 }
2098
2108 if (state_ == 0) {
2109 UnitIter p0 = p_;
2110 units_ = Impl::readAndInc(p0, p_);
2111 state_ = 1;
2112 }
2113 return Proxy(units_);
2114 }
2115
2123 if (state_ > 0) {
2124 // operator*() called readAndInc() so p_ is already ahead.
2125 state_ = 0;
2126 } else if (state_ == 0) {
2127 Impl::inc(p_);
2128 } else /* state_ < 0 */ {
2129 // operator--() called decAndRead() so we know how far to skip.
2130 p_ = units_.end();
2131 state_ = 0;
2132 }
2133 return *this;
2134 }
2135
2145 if (state_ > 0) {
2146 // operator*() called readAndInc() so p_ is already ahead.
2147 UnsafeUTFIterator result(*this);
2148 state_ = 0;
2149 return result;
2150 } else if (state_ == 0) {
2151 UnitIter p0 = p_;
2152 units_ = Impl::readAndInc(p0, p_);
2153 UnsafeUTFIterator result(*this);
2154 result.state_ = 1;
2155 // keep this->state_ == 0
2156 return result;
2157 } else /* state_ < 0 */ {
2158 UnsafeUTFIterator result(*this);
2159 // operator--() called decAndRead() so we know how far to skip.
2160 p_ = units_.end();
2161 state_ = 0;
2162 return result;
2163 }
2164 }
2165
// Pre-decrement; participates only when UnitIter is a bidirectional iterator.
// If a forward read is cached (state_ > 0), p_ is first moved back to the start of those units.
// Impl::decAndRead() then reads the preceding code point into units_ and leaves p_ at its
// start, which is recorded as state_ = -1.
2173 template<typename Iter = UnitIter>
2175 std::enable_if_t<prv::bidirectional_iterator<Iter>, UnsafeUTFIterator &>
2176 operator--() { // pre-decrement
2177 if (state_ > 0) {
2178 // operator*() called readAndInc() so p_ is ahead of the logical position.
2179 p_ = units_.begin();
2180 }
2181 units_ = Impl::decAndRead(p_);
2182 state_ = -1;
2183 return *this;
2184 }
2185
// Post-decrement; participates only when UnitIter is a bidirectional iterator.
// Returns a copy of the iterator as it was, then applies the pre-decrement to *this.
2193 template<typename Iter = UnitIter>
2195 std::enable_if_t<prv::bidirectional_iterator<Iter>, UnsafeUTFIterator>
2196 operator--(int) { // post-decrement
2197 UnsafeUTFIterator result(*this);
2198 operator--();
2199 return result;
2200 }
2201
2202private:
2203 friend class std::reverse_iterator<UnsafeUTFIterator<CP32, UnitIter>>;
2204
// Code unit position this iterator logically refers to: p_, unless a forward read
// has been cached (state_ > 0), in which case it is the start of the cached units.
2205 U_FORCE_INLINE UnitIter getLogicalPosition() const {
2206 return state_ <= 0 ? p_ : units_.begin();
2207 }
2208
2209 // operator*() etc. are logically const.
2210 mutable UnitIter p_;
2211 // Keep state so that we call readAndInc() only once for both operator*() and ++
2212 // to make it easy for the compiler to optimize.
2213 mutable UnsafeCodeUnits<CP32, UnitIter> units_;
2214 // >0: units_ = readAndInc(), p_ = units limit
2215 // which means that p_ is ahead of its logical position
2216 // 0: initial state
2217 // <0: units_ = decAndRead(), p_ = units start
2218 mutable int8_t state_ = 0;
2219};
2220
2221#ifndef U_IN_DOXYGEN
2222// Partial template specialization for single-pass input iterator.
// Non-validating code point iterator for a UnitIter that is not a forward iterator.
// Selected through the third template argument (enable_if_t<!prv::forward_iterator<UnitIter>>).
// Each code point is read at most once; the result is cached in units_ and tracked by ahead_.
2223template<typename CP32, typename UnitIter>
2224class UnsafeUTFIterator<
2225 CP32,
2226 UnitIter,
2227 std::enable_if_t<!prv::forward_iterator<UnitIter>>> {
2228 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
2229 using Impl = UnsafeUTFImpl<CP32, UnitIter>;
2230
2231 // Proxy type for post-increment return value, to make *iter++ work.
2232 // Also for operator->() (required by LegacyInputIterator)
2233 // so that we don't promise always returning UnsafeCodeUnits.
// The constructor takes a reference but stores its own copy of the units.
2234 class Proxy {
2235 public:
2236 explicit Proxy(UnsafeCodeUnits<CP32, UnitIter> &units) : units_(units) {}
2237 UnsafeCodeUnits<CP32, UnitIter> &operator*() { return units_; }
2238 UnsafeCodeUnits<CP32, UnitIter> *operator->() { return &units_; }
2239 private:
2240 UnsafeCodeUnits<CP32, UnitIter> units_;
2241 };
2242
2243public:
// Iterator traits. reference is a value type: dereferencing returns UnsafeCodeUnits by value.
2244 using value_type = UnsafeCodeUnits<CP32, UnitIter>;
2245 using reference = value_type;
2246 using pointer = Proxy;
2247 using difference_type = prv::iter_difference_t<UnitIter>;
2248 using iterator_category = std::input_iterator_tag;
2249
// Constructs an iterator at code unit position p (moved in).
2250 U_FORCE_INLINE explicit UnsafeUTFIterator(UnitIter p) : p_(std::move(p)) {}
2251
// NOTE(review): listing lines 2253 and 2256 are absent from this view; presumably the
// matching move/copy assignment operators — confirm against the original header.
2252 U_FORCE_INLINE UnsafeUTFIterator(UnsafeUTFIterator &&src) noexcept = default;
2254
2255 U_FORCE_INLINE UnsafeUTFIterator(const UnsafeUTFIterator &other) = default;
2257
// Equality of two iterators: the underlying code unit iterators must compare equal
// and both iterators must be in the same read-ahead state.
2258 U_FORCE_INLINE bool operator==(const UnsafeUTFIterator &other) const {
2259 return p_ == other.p_ && ahead_ == other.ahead_;
2260 // Strictly speaking, we should check if the logical position is the same.
2261 // However, we cannot advance, or do arithmetic with, a single-pass UnitIter.
2262 }
2263 U_FORCE_INLINE bool operator!=(const UnsafeUTFIterator &other) const { return !operator==(other); }
2264
// Comparison with a sentinel whose type is neither UnsafeUTFIterator nor UnitIter.
// While ahead_ is true, p_ is ahead of the logical position, so the iterator
// is never reported as equal to the sentinel.
2265 template<typename Sentinel> U_FORCE_INLINE friend
2266 std::enable_if_t<
2267 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2268 bool>
2269 operator==(const UnsafeUTFIterator &iter, const Sentinel &s) {
2270 return !iter.ahead_ && iter.p_ == s;
2271 }
2272
// C++17 only: the reversed and negated sentinel comparisons are written out explicitly.
2273#if U_CPLUSPLUS_VERSION < 20
2274 template<typename Sentinel> U_FORCE_INLINE friend
2275 std::enable_if_t<
2276 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2277 bool>
2278 operator==(const Sentinel &s, const UnsafeUTFIterator &iter) {
2279 return !iter.ahead_ && iter.p_ == s;
2280 }
2281
2282 template<typename Sentinel> U_FORCE_INLINE friend
2283 std::enable_if_t<
2284 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2285 bool>
2286 operator!=(const UnsafeUTFIterator &iter, const Sentinel &s) { return !(iter == s); }
2287
2288 template<typename Sentinel> U_FORCE_INLINE friend
2289 std::enable_if_t<
2290 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2291 bool>
2292 operator!=(const Sentinel &s, const UnsafeUTFIterator &iter) { return !(iter == s); }
2293#endif // C++17
2294
// Returns the UnsafeCodeUnits for the code point at the current position.
// Reads via Impl::readAndInc() only if nothing is cached yet; the read leaves p_ at the
// limit of the units and sets ahead_, so a repeated dereference returns the cached value.
2295 U_FORCE_INLINE UnsafeCodeUnits<CP32, UnitIter> operator*() const {
2296 if (!ahead_) {
2297 units_ = Impl::readAndInc(p_, p_);
2298 ahead_ = true;
2299 }
2300 return units_;
2301 }
2302
// Same lazy read as operator*(), returned wrapped in a Proxy.
2303 U_FORCE_INLINE Proxy operator->() const {
2304 if (!ahead_) {
2305 units_ = Impl::readAndInc(p_, p_);
2306 ahead_ = true;
2307 }
2308 return Proxy(units_);
2309 }
2310
// Pre-increment: moves to the next code point and returns *this.
// Without a cached read, Impl::inc() advances p_ and units_ is left untouched.
2311 U_FORCE_INLINE UnsafeUTFIterator &operator++() { // pre-increment
2312 if (ahead_) {
2313 // operator*() called readAndInc() so p_ is already ahead.
2314 ahead_ = false;
2315 } else {
2316 Impl::inc(p_);
2317 }
2318 return *this;
2319 }
2320
// Post-increment: moves to the next code point and returns a Proxy constructed from the
// units of the code point that was current before the increment.
2321 U_FORCE_INLINE Proxy operator++(int) { // post-increment
2322 if (ahead_) {
2323 // operator*() called readAndInc() so p_ is already ahead.
2324 ahead_ = false;
2325 } else {
2326 units_ = Impl::readAndInc(p_, p_);
2327 // keep this->ahead_ == false
2328 }
2329 return Proxy(units_);
2330 }
2331
2332private:
2333 // operator*() etc. are logically const.
2334 mutable UnitIter p_;
2335 // Keep state so that we call readAndInc() only once for both operator*() and ++
2336 // so that we can use a single-pass input iterator for UnitIter.
2337 mutable UnsafeCodeUnits<CP32, UnitIter> units_ = {0, 0};
2338 // true: units_ = readAndInc(), p_ = units limit
2339 // which means that p_ is ahead of its logical position
2340 // false: initial state
2341 mutable bool ahead_ = false;
2342};
2343#endif // U_IN_DOXYGEN
2344
2345} // namespace U_HEADER_ONLY_NAMESPACE
2346
2347#ifndef U_IN_DOXYGEN
2348// Bespoke specialization of reverse_iterator.
2349// The default implementation implements reverse operator*() and ++ in a way
2350// that does most of the same work twice for reading variable-length sequences.
// Reverse iterator over code points for a non-validating UnsafeUTFIterator.
// Incrementing reads code points backward (Impl::decAndRead() / Impl::dec()),
// decrementing reads forward (Impl::readAndInc()). No start or limit bounds are kept.
// state_ records whether units_ currently caches a backward or a forward read.
2351template<typename CP32, typename UnitIter>
2352class std::reverse_iterator<U_HEADER_ONLY_NAMESPACE::UnsafeUTFIterator<CP32, UnitIter>> {
2353 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
2354 using Impl = U_HEADER_ONLY_NAMESPACE::UnsafeUTFImpl<CP32, UnitIter>;
2355 using UnsafeCodeUnits_ = U_HEADER_ONLY_NAMESPACE::UnsafeCodeUnits<CP32, UnitIter>;
2356
2357 // Proxy type for operator->() (required by LegacyInputIterator)
2358 // so that we don't promise always returning UnsafeCodeUnits.
// Holds its own copy of the units and exposes it through * and ->.
2359 class Proxy {
2360 public:
2361 explicit Proxy(UnsafeCodeUnits_ units) : units_(units) {}
2362 UnsafeCodeUnits_ &operator*() { return units_; }
2363 UnsafeCodeUnits_ *operator->() { return &units_; }
2364 private:
2365 UnsafeCodeUnits_ units_;
2366 };
2367
2368public:
// Iterator traits. reference is a value type: dereferencing returns UnsafeCodeUnits_ by value.
2369 using value_type = UnsafeCodeUnits_;
2370 using reference = value_type;
2371 using pointer = Proxy;
2372 using difference_type = U_HEADER_ONLY_NAMESPACE::prv::iter_difference_t<UnitIter>;
2373 using iterator_category = std::bidirectional_iterator_tag;
2374
// Starts at the logical position of iter.
// units_ begins as a placeholder: code point 0, length 0, empty span at p_.
2375 U_FORCE_INLINE explicit reverse_iterator(U_HEADER_ONLY_NAMESPACE::UnsafeUTFIterator<CP32, UnitIter> iter) :
2376 p_(iter.getLogicalPosition()), units_(0, 0, p_, p_) {}
// Default constructor: value-initializes p_.
2377 U_FORCE_INLINE reverse_iterator() : p_{}, units_(0, 0, p_, p_) {}
2378
2379 U_FORCE_INLINE reverse_iterator(reverse_iterator &&src) noexcept = default;
2380 U_FORCE_INLINE reverse_iterator &operator=(reverse_iterator &&src) noexcept = default;
2381
2382 U_FORCE_INLINE reverse_iterator(const reverse_iterator &other) = default;
2383 U_FORCE_INLINE reverse_iterator &operator=(const reverse_iterator &other) = default;
2384
// Two reverse iterators are equal if their logical positions are equal,
// regardless of what each one has cached.
2385 U_FORCE_INLINE bool operator==(const reverse_iterator &other) const {
2386 return getLogicalPosition() == other.getLogicalPosition();
2387 }
2388 U_FORCE_INLINE bool operator!=(const reverse_iterator &other) const { return !operator==(other); }
2389
// Returns the units of the code point that ends at the logical position.
// In the initial state this reads backward once and caches the result (state_ = -1);
// in any other state the cached units_ is returned as is.
2390 U_FORCE_INLINE UnsafeCodeUnits_ operator*() const {
2391 if (state_ == 0) {
2392 units_ = Impl::decAndRead(p_);
2393 state_ = -1;
2394 }
2395 return units_;
2396 }
2397
// Same lazy read as operator*(), returned wrapped in a Proxy.
2398 U_FORCE_INLINE Proxy operator->() const {
2399 if (state_ == 0) {
2400 units_ = Impl::decAndRead(p_);
2401 state_ = -1;
2402 }
2403 return Proxy(units_);
2404 }
2405
// Pre-increment: moves one code point backward in the code units and returns *this.
2406 U_FORCE_INLINE reverse_iterator &operator++() { // pre-increment
2407 if (state_ < 0) {
2408 // operator*() called decAndRead() so p_ is already behind.
2409 state_ = 0;
2410 } else if (state_ == 0) {
2411 Impl::dec(p_);
2412 } else /* state_ > 0 */ {
2413 // operator--() called readAndInc() so we know how far to skip.
2414 p_ = units_.begin();
2415 state_ = 0;
2416 }
2417 return *this;
2418 }
2419
// Post-increment: moves one code point backward and returns a copy that
// represents the position before the move.
2420 U_FORCE_INLINE reverse_iterator operator++(int) { // post-increment
2421 if (state_ < 0) {
2422 // operator*() called decAndRead() so p_ is already behind.
2423 reverse_iterator result(*this);
2424 state_ = 0;
2425 return result;
2426 } else if (state_ == 0) {
// Read now so that the returned copy already carries the decoded units;
// the copy is marked as having read backward.
2427 units_ = Impl::decAndRead(p_);
2428 reverse_iterator result(*this);
2429 result.state_ = -1;
2430 // keep this->state_ == 0
2431 return result;
2432 } else /* state_ > 0 */ {
2433 reverse_iterator result(*this);
2434 // operator--() called readAndInc() so we know how far to skip.
2435 p_ = units_.begin();
2436 state_ = 0;
2437 return result;
2438 }
2439 }
2440
// Pre-decrement: moves one code point forward in the code units by reading forward.
// Afterwards units_ holds that code point and p_ is at its limit (state_ = 1).
2441 U_FORCE_INLINE reverse_iterator &operator--() { // pre-decrement
2442 if (state_ < 0) {
2443 // operator*() called decAndRead() so p_ is behind the logical position.
2444 p_ = units_.end();
2445 }
2446 UnitIter p0 = p_;
2447 units_ = Impl::readAndInc(p0, p_);
2448 state_ = 1;
2449 return *this;
2450 }
2451
// Post-decrement: returns a copy of the iterator as it was, then applies operator--().
2452 U_FORCE_INLINE reverse_iterator operator--(int) { // post-decrement
2453 reverse_iterator result(*this);
2454 operator--();
2455 return result;
2456 }
2457
2458private:
// Code unit position this iterator logically refers to: p_, unless a backward read
// has been cached (state_ < 0), in which case it is the end of the cached units.
2459 U_FORCE_INLINE UnitIter getLogicalPosition() const {
2460 return state_ >= 0 ? p_ : units_.end();
2461 }
2462
2463 // operator*() etc. are logically const.
2464 mutable UnitIter p_;
2465 // Keep state so that we call decAndRead() only once for both operator*() and ++
2466 // to make it easy for the compiler to optimize.
2467 mutable UnsafeCodeUnits_ units_;
2468 // >0: units_ = readAndInc(), p_ = units limit
2469 // 0: initial state
2470 // <0: units_ = decAndRead(), p_ = units start
2471 // which means that p_ is behind its logical position
2472 mutable int8_t state_ = 0;
2473};
2474#endif // U_IN_DOXYGEN
2475
2476namespace U_HEADER_ONLY_NAMESPACE {
2477
// Factory function for a non-validating code point iterator.
// Deduces UnitIter from the argument and returns an UnsafeUTFIterator<CP32, UnitIter>
// constructed from iter (moved). CP32 must be specified explicitly by the caller.
2493template<typename CP32, typename UnitIter>
2494auto unsafeUTFIterator(UnitIter iter) {
2495 return UnsafeUTFIterator<CP32, UnitIter>(std::move(iter));
2496}
2497
2525template<typename CP32, typename Range>
2527 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
2528public:
2534
// Enabled only when Range is not a reference type:
// takes the code unit range by value and moves it into the unitRange member.
2540 template<typename R = Range, typename = std::enable_if_t<!std::is_reference_v<R>>>
2541 explicit UnsafeUTFStringCodePoints(Range unitRange) : unitRange(std::move(unitRange)) {}
// Enabled only when Range is a reference type: the unitRange member (itself of type Range)
// is bound to the caller's code unit range, which is neither copied nor moved.
// The extra "typename = void" parameter gives this constructor template a signature
// distinct from the by-value constructor template.
2550 template<typename R = Range, typename = std::enable_if_t<std::is_reference_v<R>>, typename = void>
2551 explicit UnsafeUTFStringCodePoints(Range unitRange) : unitRange(unitRange) {}
2552
2555
2558
// Returns a non-validating code point iterator positioned at unitRange.begin().
2563 auto begin() {
2564 return unsafeUTFIterator<CP32>(unitRange.begin());
2565 }
2566
// Const overload of begin(); participates only if const Range is itself a range
// (prv::range<const R>). Same construction as the non-const overload.
2571 template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
2572 auto begin() const {
2573 return unsafeUTFIterator<CP32>(unitRange.begin());
2574 }
2575
// Returns the end of the code point range. If begin() and end() of unitRange have
// different types, the code unit sentinel is returned as is; otherwise an
// UnsafeUTFIterator positioned at unitRange.end() is returned.
2580 auto end() {
2581 using UnitIter = decltype(unitRange.begin());
2582 using LimitIter = decltype(unitRange.end());
2583 if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
2584 // Return the code unit sentinel.
2585 return unitRange.end();
2586 } else {
2587 return unsafeUTFIterator<CP32>(unitRange.end());
2588 }
2589 }
2590
// Const overload of end(); participates only if const Range is itself a range
// (prv::range<const R>). Returns either the bare code unit sentinel or an
// UnsafeUTFIterator at unitRange.end(), chosen the same way as in the non-const overload.
2595 template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
2596 auto end() const {
2597 using UnitIter = decltype(unitRange.begin());
2598 using LimitIter = decltype(unitRange.end());
2599 if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
2600 // Return the code unit sentinel.
2601 return unitRange.end();
2602 } else {
2603 return unsafeUTFIterator<CP32>(unitRange.end());
2604 }
2605 }
2606
// Returns std::make_reverse_iterator(end()): a reverse iterator that starts at the
// end of the code point range.
// NOTE(review): presumably only meaningful when end() yields an UnsafeUTFIterator over
// bidirectional code units rather than a bare sentinel — confirm against callers.
2611 auto rbegin() const {
2612 return std::make_reverse_iterator(end());
2613 }
2614
// Returns std::make_reverse_iterator(begin()): the reverse iterator that marks the
// end of reverse iteration, located at the start of the code point range.
2619 auto rend() const {
2620 return std::make_reverse_iterator(begin());
2621 }
2622
2623private:
2624 Range unitRange;
2625};
2626
2628template<typename CP32>
2630#if U_CPLUSPLUS_VERSION >= 23 && __cpp_lib_ranges >= 2022'02 && \
2631 __cpp_lib_bind_back >= 2022'02 // http://wg21.link/P2387R3.
2632 : std::ranges::range_adaptor_closure<UnsafeUTFStringCodePointsAdaptor<CP32>>
2633#endif
2634{
2636 template<typename Range>
2637 auto operator()(Range &&unitRange) const {
2638#if defined(__cpp_lib_ranges) && __cpp_lib_ranges >= 2021'10 // We need https://wg21.link/P2415R2.
2639 return UnsafeUTFStringCodePoints<CP32, std::ranges::views::all_t<Range>>(std::forward<Range>(unitRange));
2640#else
2642 // Take basic_string_view by copy, not by reference. In C++20 this is handled by
2643 // all_t<Range>, which is Range if Range is a view.
2644 return UnsafeUTFStringCodePoints<CP32, std::decay_t<Range>>(std::forward<Range>(unitRange));
2645 } else {
2646 return UnsafeUTFStringCodePoints<CP32, Range>(std::forward<Range>(unitRange));
2647 }
2648#endif
2649 }
2650};
2651
2652
2665template<typename CP32>
2667
2668} // namespace U_HEADER_ONLY_NAMESPACE
2669
2670
2671#if defined(__cpp_lib_ranges)
2672template <typename CP32, UTFIllFormedBehavior behavior, typename Range>
2673constexpr bool std::ranges::enable_borrowed_range<
2675 std::ranges::enable_borrowed_range<Range>;
2676
2677template <typename CP32, typename Range>
2678constexpr bool std::ranges::enable_borrowed_range<
2680 std::ranges::enable_borrowed_range<Range>;
2681#endif
2682
2683#endif // U_HIDE_DRAFT_API
2684#endif // U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API
2685#endif // __UTFITERATOR_H__
Result of validating and decoding a code unit sequence for one code point.
CodeUnits & operator=(const CodeUnits &other)=default
Copy assignment operator.
CodeUnits(const CodeUnits &other)=default
Copy constructor.
CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed, UnitIter start, UnitIter limit)
Validating iterator over the code points in a Unicode string.
U_FORCE_INLINE UTFIterator()
Default constructor.
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator!=(const UTFIterator &iter, const Sentinel &s)
U_FORCE_INLINE std::enable_if_t< prv::bidirectional_iterator< Iter >, UTFIterator & > operator--()
Pre-decrement operator.
U_FORCE_INLINE Proxy operator->() const
Decodes the code unit sequence at the current position.
value_type reference
C++ iterator boilerplate.
CodeUnits< CP32, UnitIter > value_type
C++ iterator boilerplate.
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator==(const UTFIterator &iter, const Sentinel &s)
U_FORCE_INLINE UTFIterator(UTFIterator &&src) noexcept=default
Move constructor.
std::conditional_t< prv::bidirectional_iterator< UnitIter >, std::bidirectional_iterator_tag, std::forward_iterator_tag > iterator_category
C++ iterator boilerplate.
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator!=(const Sentinel &s, const UTFIterator &iter)
U_FORCE_INLINE std::enable_if_t< prv::bidirectional_iterator< Iter >, UTFIterator > operator--(int)
Post-decrement operator.
U_FORCE_INLINE UTFIterator & operator++()
Pre-increment operator.
U_FORCE_INLINE UTFIterator & operator=(UTFIterator &&src) noexcept=default
Move assignment operator.
U_FORCE_INLINE UTFIterator(UnitIter p)
Constructs an iterator start or limit sentinel.
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator==(const Sentinel &s, const UTFIterator &iter)
U_FORCE_INLINE CodeUnits< CP32, UnitIter > operator*() const
Decodes the code unit sequence at the current position.
U_FORCE_INLINE bool operator!=(const UTFIterator &other) const
Proxy pointer
C++ iterator boilerplate.
U_FORCE_INLINE UTFIterator(UnitIter start, UnitIter p, LimitIter limit)
Constructor with start <= p <= limit.
U_FORCE_INLINE UTFIterator(const UTFIterator &other)=default
Copy constructor.
U_FORCE_INLINE UTFIterator operator++(int)
Post-increment operator.
U_FORCE_INLINE UTFIterator & operator=(const UTFIterator &other)=default
Copy assignment operator.
U_FORCE_INLINE bool operator==(const UTFIterator &other) const
U_FORCE_INLINE UTFIterator(UnitIter p, LimitIter limit)
Constructor with start == p <= limit.
prv::iter_difference_t< UnitIter > difference_type
C++ iterator boilerplate.
A C++ "range" for validating iteration over all of the code points of a code unit range.
UTFStringCodePoints()=default
Constructs an empty C++ "range" object.
UTFStringCodePoints & operator=(const UTFStringCodePoints &other)=default
Copy assignment operator.
UTFStringCodePoints(Range unitRange)
Constructs a C++ "range" object over the code points in the string.
UTFStringCodePoints(const UTFStringCodePoints &other)=default
Copy constructor.
UTFStringCodePoints(Range unitRange)
Constructs a C++ "range" object over the code points in the string, keeping a reference to the code unit range.
Result of decoding a code unit sequence for one code point.
std::enable_if_t< std::is_pointer_v< Iter >||std::is_same_v< Iter, typename std::basic_string< Unit >::iterator >||std::is_same_v< Iter, typename std::basic_string< Unit >::const_iterator >||std::is_same_v< Iter, typename std::basic_string_view< Unit >::iterator >||std::is_same_v< Iter, typename std::basic_string_view< Unit >::const_iterator >, std::basic_string_view< Unit > > stringView() const
UnsafeCodeUnits & operator=(const UnsafeCodeUnits &other)=default
Copy assignment operator.
UnsafeCodeUnits(CP32 codePoint, uint8_t length, UnitIter start, UnitIter limit)
UnsafeCodeUnits(const UnsafeCodeUnits &other)=default
Copy constructor.
Non-validating iterator over the code points in a Unicode string.
U_FORCE_INLINE UnsafeUTFIterator()
Default constructor.
UnsafeCodeUnits< CP32, UnitIter > value_type
C++ iterator boilerplate.
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UnsafeUTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator!=(const UnsafeUTFIterator &iter, const Sentinel &s)
U_FORCE_INLINE bool operator!=(const UnsafeUTFIterator &other) const
U_FORCE_INLINE std::enable_if_t< prv::bidirectional_iterator< Iter >, UnsafeUTFIterator > operator--(int)
Post-decrement operator.
value_type reference
C++ iterator boilerplate.
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UnsafeUTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator==(const Sentinel &s, const UnsafeUTFIterator &iter)
U_FORCE_INLINE UnsafeUTFIterator & operator++()
Pre-increment operator.
Proxy pointer
C++ iterator boilerplate.
prv::iter_difference_t< UnitIter > difference_type
C++ iterator boilerplate.
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UnsafeUTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator!=(const Sentinel &s, const UnsafeUTFIterator &iter)
std::conditional_t< prv::bidirectional_iterator< UnitIter >, std::bidirectional_iterator_tag, std::forward_iterator_tag > iterator_category
C++ iterator boilerplate.
U_FORCE_INLINE Proxy operator->() const
Decodes the code unit sequence at the current position.
U_FORCE_INLINE UnsafeUTFIterator(UnsafeUTFIterator &&src) noexcept=default
Move constructor.
U_FORCE_INLINE UnsafeUTFIterator operator++(int)
Post-increment operator.
U_FORCE_INLINE UnsafeUTFIterator & operator=(UnsafeUTFIterator &&src) noexcept=default
Move assignment operator.
U_FORCE_INLINE UnsafeCodeUnits< CP32, UnitIter > operator*() const
Decodes the code unit sequence at the current position.
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UnsafeUTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator==(const UnsafeUTFIterator &iter, const Sentinel &s)
U_FORCE_INLINE UnsafeUTFIterator(const UnsafeUTFIterator &other)=default
Copy constructor.
U_FORCE_INLINE UnsafeUTFIterator & operator=(const UnsafeUTFIterator &other)=default
Copy assignment operator.
U_FORCE_INLINE UnsafeUTFIterator(UnitIter p)
Constructor; the iterator/pointer should be at a code point boundary.
U_FORCE_INLINE std::enable_if_t< prv::bidirectional_iterator< Iter >, UnsafeUTFIterator & > operator--()
Pre-decrement operator.
U_FORCE_INLINE bool operator==(const UnsafeUTFIterator &other) const
A C++ "range" for non-validating iteration over all of the code points of a code unit range.
UnsafeUTFStringCodePoints(Range unitRange)
Constructs a C++ "range" object over the code points in the string, keeping a reference to the code unit range.
UnsafeUTFStringCodePoints(const UnsafeUTFStringCodePoints &other)=default
Copy constructor.
UnsafeUTFStringCodePoints()=default
Constructs an empty C++ "range" object.
UnsafeUTFStringCodePoints & operator=(const UnsafeUTFStringCodePoints &other)=default
Copy assignment operator.
UnsafeUTFStringCodePoints(Range unitRange)
Constructs a C++ "range" object over the code points in the string.
int32_t difference_type
C++ iterator boilerplate.
bool operator==(const CodePointsIterator &other) const
bool operator!=(const CodePointsIterator &other) const
value_type reference
C++ iterator boilerplate.
std::forward_iterator_tag iterator_category
C++ iterator boilerplate.
CP32 * pointer
C++ iterator boilerplate.
U_COMMON_API UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
bool operator!=(const StringPiece &x, const StringPiece &y)
Global operator != for StringPiece.
#define U_CPLUSPLUS_VERSION
0 if no C++; 1, 11, 14, ... if C++.
Definition platform.h:464
#define U_SENTINEL
This value is intended for sentinel values for APIs that (take or) return single code points (UChar32).
Definition umachine.h:469
#define U_FORCE_INLINE
Forces function inlining on compilers that are known to support it.
Definition umachine.h:135
C API: 16-bit Unicode handling macros.
#define U16_IS_SURROGATE_TRAIL(c)
Assuming c is a surrogate code point (U16_IS_SURROGATE(c)), is it a trail surrogate?
Definition utf16.h:93
#define U16_IS_SURROGATE_LEAD(c)
Assuming c is a surrogate code point (U16_IS_SURROGATE(c)), is it a lead surrogate?
Definition utf16.h:84
#define U16_GET_SUPPLEMENTARY(lead, trail)
Get a supplementary code point value (U+10000..U+10ffff) from its lead and trail surrogates.
Definition utf16.h:112
#define U16_IS_SURROGATE(c)
Is this code unit a surrogate (U+d800..U+dfff)?
Definition utf16.h:75
#define U16_IS_LEAD(c)
Is this code unit a lead surrogate (U+d800..U+dbff)?
Definition utf16.h:59
#define U16_IS_TRAIL(c)
Is this code unit a trail surrogate (U+dc00..U+dfff)?
Definition utf16.h:67
C API: 8-bit Unicode handling macros.
#define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte)
Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence.
Definition utf8.h:71
#define U8_IS_VALID_LEAD3_AND_T1(lead, t1)
Internal 3-byte UTF-8 validity check.
Definition utf8.h:98
#define U8_IS_VALID_LEAD4_AND_T1(lead, t1)
Internal 4-byte UTF-8 validity check.
Definition utf8.h:115
#define U8_IS_SINGLE(c)
Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
Definition utf8.h:173
#define U8_LEAD3_T1_BITS
Internal bit vector for 3-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD3_AND_T1.
Definition utf8.h:91
#define U8_LEAD4_T1_BITS
Internal bit vector for 4-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD4_AND_T1.
Definition utf8.h:108
#define U8_IS_LEAD(c)
Is this code unit (byte) a UTF-8 lead byte?
Definition utf8.h:181
#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes)
Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.
Definition utf8.h:81
#define U8_IS_TRAIL(c)
Is this code unit (byte) a UTF-8 trail byte?
Definition utf8.h:190
auto unsafeUTFIterator(UnitIter iter)
UnsafeUTFIterator factory function.
typename std::iterator_traits< Iter >::value_type iter_value_t
constexpr bool is_basic_string_view_v
constexpr bool forward_iterator
auto utfIterator(UnitIter start, UnitIter p, LimitIter limit)
UTFIterator factory function for start <= p < limit.
constexpr UTFStringCodePointsAdaptor< CP32, behavior > utfStringCodePoints
Range adaptor function object returning a UTFStringCodePoints object that represents a "range" of code points in a code unit range.
typename std::iterator_traits< Iter >::difference_type iter_difference_t
constexpr bool bidirectional_iterator
constexpr UnsafeUTFStringCodePointsAdaptor< CP32 > unsafeUTFStringCodePoints
Range adaptor function object returning an UnsafeUTFStringCodePoints object that represents a "range" of code points in a code unit range.
UTFIllFormedBehavior
Some defined behaviors for handling ill-formed Unicode strings.
@ UTF_BEHAVIOR_FFFD
Returns U+FFFD Replacement Character.
@ UTF_BEHAVIOR_SURROGATE
UTF-8: Not allowed; UTF-16: returns the unpaired surrogate; UTF-32: returns the surrogate code point, or U+FFFD if out of range.
@ UTF_BEHAVIOR_NEGATIVE
Returns a negative value (-1=U_SENTINEL) instead of a code point.
Basic definitions for ICU, for both C and C++ APIs.
C API: API for accessing ICU version numbers.