ICU 78.3
Loading...
Searching...
No Matches
utfiterator.h File Reference

C++ header-only API: C++ iterators over Unicode strings (=UTF-8/16/32 if well-formed). More...

#include "unicode/utypes.h"
#include <iterator>
#include <string>
#include <string_view>
#include <type_traits>
#include "unicode/utf16.h"
#include "unicode/utf8.h"
#include "unicode/uversion.h"

Go to the source code of this file.

Data Structures

struct  U_HEADER_ONLY_NAMESPACE::prv::range_type< Range, typename >
struct  U_HEADER_ONLY_NAMESPACE::prv::range_type< Range, std::void_t< decltype(std::declval< Range >().begin()), decltype(std::declval< Range >().end())> >
struct  U_HEADER_ONLY_NAMESPACE::prv::is_basic_string_view< T >
struct  U_HEADER_ONLY_NAMESPACE::prv::is_basic_string_view< std::basic_string_view< Args... > >
class  U_HEADER_ONLY_NAMESPACE::prv::CodePointsIterator< CP32, skipSurrogates >
class  U_HEADER_ONLY_NAMESPACE::AllCodePoints< CP32 >
 A C++ "range" over all Unicode code points U+0000..U+10FFFF. More...
class  U_HEADER_ONLY_NAMESPACE::AllScalarValues< CP32 >
 A C++ "range" over all Unicode scalar values U+0000..U+D7FF & U+E000..U+10FFFF. More...
class  U_HEADER_ONLY_NAMESPACE::UnsafeCodeUnits< CP32, UnitIter, typename >
 Result of decoding a code unit sequence for one code point. More...
class  U_HEADER_ONLY_NAMESPACE::CodeUnits< CP32, UnitIter, typename >
 Result of validating and decoding a code unit sequence for one code point. More...
class  U_HEADER_ONLY_NAMESPACE::UTFIterator< CP32, behavior, UnitIter, LimitIter, typename >
 Validating iterator over the code points in a Unicode string. More...
class  U_HEADER_ONLY_NAMESPACE::UTFStringCodePoints< CP32, behavior, Range >
 A C++ "range" for validating iteration over all of the code points of a code unit range. More...
struct  U_HEADER_ONLY_NAMESPACE::UTFStringCodePointsAdaptor< CP32, behavior >
class  U_HEADER_ONLY_NAMESPACE::UnsafeUTFIterator< CP32, UnitIter, typename >
 Non-validating iterator over the code points in a Unicode string. More...
class  U_HEADER_ONLY_NAMESPACE::UnsafeUTFStringCodePoints< CP32, Range >
 A C++ "range" for non-validating iteration over all of the code points of a code unit range. More...
struct  U_HEADER_ONLY_NAMESPACE::UnsafeUTFStringCodePointsAdaptor< CP32 >

Typedefs

typedef enum UTFIllFormedBehavior UTFIllFormedBehavior
 Some defined behaviors for handling ill-formed Unicode strings.
template<typename Iter>
using U_HEADER_ONLY_NAMESPACE::prv::iter_value_t = typename std::iterator_traits<Iter>::value_type
template<typename Iter>
using U_HEADER_ONLY_NAMESPACE::prv::iter_difference_t = typename std::iterator_traits<Iter>::difference_type

Enumerations

enum  UTFIllFormedBehavior { UTF_BEHAVIOR_NEGATIVE , UTF_BEHAVIOR_FFFD , UTF_BEHAVIOR_SURROGATE }
 Some defined behaviors for handling ill-formed Unicode strings. More...

Functions

template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter = UnitIter>
auto U_HEADER_ONLY_NAMESPACE::utfIterator (UnitIter start, UnitIter p, LimitIter limit)
 UTFIterator factory function for start <= p < limit.
template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter = UnitIter>
auto U_HEADER_ONLY_NAMESPACE::utfIterator (UnitIter p, LimitIter limit)
 UTFIterator factory function for start = p < limit.
template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter>
auto U_HEADER_ONLY_NAMESPACE::utfIterator (UnitIter p)
 UTFIterator factory function for a start or limit sentinel.
template<typename CP32, typename UnitIter>
auto U_HEADER_ONLY_NAMESPACE::unsafeUTFIterator (UnitIter iter)
 UnsafeUTFIterator factory function.

Variables

template<typename Iter>
constexpr bool U_HEADER_ONLY_NAMESPACE::prv::forward_iterator
template<typename Iter>
constexpr bool U_HEADER_ONLY_NAMESPACE::prv::bidirectional_iterator
template<typename Range>
constexpr bool U_HEADER_ONLY_NAMESPACE::prv::range = range_type<Range>::value
template<typename T>
constexpr bool U_HEADER_ONLY_NAMESPACE::prv::is_basic_string_view_v = is_basic_string_view<T>::value
template<typename CP32, UTFIllFormedBehavior behavior>
constexpr UTFStringCodePointsAdaptor< CP32, behavior > U_HEADER_ONLY_NAMESPACE::utfStringCodePoints
 Range adaptor function object returning a UTFStringCodePoints object that represents a "range" of code points in a code unit range, which validates while decoding.
template<typename CP32>
constexpr UnsafeUTFStringCodePointsAdaptor< CP32 > U_HEADER_ONLY_NAMESPACE::unsafeUTFStringCodePoints
 Range adaptor function object returning an UnsafeUTFStringCodePoints object that represents a "range" of code points in a code unit range.

Detailed Description

C++ header-only API: C++ iterators over Unicode strings (=UTF-8/16/32 if well-formed).

Sample code:

#include <string_view>
#include <iostream>
#include "unicode/utypes.h"
#include "unicode/utfiterator.h"
using icu::header::utfIterator;
using icu::header::utfStringCodePoints;
using icu::header::unsafeUTFIterator;
using icu::header::unsafeUTFStringCodePoints;
int32_t rangeLoop16(std::u16string_view s) {
// We are just adding up the code points for minimal-code demonstration purposes.
int32_t sum = 0;
for (auto units : utfStringCodePoints<UChar32, UTF_BEHAVIOR_NEGATIVE>(s)) {
sum += units.codePoint(); // < 0 if ill-formed
}
return sum;
}
int32_t loopIterPlusPlus16(std::u16string_view s) {
auto range = utfStringCodePoints<char32_t, UTF_BEHAVIOR_FFFD>(s);
int32_t sum = 0;
for (auto iter = range.begin(), limit = range.end(); iter != limit;) {
sum += (*iter++).codePoint(); // U+FFFD if ill-formed
}
return sum;
}
int32_t backwardLoop16(std::u16string_view s) {
auto range = utfStringCodePoints<UChar32, UTF_BEHAVIOR_SURROGATE>(s);
int32_t sum = 0;
for (auto start = range.begin(), iter = range.end(); start != iter;) {
sum += (*--iter).codePoint(); // surrogate code point if unpaired / ill-formed
}
return sum;
}
int32_t reverseLoop8(std::string_view s) {
auto range = utfStringCodePoints<char32_t, UTF_BEHAVIOR_FFFD>(s);
int32_t sum = 0;
for (auto iter = range.rbegin(), limit = range.rend(); iter != limit; ++iter) {
sum += iter->codePoint(); // U+FFFD if ill-formed
}
return sum;
}
int32_t countCodePoints16(std::u16string_view s) {
auto range = utfStringCodePoints<UChar32, UTF_BEHAVIOR_SURROGATE>(s);
return std::distance(range.begin(), range.end());
}
int32_t unsafeRangeLoop16(std::u16string_view s) {
int32_t sum = 0;
for (auto units : unsafeUTFStringCodePoints<UChar32>(s)) {
sum += units.codePoint();
}
return sum;
}
int32_t unsafeReverseLoop8(std::string_view s) {
auto range = unsafeUTFStringCodePoints<UChar32>(s);
int32_t sum = 0;
for (auto iter = range.rbegin(), limit = range.rend(); iter != limit; ++iter) {
sum += iter->codePoint();
}
return sum;
}
char32_t firstCodePointOrFFFD16(std::u16string_view s) {
if (s.empty()) { return 0xfffd; }
auto range = utfStringCodePoints<char32_t, UTF_BEHAVIOR_FFFD>(s);
return range.begin()->codePoint();
}
std::string_view firstSequence8(std::string_view s) {
if (s.empty()) { return {}; }
auto range = utfStringCodePoints<char32_t, UTF_BEHAVIOR_FFFD>(s);
auto units = *(range.begin());
if (units.wellFormed()) {
return units.stringView();
} else {
return {};
}
}
template<typename InputStream> // some istream or streambuf
std::u32string cpFromInput(InputStream &in) {
// This is a single-pass input_iterator.
std::istreambuf_iterator bufIter(in);
std::istreambuf_iterator<typename InputStream::char_type> bufLimit;
auto iter = utfIterator<char32_t, UTF_BEHAVIOR_FFFD>(bufIter);
auto limit = utfIterator<char32_t, UTF_BEHAVIOR_FFFD>(bufLimit);
std::u32string s32;
for (; iter != limit; ++iter) {
s32.push_back(iter->codePoint());
}
return s32;
}
std::u32string cpFromStdin() { return cpFromInput(std::cin); }
std::u32string cpFromWideStdin() { return cpFromInput(std::wcin); }
utfiterator.h: C++ header-only API: C++ iterators over Unicode strings (=UTF-8/16/32 if well-formed).
utypes.h: Basic definitions for ICU, for both C and C++ APIs.

Definition in file utfiterator.h.

Typedef Documentation

◆ iter_difference_t

template<typename Iter>
using U_HEADER_ONLY_NAMESPACE::prv::iter_difference_t = typename std::iterator_traits<Iter>::difference_type
Internal
Do not use. This API is for internal use only.

Definition at line 203 of file utfiterator.h.

◆ iter_value_t

template<typename Iter>
using U_HEADER_ONLY_NAMESPACE::prv::iter_value_t = typename std::iterator_traits<Iter>::value_type
Internal
Do not use. This API is for internal use only.

Definition at line 199 of file utfiterator.h.

◆ UTFIllFormedBehavior

Some defined behaviors for handling ill-formed Unicode strings.

This is a template parameter for UTFIterator and related classes.

When a validating UTFIterator encounters an ill-formed code unit sequence, CodeUnits.codePoint() returns a value determined by this parameter.

Draft
This API may be changed in the future versions and was introduced in ICU 78
See also
CodeUnits
UTFIterator
UTFStringCodePoints

Enumeration Type Documentation

◆ UTFIllFormedBehavior

Some defined behaviors for handling ill-formed Unicode strings.

This is a template parameter for UTFIterator and related classes.

When a validating UTFIterator encounters an ill-formed code unit sequence, CodeUnits.codePoint() returns a value determined by this parameter.

Draft
This API may be changed in the future versions and was introduced in ICU 78
See also
CodeUnits
UTFIterator
UTFStringCodePoints
Enumerator
UTF_BEHAVIOR_NEGATIVE 

Returns a negative value (-1=U_SENTINEL) instead of a code point.

If the CP32 template parameter for the relevant classes is an unsigned type, then the negative value becomes 0xffffffff=UINT32_MAX.

Draft
This API may be changed in the future versions and was introduced in ICU 78
UTF_BEHAVIOR_FFFD 

Returns U+FFFD Replacement Character.

Draft
This API may be changed in the future versions and was introduced in ICU 78
UTF_BEHAVIOR_SURROGATE 

UTF-8: Not allowed; UTF-16: returns the unpaired surrogate; UTF-32: returns the surrogate code point, or U+FFFD if out of range.

Draft
This API may be changed in the future versions and was introduced in ICU 78

Definition at line 149 of file utfiterator.h.

Function Documentation

◆ unsafeUTFIterator()

template<typename CP32, typename UnitIter>
auto U_HEADER_ONLY_NAMESPACE::unsafeUTFIterator ( UnitIter iter)

UnsafeUTFIterator factory function.

Deduces the UnitIter template parameter from the input.

Template Parameters
CP32 — Code point type: UChar32 (=int32_t) or char32_t or uint32_t
UnitIter — Can usually be omitted/deduced: An iterator (often a pointer) that returns a code unit type: UTF-8: char or char8_t or uint8_t; UTF-16: char16_t or uint16_t or (on Windows) wchar_t; UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t
Parameters
iter — code unit iterator
Returns
an UnsafeUTFIterator<CP32, UnitIter> for the given code unit iterator or character pointer
Draft
This API may be changed in the future versions and was introduced in ICU 78

Definition at line 2494 of file utfiterator.h.

◆ utfIterator() [1/3]

template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter>
auto U_HEADER_ONLY_NAMESPACE::utfIterator ( UnitIter p)

UTFIterator factory function for a start or limit sentinel.

Deduces the UnitIter template parameter from the input. Requires UnitIter to be copyable.

Template Parameters
CP32 — Code point type: UChar32 (=int32_t) or char32_t or uint32_t
behavior — How to handle ill-formed Unicode strings
UnitIter — Can usually be omitted/deduced: An iterator (often a pointer) that returns a code unit type: UTF-8: char or char8_t or uint8_t; UTF-16: char16_t or uint16_t or (on Windows) wchar_t; UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t
Parameters
p — code unit iterator. When using a code unit sentinel, then that sentinel also works as a sentinel for the code point iterator.
Returns
a UTFIterator<CP32, behavior, UnitIter> for the given code unit iterator or character pointer
Draft
This API may be changed in the future versions and was introduced in ICU 78

Definition at line 1753 of file utfiterator.h.

◆ utfIterator() [2/3]

template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter = UnitIter>
auto U_HEADER_ONLY_NAMESPACE::utfIterator ( UnitIter p,
LimitIter limit )

UTFIterator factory function for start = p < limit.

Deduces the UnitIter and LimitIter template parameters from the inputs.

Template Parameters
CP32 — Code point type: UChar32 (=int32_t) or char32_t or uint32_t
behavior — How to handle ill-formed Unicode strings
UnitIter — Can usually be omitted/deduced: An iterator (often a pointer) that returns a code unit type: UTF-8: char or char8_t or uint8_t; UTF-16: char16_t or uint16_t or (on Windows) wchar_t; UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t
LimitIter — Either the same as UnitIter, or an iterator sentinel type.
Parameters
p — start and current-position code unit iterator
limit — limit (exclusive-end) code unit iterator. When using a code unit sentinel (UnitIter≠LimitIter), then that sentinel also works as a sentinel for the code point iterator.
Returns
a UTFIterator<CP32, behavior, UnitIter> for the given code unit iterators or character pointers
Draft
This API may be changed in the future versions and was introduced in ICU 78

Definition at line 1723 of file utfiterator.h.

◆ utfIterator() [3/3]

template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter = UnitIter>
auto U_HEADER_ONLY_NAMESPACE::utfIterator ( UnitIter start,
UnitIter p,
LimitIter limit )

UTFIterator factory function for start <= p < limit.

Deduces the UnitIter and LimitIter template parameters from the inputs. Only enabled if UnitIter is a (multi-pass) forward_iterator or better.

Template Parameters
CP32 — Code point type: UChar32 (=int32_t) or char32_t or uint32_t
behavior — How to handle ill-formed Unicode strings
UnitIter — Can usually be omitted/deduced: An iterator (often a pointer) that returns a code unit type: UTF-8: char or char8_t or uint8_t; UTF-16: char16_t or uint16_t or (on Windows) wchar_t; UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t
LimitIter — Either the same as UnitIter, or an iterator sentinel type.
Parameters
start — start code unit iterator
p — current-position code unit iterator
limit — limit (exclusive-end) code unit iterator. When using a code unit sentinel (UnitIter≠LimitIter), then that sentinel also works as a sentinel for the code point iterator.
Returns
a UTFIterator<CP32, behavior, UnitIter> for the given code unit iterators or character pointers
Draft
This API may be changed in the future versions and was introduced in ICU 78

Definition at line 1696 of file utfiterator.h.

Variable Documentation

◆ bidirectional_iterator

template<typename Iter>
bool U_HEADER_ONLY_NAMESPACE::prv::bidirectional_iterator
constexpr
Initial value:
=
std::is_base_of_v<
std::bidirectional_iterator_tag,
typename std::iterator_traits<Iter>::iterator_category>
Internal
Do not use. This API is for internal use only.

Definition at line 214 of file utfiterator.h.

◆ forward_iterator

template<typename Iter>
bool U_HEADER_ONLY_NAMESPACE::prv::forward_iterator
constexpr
Initial value:
=
std::is_base_of_v<
std::forward_iterator_tag,
typename std::iterator_traits<Iter>::iterator_category>
Internal
Do not use. This API is for internal use only.

Definition at line 207 of file utfiterator.h.

◆ is_basic_string_view_v

template<typename T>
bool U_HEADER_ONLY_NAMESPACE::prv::is_basic_string_view_v = is_basic_string_view<T>::value
constexpr
Internal
Do not use. This API is for internal use only.

Definition at line 244 of file utfiterator.h.

◆ range

template<typename Range>
bool U_HEADER_ONLY_NAMESPACE::prv::range = range_type<Range>::value
constexpr
Internal
Do not use. This API is for internal use only.

Definition at line 232 of file utfiterator.h.

◆ unsafeUTFStringCodePoints

template<typename CP32>
UnsafeUTFStringCodePointsAdaptor<CP32> U_HEADER_ONLY_NAMESPACE::unsafeUTFStringCodePoints
constexpr

Range adaptor function object returning an UnsafeUTFStringCodePoints object that represents a "range" of code points in a code unit range.

The string must be well-formed. Deduces the Range template parameter from the input, taking into account the value category: the code units will be referenced if possible, and moved if necessary.

Template Parameters
CP32 — Code point type: UChar32 (=int32_t) or char32_t or uint32_t
Range — A C++ "range" of Unicode UTF-8/16/32 code units
Parameters
unitRange — input range
Returns
an UnsafeUTFStringCodePoints<CP32, Range> for the given unitRange
Draft
This API may be changed in the future versions and was introduced in ICU 78

Definition at line 2666 of file utfiterator.h.

◆ utfStringCodePoints

template<typename CP32, UTFIllFormedBehavior behavior>
UTFStringCodePointsAdaptor<CP32, behavior> U_HEADER_ONLY_NAMESPACE::utfStringCodePoints
constexpr

Range adaptor function object returning a UTFStringCodePoints object that represents a "range" of code points in a code unit range, which validates while decoding.

Deduces the Range template parameter from the input, taking into account the value category: the code units will be referenced if possible, and moved if necessary.

Template Parameters
CP32 — Code point type: UChar32 (=int32_t) or char32_t or uint32_t; should be signed if UTF_BEHAVIOR_NEGATIVE
behavior — How to handle ill-formed Unicode strings
Range — A C++ "range" of Unicode UTF-8/16/32 code units
Parameters
unitRange — input range
Returns
a UTFStringCodePoints<CP32, behavior, Range> for the given unitRange
Draft
This API may be changed in the future versions and was introduced in ICU 78

Definition at line 1934 of file utfiterator.h.