ICU 78.3 78.3
Loading...
Searching...
No Matches
utfstring.h
Go to the documentation of this file.
1// © 2025 and later: Unicode, Inc. and others.
2// License & terms of use: https://www.unicode.org/copyright.html
3
4// utfstring.h
5// created: 2025jul18 Markus W. Scherer
6
7#ifndef __UTFSTRING_H__
8#define __UTFSTRING_H__
9
10#include "unicode/utypes.h"
11
12#if U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API || !defined(UTYPES_H)
13
14#include "unicode/utf16.h"
15
20
21#ifndef U_HIDE_DRAFT_API
22
23namespace U_HEADER_ONLY_NAMESPACE {
24namespace utfstring {
25
26// Write code points to strings -------------------------------------------- ***
27
28#ifndef U_IN_DOXYGEN
29namespace prv {
30
31// This function, and the public wrappers,
32// want to be U_FORCE_INLINE but the gcc-debug-build-and-test CI check failed with
33// error: ‘always_inline’ function might not be inlinable [-Werror=attributes]
34template<typename StringClass, bool validate>
35inline StringClass &appendCodePoint(StringClass &s, uint32_t c) {
36 using Unit = typename StringClass::value_type;
37 if constexpr (sizeof(Unit) == 1) {
38 // UTF-8: Similar to U8_APPEND().
39 if (c <= 0x7f) {
40 s.push_back(static_cast<Unit>(c));
41 } else {
42 Unit buf[4];
43 uint8_t len;
44 if (c <= 0x7ff) {
45 len = 2;
46 buf[2] = (c >> 6) | 0xc0;
47 } else {
48 if (validate ?
49 c < 0xd800 ||
50 (c < 0xe000 || c > 0x10ffff ? (c = 0xfffd, true) : c <= 0xffff) :
51 c <= 0xffff) {
52 len = 3;
53 buf[1] = (c >> 12) | 0xe0;
54 } else {
55 len = 4;
56 buf[0] = (c >> 18) | 0xf0;
57 buf[1] = ((c >> 12) & 0x3f) | 0x80;
58 }
59 buf[2] = ((c >> 6) & 0x3f) | 0x80;
60 }
61 buf[3] = (c & 0x3f) | 0x80;
62 s.append(buf + 4 - len, len);
63 }
64 } else if constexpr (sizeof(Unit) == 2) {
65 // UTF-16: Similar to U16_APPEND().
66 if (validate ?
67 c < 0xd800 || (c < 0xe000 || c > 0x10ffff ? (c = 0xfffd, true) : c <= 0xffff) :
68 c <= 0xffff) {
69 s.push_back(static_cast<Unit>(c));
70 } else {
71 Unit buf[2] = { U16_LEAD(c), U16_TRAIL(c) };
72 s.append(buf, 2);
73 }
74 } else {
75 // UTF-32
76 s.push_back(!validate || U_IS_SCALAR_VALUE(c) ? c : 0xfffd);
77 }
78 return s;
79}
80
81} // namespace prv
82#endif // U_IN_DOXYGEN
83
84#ifndef U_HIDE_DRAFT_API
97template<typename StringClass>
98inline StringClass &appendOrFFFD(StringClass &s, UChar32 c) {
99 return prv::appendCodePoint<StringClass, true>(s, c);
100}
101
114template<typename StringClass>
115inline StringClass &appendUnsafe(StringClass &s, UChar32 c) {
116 return prv::appendCodePoint<StringClass, false>(s, c);
117}
118
130template<typename StringClass>
131inline StringClass encodeOrFFFD(UChar32 c) {
132 StringClass s;
133 prv::appendCodePoint<StringClass, true>(s, c);
134 return s;
135}
136
148template<typename StringClass>
149inline StringClass encodeUnsafe(UChar32 c) {
150 StringClass s;
151 prv::appendCodePoint<StringClass, false>(s, c);
152 return s;
153}
154#endif // U_HIDE_DRAFT_API
155
156} // namespace utfstring
157} // namespace U_HEADER_ONLY_NAMESPACE
158
159#endif // U_HIDE_DRAFT_API
160#endif // U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API
161#endif // __UTFSTRING_H__
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition umachine.h:449
C API: 16-bit Unicode handling macros.
#define U16_TRAIL(supplementary)
Get the trail surrogate (0xdc00..0xdfff) for a supplementary code point (0x10000..0x10ffff).
Definition utf16.h:132
#define U16_LEAD(supplementary)
Get the lead surrogate (0xd800..0xdbff) for a supplementary code point (0x10000..0x10ffff).
Definition utf16.h:123
#define U_IS_SCALAR_VALUE(c)
Is c a Unicode scalar value, that is, a non-surrogate code point?
Definition utf.h:149
StringClass encodeUnsafe(UChar32 c)
Returns the code point as a string of code units.
Definition utfstring.h:149
StringClass & appendOrFFFD(StringClass &s, UChar32 c)
Appends the code point to the string.
Definition utfstring.h:98
StringClass & appendUnsafe(StringClass &s, UChar32 c)
Appends the code point to the string.
Definition utfstring.h:115
StringClass encodeOrFFFD(UChar32 c)
Returns the code point as a string of code units.
Definition utfstring.h:131
Basic definitions for ICU, for both C and C++ APIs.