Plasma Engine  2.0
Loading...
Searching...
No Matches
UnicodeUtils_inl.h
1#pragma once
2
/*
You can classify bytes in a UTF-8 stream as follows:
  With the high bit set to 0, it's a single-byte value.
  With the two high bits set to 10, it's a continuation byte (the second, third or fourth byte in a UTF-8 multi-byte sequence).
  Otherwise, it's the first byte of a multi-byte sequence, and the number of leading 1 bits indicates how many bytes there are in total
  for this sequence (110... means two bytes, 1110... means three bytes, etc.).
*/
10
11PL_ALWAYS_INLINE bool plUnicodeUtils::IsUtf8StartByte(char iByte)
12{
13 // valid utf8 start bytes are 0x0-------, 0x110-----, 0x1110----, 0x11110---, etc
14 return ((iByte & 0x80) == 0) || ((iByte & 0xE0) == 0xC0) || ((iByte & 0xF0) == 0xE0) || ((iByte & 0xF8) == 0xF0) || ((iByte & 0xFC) == 0xF8);
15}
16
17PL_ALWAYS_INLINE bool plUnicodeUtils::IsUtf8ContinuationByte(char iByte)
18{
19 // check whether the two upper bits are set to '10'
20 return (iByte & 0xC0) == 0x80;
21}
22
23PL_ALWAYS_INLINE bool plUnicodeUtils::IsASCII(plUInt32 uiChar)
24{
25 return (uiChar <= 127);
26}
27
28inline plUInt32 plUnicodeUtils::GetUtf8SequenceLength(char iFirstByte)
29{
30 const plUInt32 uiBit7 = iFirstByte & PL_BIT(7);
31 const plUInt32 uiBit6 = iFirstByte & PL_BIT(6);
32 const plUInt32 uiBit5 = iFirstByte & PL_BIT(5);
33 const plUInt32 uiBit4 = iFirstByte & PL_BIT(4);
34
35 if (uiBit7 == 0) // ASCII character '0xxxxxxx'
36 return 1;
37
38 PL_IGNORE_UNUSED(uiBit6);
39 PL_ASSERT_DEV(uiBit6 != 0, "Invalid Leading UTF-8 Byte.");
40
41 if (uiBit5 == 0) // '110xxxxx'
42 return 2;
43 if (uiBit4 == 0) // '1110xxxx'
44 return 3;
45
46 // '1111xxxx'
47 return 4;
48}
49
50template <typename ByteIterator>
51plUInt32 plUnicodeUtils::DecodeUtf8ToUtf32(ByteIterator& ref_szUtf8Iterator)
52{
53 return utf8::unchecked::next(ref_szUtf8Iterator);
54}
55
56template <typename UInt16Iterator>
57bool plUnicodeUtils::IsUtf16Surrogate(UInt16Iterator& ref_szUtf16Iterator)
58{
59 uint32_t cp = utf8::internal::mask16(*ref_szUtf16Iterator);
60 return utf8::internal::is_lead_surrogate(cp);
61}
62
63template <typename UInt16Iterator>
64plUInt32 plUnicodeUtils::DecodeUtf16ToUtf32(UInt16Iterator& ref_szUtf16Iterator)
65{
66 uint32_t cp = utf8::internal::mask16(*ref_szUtf16Iterator++);
67 if (utf8::internal::is_lead_surrogate(cp))
68 {
69 uint32_t trail_surrogate = utf8::internal::mask16(*ref_szUtf16Iterator++);
70 cp = (cp << 10) + trail_surrogate + utf8::internal::SURROGATE_OFFSET;
71 }
72
73 return cp;
74}
75
76template <typename WCharIterator>
77plUInt32 plUnicodeUtils::DecodeWCharToUtf32(WCharIterator& ref_szWCharIterator)
78{
79 if constexpr (sizeof(wchar_t) == 2)
80 {
81 return DecodeUtf16ToUtf32(ref_szWCharIterator);
82 }
83 else // sizeof(wchar_t) == 4
84 {
85 const plUInt32 uiResult = *ref_szWCharIterator;
86 ++ref_szWCharIterator;
87 return uiResult;
88 }
89}
90
91template <typename ByteIterator>
92void plUnicodeUtils::EncodeUtf32ToUtf8(plUInt32 uiUtf32, ByteIterator& ref_szUtf8Output)
93{
94 ref_szUtf8Output = utf8::unchecked::utf32to8(&uiUtf32, &uiUtf32 + 1, ref_szUtf8Output);
95}
96
97template <typename UInt16Iterator>
98void plUnicodeUtils::EncodeUtf32ToUtf16(plUInt32 uiUtf32, UInt16Iterator& ref_szUtf16Output)
99{
100 if (uiUtf32 > 0xffff)
101 {
102 // make a surrogate pair
103 *ref_szUtf16Output++ = static_cast<uint16_t>((uiUtf32 >> 10) + utf8::internal::LEAD_OFFSET);
104 *ref_szUtf16Output++ = static_cast<uint16_t>((uiUtf32 & 0x3ff) + utf8::internal::TRAIL_SURROGATE_MIN);
105 }
106 else
107 *ref_szUtf16Output++ = static_cast<uint16_t>(uiUtf32);
108}
109
110template <typename WCharIterator>
111void plUnicodeUtils::EncodeUtf32ToWChar(plUInt32 uiUtf32, WCharIterator& ref_szWCharOutput)
112{
113 if constexpr (sizeof(wchar_t) == 2)
114 {
115 EncodeUtf32ToUtf16(uiUtf32, ref_szWCharOutput);
116 }
117 else
118 {
119 *ref_szWCharOutput = static_cast<wchar_t>(uiUtf32);
120 ++ref_szWCharOutput;
121 }
122}
123
124inline plUInt32 plUnicodeUtils::ConvertUtf8ToUtf32(const char* pFirstChar)
125{
126 return utf8::unchecked::peek_next(pFirstChar);
127}
128
129inline plUInt32 plUnicodeUtils::GetSizeForCharacterInUtf8(plUInt32 uiCharacter)
130{
131 // Basically implements this: http://en.wikipedia.org/wiki/Utf8#Description
132
133 if (uiCharacter <= 0x0000007f)
134 return 1;
135
136 if (uiCharacter <= 0x000007ff)
137 return 2;
138
139 if (uiCharacter <= 0x0000ffff)
140 return 3;
141
142 // UTF-8 can use up to 6 bytes to encode a code point
143 // however some committee agreed that never more than 4 bytes are used (no need for more than 21 Bits)
144 // this implementation assumes in several places, that the UTF-8 encoding never uses more than 4 bytes
145
146 PL_ASSERT_DEV(uiCharacter <= 0x0010ffff, "Invalid Unicode Codepoint");
147 return 4;
148}
149
150PL_ALWAYS_INLINE bool plUnicodeUtils::IsValidUtf8(const char* szString, const char* szStringEnd)
151{
152#if PL_ENABLED(PL_USE_STRING_VALIDATION)
153 if (szStringEnd == GetMaxStringEnd<char>())
154 szStringEnd = szString + strlen(szString);
155
156 return utf8::is_valid(szString, szStringEnd);
157#else
158 return true;
159#endif
160}
161
162inline bool plUnicodeUtils::SkipUtf8Bom(const char*& ref_szUtf8)
163{
164 PL_ASSERT_DEBUG(ref_szUtf8 != nullptr, "This function expects non nullptr pointers");
165
166 if (utf8::starts_with_bom(ref_szUtf8, ref_szUtf8 + 4))
167 {
168 ref_szUtf8 += 3;
169 return true;
170 }
171
172 return false;
173}
174
175inline bool plUnicodeUtils::SkipUtf16BomLE(const plUInt16*& ref_pUtf16)
176{
177 PL_ASSERT_DEBUG(ref_pUtf16 != nullptr, "This function expects non nullptr pointers");
178
179 if (*ref_pUtf16 == plUnicodeUtils::Utf16BomLE)
180 {
181 ++ref_pUtf16;
182 return true;
183 }
184
185 return false;
186}
187
188inline bool plUnicodeUtils::SkipUtf16BomBE(const plUInt16*& ref_pUtf16)
189{
190 PL_ASSERT_DEBUG(ref_pUtf16 != nullptr, "This function expects non nullptr pointers");
191
192 if (*ref_pUtf16 == plUnicodeUtils::Utf16BomBE)
193 {
194 ++ref_pUtf16;
195 return true;
196 }
197
198 return false;
199}
200
201inline plResult plUnicodeUtils::MoveToNextUtf8(const char*& ref_szUtf8, plUInt32 uiNumCharacters)
202{
203 PL_ASSERT_DEBUG(ref_szUtf8 != nullptr, "Invalid string pointer to advance!");
204
205 while (uiNumCharacters > 0)
206 {
207 if (*ref_szUtf8 == '\0')
208 return PL_FAILURE;
209
210 do
211 {
212 ++ref_szUtf8;
213 } while (IsUtf8ContinuationByte(*ref_szUtf8));
214
215 --uiNumCharacters;
216 }
217
218 return PL_SUCCESS;
219}
220
221inline plResult plUnicodeUtils::MoveToNextUtf8(const char*& ref_szUtf8, const char* szUtf8End, plUInt32 uiNumCharacters)
222{
223 PL_ASSERT_DEBUG(ref_szUtf8 != nullptr, "Invalid string pointer to advance!");
224
225 while (uiNumCharacters > 0)
226 {
227 if (ref_szUtf8 >= szUtf8End || *ref_szUtf8 == '\0')
228 return PL_FAILURE;
229
230 do
231 {
232 ++ref_szUtf8;
233 } while ((ref_szUtf8 < szUtf8End) && IsUtf8ContinuationByte(*ref_szUtf8));
234
235 --uiNumCharacters;
236 }
237
238 return PL_SUCCESS;
239}
240
241inline plResult plUnicodeUtils::MoveToPriorUtf8(const char*& ref_szUtf8, const char* szUtf8Start, plUInt32 uiNumCharacters)
242{
243 PL_ASSERT_DEBUG(ref_szUtf8 != nullptr, "Invalid string pointer to advance!");
244
245 while (uiNumCharacters > 0)
246 {
247 if (ref_szUtf8 <= szUtf8Start)
248 return PL_FAILURE;
249
250 do
251 {
252 --ref_szUtf8;
253 } while (IsUtf8ContinuationByte(*ref_szUtf8));
254
255 --uiNumCharacters;
256 }
257
258 return PL_SUCCESS;
259}
260template <typename T>
262{
263 return reinterpret_cast<T*>(-1);
264}
static bool IsUtf16Surrogate(UInt16Iterator &ref_szUtf16Iterator)
Characters that cannot be represented in a single utf16 code point need to be split up into two surro...
Definition UnicodeUtils_inl.h:57
static plUInt32 DecodeUtf8ToUtf32(ByteIterator &ref_szUtf8Iterator)
Decodes the next character from the given Utf8 sequence to Utf32 and increments the iterator as far a...
Definition UnicodeUtils_inl.h:51
static plUInt32 DecodeWCharToUtf32(WCharIterator &ref_szWCharIterator)
Decodes the next character from the given wchar_t sequence to Utf32 and increments the iterator as fa...
Definition UnicodeUtils_inl.h:77
static bool SkipUtf8Bom(const char *&ref_szUtf8)
If the given string starts with a Utf8 Bom, the pointer is incremented behind the Bom,...
Definition UnicodeUtils_inl.h:162
static plUInt32 DecodeUtf16ToUtf32(UInt16Iterator &ref_szUtf16Iterator)
Decodes the next character from the given Utf16 sequence to Utf32 and increments the iterator as far ...
Definition UnicodeUtils_inl.h:64
static constexpr plUInt16 Utf16BomBE
Byte Order Mark for Big Endian Utf16 strings.
Definition UnicodeUtils.h:22
static constexpr T * GetMaxStringEnd()
[internal] Returns the max string end pointer for the given type
Definition UnicodeUtils_inl.h:261
static plUInt32 ConvertUtf8ToUtf32(const char *pFirstChar)
Converts the UTF-8 character that starts at pFirstChar into a UTF-32 character.
Definition UnicodeUtils_inl.h:124
static bool IsASCII(plUInt32 uiChar)
Returns whether a character is a pure ASCII character (only the first 7 Bits are used)
Definition UnicodeUtils_inl.h:23
static void EncodeUtf32ToUtf16(plUInt32 uiUtf32, UInt16Iterator &ref_szUtf16Output)
Encodes the given Utf32 character to Utf16 and writes as many bytes to the output iterator,...
Definition UnicodeUtils_inl.h:98
static bool SkipUtf16BomLE(const plUInt16 *&ref_pUtf16)
If the given string starts with a Utf16 little endian Bom, the pointer is incremented behind the Bom,...
Definition UnicodeUtils_inl.h:175
static constexpr plUInt16 Utf16BomLE
Byte Order Mark for Little Endian Utf16 strings.
Definition UnicodeUtils.h:19
static void EncodeUtf32ToWChar(plUInt32 uiUtf32, WCharIterator &ref_szWCharOutput)
Encodes the given Utf32 character to wchar_t and writes as many bytes to the output iterator,...
Definition UnicodeUtils_inl.h:111
static bool SkipUtf16BomBE(const plUInt16 *&ref_pUtf16)
If the given string starts with a Utf16 big endian Bom, the pointer is incremented behind the Bom,...
Definition UnicodeUtils_inl.h:188
static plResult MoveToNextUtf8(const char *&ref_szUtf8, plUInt32 uiNumCharacters=1)
Moves the given string pointer ahead to the next Utf8 character sequence.
Definition UnicodeUtils_inl.h:201
static void EncodeUtf32ToUtf8(plUInt32 uiUtf32, ByteIterator &ref_szUtf8Output)
Encodes the given Utf32 character to Utf8 and writes as many bytes to the output iterator,...
Definition UnicodeUtils_inl.h:92
static plUInt32 GetSizeForCharacterInUtf8(plUInt32 uiCharacter)
Computes how many bytes the character would require, if encoded in UTF-8.
Definition UnicodeUtils_inl.h:129
static plUInt32 GetUtf8SequenceLength(char iFirstByte)
Returns the number of bytes that a UTF-8 sequence is in length, which is encoded in the first byte of...
Definition UnicodeUtils_inl.h:28
static bool IsUtf8ContinuationByte(char iByte)
Checks whether the given byte is a byte in a UTF-8 multi-byte sequence.
Definition UnicodeUtils_inl.h:17
static bool IsValidUtf8(const char *szString, const char *szStringEnd=GetMaxStringEnd< char >())
Returns false if the given string does not contain a completely valid Utf8 string.
Definition UnicodeUtils_inl.h:150
static plResult MoveToPriorUtf8(const char *&ref_szUtf8, const char *szUtf8Start, plUInt32 uiNumCharacters=1)
Moves the given string pointer backwards to the previous Utf8 character sequence.
Definition UnicodeUtils_inl.h:241
static bool IsUtf8StartByte(char iByte)
Checks whether the given byte is a start byte in a UTF-8 multi-byte sequence.
Definition UnicodeUtils_inl.h:11
Default enum for returning failure or success, instead of using a bool.
Definition Types.h:54