Changeset f444633 in mainline for common/stdc/uchar.c
- Timestamp:
- 2025-04-15T22:42:20Z (3 days ago)
- Children:
- 1d3ae66
- Parents:
- b6061f8c (diff), 65bf084 (diff)
Note: this is a merge changeset; the changes displayed below correspond to the merge itself.
Use the (diff) links above to see all the changes relative to each parent.
- git-author:
- Wayne Thornton <wmthornton-dev@…> (2025-04-15 22:42:20)
- git-committer:
- GitHub <noreply@…> (2025-04-15 22:42:20)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
common/stdc/uchar.c
rb6061f8c rf444633 59 59 } 60 60 61 static bool _is_continuation(uint8_t c)62 {63 return (c & 0xC0) == 0x80;64 }65 66 static bool _is_1_byte(uint8_t c)67 {68 return (c & 0x80) == 0;69 }70 71 static bool _is_2_byte(uint8_t c)72 {73 return (c & 0xE0) == 0xC0;74 }75 76 static bool _is_3_byte(uint8_t c)77 {78 return (c & 0xF0) == 0xE0;79 }80 81 static bool _is_4_byte(uint8_t c)82 {83 return (c & 0xF8) == 0xF0;84 }85 86 static bool _is_non_shortest(unsigned short cont, uint8_t b)87 {88 return (cont == 0b1111110000000000 && !(b & 0b00100000)) ||89 (cont == 0b1111111111110000 && !(b & 0b00110000));90 }91 92 size_t mbrtoc32(char32_t *c, const char *s, size_t n, mbstate_t *mb)93 {94 #if __STDC_HOSTED__95 static fibril_local mbstate_t global_state = { };96 97 if (!mb)98 mb = &global_state;99 #else100 assert(mb);101 #endif102 103 if (n == 0)104 return UCHAR_INCOMPLETE;105 106 char32_t dummy;107 108 if (!c)109 c = &dummy;110 111 if (!s) {112 // Equivalent to mbrtoc32(NULL, "", 1, mb).113 if (mb->continuation) {114 _set_ilseq();115 return UCHAR_ILSEQ;116 } else {117 return 0;118 }119 }120 121 size_t i = 0;122 123 if (!mb->continuation) {124 /* Clean slate, read initial byte. */125 126 uint8_t b = s[i++];127 128 if (_is_1_byte(b)) {129 *c = b;130 return b == 0 ? 0 : 1;131 }132 133 if (_is_continuation(b)) {134 /* unexpected continuation byte */135 _set_ilseq();136 return UCHAR_ILSEQ;137 }138 139 /*140 * The value stored into `continuation` is designed to have141 * just enough leading ones that after shifting in one less than142 * the expected number of continuation bytes, the most significant143 * bit becomes zero. (The field is 16b wide.)144 */145 146 if (_is_2_byte(b)) {147 /* Reject non-shortest form. 
*/148 if (!(b & 0b00011110)) {149 _set_ilseq();150 return UCHAR_ILSEQ;151 }152 153 /* 2 byte encoding 110xxxxx */154 mb->continuation = b ^ 0b0000000011000000;155 156 } else if (_is_3_byte(b)) {157 /* 3 byte encoding 1110xxxx */158 mb->continuation = b ^ 0b1111110011100000;159 160 } else if (_is_4_byte(b)) {161 /* 4 byte encoding 11110xxx */162 mb->continuation = b ^ 0b1111111100000000;163 }164 }165 166 for (; i < n; i++) {167 /* Read continuation bytes. */168 uint8_t b = s[i];169 170 if (!_is_continuation(b) || _is_non_shortest(mb->continuation, b)) {171 _set_ilseq();172 return UCHAR_ILSEQ;173 }174 175 /* Top bit becomes zero just before the last byte is shifted in. */176 if (!(mb->continuation & 0x8000)) {177 *c = ((char32_t) mb->continuation) << 6 | (b & 0x3f);178 mb->continuation = 0;179 return ++i;180 }181 182 mb->continuation = mb->continuation << 6 | (b & 0x3f);183 }184 185 return UCHAR_INCOMPLETE;186 }187 188 61 #define UTF8_CONT(c, shift) (0x80 | (((c) >> (shift)) & 0x3F)) 189 62 … … 253 126 if (!s) { 254 127 /* Equivalent to mbrtoc16(NULL, "", 1, mb). */ 255 if (mb-> continuation) {128 if (mb->state) { 256 129 _set_ilseq(); 257 130 return UCHAR_ILSEQ; … … 261 134 } 262 135 263 if ((mb-> continuation& 0xD000) == 0xD000) {136 if ((mb->state & 0xD000) == 0xD000) { 264 137 /* mbstate_t contains the second surrogate character. */ 265 138 /* mbrtoc32() will never set it to such value. */ 266 *c = mb-> continuation;267 mb-> continuation= 0;139 *c = mb->state; 140 mb->state = 0; 268 141 return UCHAR_CONTINUED; 269 142 } … … 276 149 } else { 277 150 /* Encode UTF-16 surrogates. */ 278 mb-> continuation= (c32 & 0x3FF) + 0xDC00;151 mb->state = (c32 & 0x3FF) + 0xDC00; 279 152 *c = (c32 >> 10) + 0xD7C0; 280 153 } … … 298 171 if (!s) { 299 172 // Equivalent to c16rtomb(buf, L’\0’, mb). 
300 if (mb-> continuation) {173 if (mb->state) { 301 174 _set_ilseq(); 302 175 return UCHAR_ILSEQ; … … 307 180 308 181 if (!_is_surrogate(c)) { 309 if (mb-> continuation) {182 if (mb->state) { 310 183 _set_ilseq(); 311 184 return UCHAR_ILSEQ; … … 315 188 } 316 189 317 if (!mb-> continuation) {318 mb-> continuation= c;190 if (!mb->state) { 191 mb->state = c; 319 192 return 0; 320 193 } … … 323 196 324 197 /* Decode UTF-16 surrogates. */ 325 if (_is_low_surrogate(mb-> continuation) && _is_high_surrogate(c)) {326 c32 = ((c - 0xD7C0) << 10) | (mb-> continuation- 0xDC00);327 } else if (_is_high_surrogate(mb-> continuation) && _is_low_surrogate(c)) {328 c32 = ((mb-> continuation- 0xD7C0) << 10) | (c - 0xDC00);198 if (_is_low_surrogate(mb->state) && _is_high_surrogate(c)) { 199 c32 = ((c - 0xD7C0) << 10) | (mb->state - 0xDC00); 200 } else if (_is_high_surrogate(mb->state) && _is_low_surrogate(c)) { 201 c32 = ((mb->state - 0xD7C0) << 10) | (c - 0xDC00); 329 202 } else { 330 203 _set_ilseq(); … … 332 205 } 333 206 334 mb-> continuation= 0;207 mb->state = 0; 335 208 return c32rtomb(s, c32, mb); 336 209 }
Note:
See TracChangeset
for help on using the changeset viewer.