Changeset f444633 in mainline
- Timestamp:
- 2025-04-15T22:42:20Z (15 hours ago)
- Children:
- bfee444
- Parents:
- b6061f8c (diff), 65bf084 (diff)
Note: this is a merge changeset; the changes displayed below correspond to the merge itself.
Use the (diff) links above to see all the changes relative to each parent.
- git-author:
- Wayne Thornton <wmthornton-dev@…> (2025-04-15 22:42:20)
- git-committer:
- GitHub <noreply@…> (2025-04-15 22:42:20)
- Files:
-
- 4 edited
Legend:
- Unmodified
- Added
- Removed
-
abi/include/_bits/mbstate_t.h
rb6061f8c rf444633 37 37 38 38 typedef struct { 39 unsigned short continuation;39 unsigned short state; 40 40 } mbstate_t; 41 41 -
common/stdc/uchar.c
rb6061f8c rf444633 59 59 } 60 60 61 static bool _is_continuation(uint8_t c)62 {63 return (c & 0xC0) == 0x80;64 }65 66 static bool _is_1_byte(uint8_t c)67 {68 return (c & 0x80) == 0;69 }70 71 static bool _is_2_byte(uint8_t c)72 {73 return (c & 0xE0) == 0xC0;74 }75 76 static bool _is_3_byte(uint8_t c)77 {78 return (c & 0xF0) == 0xE0;79 }80 81 static bool _is_4_byte(uint8_t c)82 {83 return (c & 0xF8) == 0xF0;84 }85 86 static bool _is_non_shortest(unsigned short cont, uint8_t b)87 {88 return (cont == 0b1111110000000000 && !(b & 0b00100000)) ||89 (cont == 0b1111111111110000 && !(b & 0b00110000));90 }91 92 size_t mbrtoc32(char32_t *c, const char *s, size_t n, mbstate_t *mb)93 {94 #if __STDC_HOSTED__95 static fibril_local mbstate_t global_state = { };96 97 if (!mb)98 mb = &global_state;99 #else100 assert(mb);101 #endif102 103 if (n == 0)104 return UCHAR_INCOMPLETE;105 106 char32_t dummy;107 108 if (!c)109 c = &dummy;110 111 if (!s) {112 // Equivalent to mbrtoc32(NULL, "", 1, mb).113 if (mb->continuation) {114 _set_ilseq();115 return UCHAR_ILSEQ;116 } else {117 return 0;118 }119 }120 121 size_t i = 0;122 123 if (!mb->continuation) {124 /* Clean slate, read initial byte. */125 126 uint8_t b = s[i++];127 128 if (_is_1_byte(b)) {129 *c = b;130 return b == 0 ? 0 : 1;131 }132 133 if (_is_continuation(b)) {134 /* unexpected continuation byte */135 _set_ilseq();136 return UCHAR_ILSEQ;137 }138 139 /*140 * The value stored into `continuation` is designed to have141 * just enough leading ones that after shifting in one less than142 * the expected number of continuation bytes, the most significant143 * bit becomes zero. (The field is 16b wide.)144 */145 146 if (_is_2_byte(b)) {147 /* Reject non-shortest form. 
*/148 if (!(b & 0b00011110)) {149 _set_ilseq();150 return UCHAR_ILSEQ;151 }152 153 /* 2 byte encoding 110xxxxx */154 mb->continuation = b ^ 0b0000000011000000;155 156 } else if (_is_3_byte(b)) {157 /* 3 byte encoding 1110xxxx */158 mb->continuation = b ^ 0b1111110011100000;159 160 } else if (_is_4_byte(b)) {161 /* 4 byte encoding 11110xxx */162 mb->continuation = b ^ 0b1111111100000000;163 }164 }165 166 for (; i < n; i++) {167 /* Read continuation bytes. */168 uint8_t b = s[i];169 170 if (!_is_continuation(b) || _is_non_shortest(mb->continuation, b)) {171 _set_ilseq();172 return UCHAR_ILSEQ;173 }174 175 /* Top bit becomes zero just before the last byte is shifted in. */176 if (!(mb->continuation & 0x8000)) {177 *c = ((char32_t) mb->continuation) << 6 | (b & 0x3f);178 mb->continuation = 0;179 return ++i;180 }181 182 mb->continuation = mb->continuation << 6 | (b & 0x3f);183 }184 185 return UCHAR_INCOMPLETE;186 }187 188 61 #define UTF8_CONT(c, shift) (0x80 | (((c) >> (shift)) & 0x3F)) 189 62 … … 253 126 if (!s) { 254 127 /* Equivalent to mbrtoc16(NULL, "", 1, mb). */ 255 if (mb-> continuation) {128 if (mb->state) { 256 129 _set_ilseq(); 257 130 return UCHAR_ILSEQ; … … 261 134 } 262 135 263 if ((mb-> continuation& 0xD000) == 0xD000) {136 if ((mb->state & 0xD000) == 0xD000) { 264 137 /* mbstate_t contains the second surrogate character. */ 265 138 /* mbrtoc32() will never set it to such value. */ 266 *c = mb-> continuation;267 mb-> continuation= 0;139 *c = mb->state; 140 mb->state = 0; 268 141 return UCHAR_CONTINUED; 269 142 } … … 276 149 } else { 277 150 /* Encode UTF-16 surrogates. */ 278 mb-> continuation= (c32 & 0x3FF) + 0xDC00;151 mb->state = (c32 & 0x3FF) + 0xDC00; 279 152 *c = (c32 >> 10) + 0xD7C0; 280 153 } … … 298 171 if (!s) { 299 172 // Equivalent to c16rtomb(buf, L’\0’, mb). 
300 if (mb-> continuation) {173 if (mb->state) { 301 174 _set_ilseq(); 302 175 return UCHAR_ILSEQ; … … 307 180 308 181 if (!_is_surrogate(c)) { 309 if (mb-> continuation) {182 if (mb->state) { 310 183 _set_ilseq(); 311 184 return UCHAR_ILSEQ; … … 315 188 } 316 189 317 if (!mb-> continuation) {318 mb-> continuation= c;190 if (!mb->state) { 191 mb->state = c; 319 192 return 0; 320 193 } … … 323 196 324 197 /* Decode UTF-16 surrogates. */ 325 if (_is_low_surrogate(mb-> continuation) && _is_high_surrogate(c)) {326 c32 = ((c - 0xD7C0) << 10) | (mb-> continuation- 0xDC00);327 } else if (_is_high_surrogate(mb-> continuation) && _is_low_surrogate(c)) {328 c32 = ((mb-> continuation- 0xD7C0) << 10) | (c - 0xDC00);198 if (_is_low_surrogate(mb->state) && _is_high_surrogate(c)) { 199 c32 = ((c - 0xD7C0) << 10) | (mb->state - 0xDC00); 200 } else if (_is_high_surrogate(mb->state) && _is_low_surrogate(c)) { 201 c32 = ((mb->state - 0xD7C0) << 10) | (c - 0xDC00); 329 202 } else { 330 203 _set_ilseq(); … … 332 205 } 333 206 334 mb-> continuation= 0;207 mb->state = 0; 335 208 return c32rtomb(s, c32, mb); 336 209 } -
common/stdc/wchar.c
rb6061f8c rf444633 46 46 int mbsinit(const mbstate_t *ps) 47 47 { 48 return ps == NULL || ps-> continuation== 0;48 return ps == NULL || ps->state == 0; 49 49 } 50 50 -
common/str.c
rb6061f8c rf444633 5 5 * Copyright (c) 2011 Martin Sucha 6 6 * Copyright (c) 2011 Oleg Romanenko 7 * Copyright (c) 2025 Jiří Zárevúcky 7 8 * All rights reserved. 8 9 * … … 124 125 #include <ctype.h> 125 126 #include <errno.h> 127 #include <limits.h> 126 128 #include <macros.h> 127 129 #include <mem.h> … … 132 134 #include <uchar.h> 133 135 136 #if __STDC_HOSTED__ 137 #include <fibril.h> 138 #endif 139 140 static void _set_ilseq() 141 { 142 #ifdef errno 143 errno = EILSEQ; 144 #endif 145 } 146 134 147 /** Byte mask consisting of lowest @n bits (out of 8) */ 135 148 #define LO_MASK_8(n) ((uint8_t) ((1 << (n)) - 1)) … … 144 157 #define CONT_BITS 6 145 158 159 #define UTF8_MASK_INITIAL2 0b00011111 160 #define UTF8_MASK_INITIAL3 0b00001111 161 #define UTF8_MASK_INITIAL4 0b00000111 162 #define UTF8_MASK_CONT 0b00111111 163 164 #define CHAR_INVALID ((char32_t) UINT_MAX) 165 146 166 static inline bool _is_ascii(uint8_t b) 147 167 { … … 149 169 } 150 170 151 static inline bool _is_continuation_byte(uint8_t b) 152 { 153 return (b & 0xc0) == 0x80; 171 static inline bool _is_continuation(uint8_t b) 172 { 173 return (b & 0xC0) == 0x80; 174 } 175 176 static inline bool _is_2_byte(uint8_t c) 177 { 178 return (c & 0xE0) == 0xC0; 179 } 180 181 static inline bool _is_3_byte(uint8_t c) 182 { 183 return (c & 0xF0) == 0xE0; 184 } 185 186 static inline bool _is_4_byte(uint8_t c) 187 { 188 return (c & 0xF8) == 0xF0; 154 189 } 155 190 … … 179 214 180 215 /* 110xxxxx 10xxxxxx */ 181 if ( (b & 0xe0) == 0xc0)216 if (_is_2_byte(b)) 182 217 return 1; 183 218 184 219 /* 1110xxxx 10xxxxxx 10xxxxxx */ 185 if ( (b & 0xf0) == 0xe0)220 if (_is_3_byte(b)) 186 221 return 2; 187 222 188 223 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ 189 if ( (b & 0xf8) == 0xf0)224 if (_is_4_byte(b)) 190 225 return 3; 191 226 192 227 return -1; 228 } 229 230 static bool _is_non_shortest(const mbstate_t *mb, uint8_t b) 231 { 232 return (mb->state == 0b1111110000000000 && !(b & 0b00100000)) || 233 (mb->state == 
0b1111111111110000 && !(b & 0b00110000)); 234 } 235 236 #define _likely(expr) __builtin_expect((expr), true) 237 #define _unlikely(expr) __builtin_expect((expr), false) 238 239 #define FAST_PATHS 1 240 241 static char32_t _str_decode(const char *s, size_t *offset, size_t size, mbstate_t *mb) 242 { 243 assert(s); 244 assert(offset); 245 assert(*offset <= size); 246 assert(size == STR_NO_LIMIT || s + size >= s); 247 assert(mb); 248 249 if (*offset == size) 250 return 0; 251 252 if (_likely(!mb->state)) { 253 /* Clean slate, read initial byte. */ 254 uint8_t b = s[(*offset)++]; 255 256 /* Fast exit for the most common case. */ 257 if (_likely(_is_ascii(b))) 258 return b; 259 260 /* unexpected continuation byte */ 261 if (_unlikely(_is_continuation(b))) 262 return CHAR_INVALID; 263 264 /* 265 * The value stored into `continuation` is designed to have 266 * just enough leading ones that after shifting in one less than 267 * the expected number of continuation bytes, the most significant 268 * bit becomes zero. (The field is 16b wide.) 269 */ 270 271 if (_is_2_byte(b)) { 272 /* Reject non-shortest form. */ 273 if (_unlikely(!(b & 0b00011110))) 274 return CHAR_INVALID; 275 276 #if FAST_PATHS 277 /* We can usually take this exit. */ 278 if (_likely(*offset < size && _is_continuation(s[*offset]))) 279 return (b & UTF8_MASK_INITIAL2) << 6 | 280 (s[(*offset)++] & UTF8_MASK_CONT); 281 #endif 282 283 /* 2 byte continuation 110xxxxx */ 284 mb->state = b ^ 0b0000000011000000; 285 286 } else if (_is_3_byte(b)) { 287 #if FAST_PATHS 288 /* We can usually take this exit. */ 289 if (_likely(*offset + 1 < size && _is_continuation(s[*offset]) && _is_continuation(s[*offset + 1]))) { 290 291 char32_t ch = (b & UTF8_MASK_INITIAL3) << 12 | 292 (s[(*offset)] & UTF8_MASK_CONT) << 6 | 293 (s[(*offset) + 1] & UTF8_MASK_CONT); 294 295 *offset += 2; 296 297 /* Reject non-shortest form. 
*/ 298 if (_unlikely(!(ch & 0xFFFFF800))) 299 return CHAR_INVALID; 300 301 return ch; 302 } 303 #endif 304 305 /* 3 byte continuation 1110xxxx */ 306 mb->state = b ^ 0b1111110011100000; 307 308 } else if (_is_4_byte(b)) { 309 #if FAST_PATHS 310 /* We can usually take this exit. */ 311 if (_likely(*offset + 2 < size && _is_continuation(s[*offset]) && 312 _is_continuation(s[*offset + 1]) && _is_continuation(s[*offset + 2]))) { 313 314 char32_t ch = (b & UTF8_MASK_INITIAL4) << 18 | 315 (s[(*offset)] & UTF8_MASK_CONT) << 12 | 316 (s[(*offset) + 1] & UTF8_MASK_CONT) << 6 | 317 (s[(*offset) + 2] & UTF8_MASK_CONT); 318 319 *offset += 3; 320 321 /* Reject non-shortest form. */ 322 if (_unlikely(!(ch & 0xFFFF0000))) 323 return CHAR_INVALID; 324 325 return ch; 326 } 327 #endif 328 329 /* 4 byte continuation 11110xxx */ 330 mb->state = b ^ 0b1111111100000000; 331 } else { 332 return CHAR_INVALID; 333 } 334 } 335 336 /* Deal with the remaining edge and invalid cases. */ 337 for (; *offset < size; (*offset)++) { 338 /* Read continuation bytes. */ 339 uint8_t b = s[*offset]; 340 341 if (!_is_continuation(b) || _is_non_shortest(mb, b)) { 342 mb->state = 0; 343 return CHAR_INVALID; 344 } 345 346 /* Top bit becomes zero when shifting in the second to last byte. */ 347 if (!(mb->state & 0x8000)) { 348 char32_t c = ((char32_t) mb->state) << 6 | (b & UTF8_MASK_CONT); 349 mb->state = 0; 350 (*offset)++; 351 return c; 352 } 353 354 mb->state = mb->state << 6 | (b & UTF8_MASK_CONT); 355 } 356 357 /* Incomplete character. */ 358 assert(mb->state); 359 return 0; 360 } 361 362 /** Standard <uchar.h> function since C11. 
*/ 363 size_t mbrtoc32(char32_t *c, const char *s, size_t n, mbstate_t *mb) 364 { 365 #if __STDC_HOSTED__ 366 static fibril_local mbstate_t global_state = { }; 367 368 if (!mb) 369 mb = &global_state; 370 #endif 371 372 if (!s) { 373 /* Equivalent to mbrtoc32(NULL, "", 1, mb); */ 374 c = NULL; 375 s = ""; 376 n = 1; 377 } 378 379 size_t offset = 0; 380 char32_t ret = _str_decode(s, &offset, n, mb); 381 if (ret == CHAR_INVALID) { 382 assert(!mb->state); 383 _set_ilseq(); 384 return UCHAR_ILSEQ; 385 } 386 if (mb->state) { 387 assert(ret == 0); 388 return UCHAR_INCOMPLETE; 389 } 390 391 if (c) 392 *c = ret; 393 return ret ? offset : 0; 193 394 } 194 395 … … 210 411 char32_t str_decode(const char *str, size_t *offset, size_t size) 211 412 { 212 if (*offset >= size) 213 return 0; 214 215 /* First byte read from string */ 216 uint8_t b0 = (uint8_t) str[(*offset)++]; 217 218 /* Fast exit for the most common case. */ 219 if (_is_ascii(b0)) 220 return b0; 221 222 /* 10xxxxxx -- unexpected continuation byte */ 223 if (_is_continuation_byte(b0)) 413 mbstate_t mb = { }; 414 char32_t ch = _str_decode(str, offset, size, &mb); 415 416 if (ch == CHAR_INVALID) 224 417 return U_SPECIAL; 225 418 226 /* Determine code length */ 227 228 int cbytes = _continuation_bytes(b0); 229 int b0_bits = 6 - cbytes; /* Data bits in first byte */ 230 231 if (cbytes < 0 || *offset + cbytes > size) 232 return U_SPECIAL; 233 234 char32_t ch = b0 & LO_MASK_8(b0_bits); 235 236 /* Decode continuation bytes */ 237 for (int i = 0; i < cbytes; i++) { 238 uint8_t b = (uint8_t) str[*offset]; 239 240 if (!_is_continuation_byte(b)) 241 return U_SPECIAL; 242 243 (*offset)++; 244 245 /* Shift data bits to ch */ 246 ch = (ch << CONT_BITS) | (char32_t) (b & LO_MASK_8(CONT_BITS)); 247 } 248 249 /* 250 * Reject non-shortest form encodings. 
251 * See https://www.unicode.org/versions/corrigendum1.html 252 */ 253 if (cbytes != _char_continuation_bytes(ch)) 419 if (mb.state) 254 420 return U_SPECIAL; 255 421 … … 282 448 uint8_t b = (uint8_t) str[--(*offset)]; 283 449 284 if (_is_continuation _byte(b)) {450 if (_is_continuation(b)) { 285 451 cbytes++; 286 452 continue; 287 453 } 288 454 289 /* Invalid byte. */455 /* Reject non-shortest form encoding. */ 290 456 if (cbytes != _continuation_bytes(b)) 291 457 return U_SPECIAL; … … 317 483 errno_t chr_encode(char32_t ch, char *str, size_t *offset, size_t size) 318 484 { 485 // TODO: merge with c32rtomb() 486 319 487 if (*offset >= size) 320 488 return EOVERFLOW; … … 372 540 /* Check continuation bytes. */ 373 541 for (int i = 1; i <= cont; i++) { 374 if (!_is_continuation _byte(b[i])) {542 if (!_is_continuation(b[i])) { 375 543 b[0] = U_SPECIAL; 376 544 continue;
Note: See TracChangeset for help on using the changeset viewer.