Changeset b31323f in mainline for common/str.c
- Timestamp:
- 2025-04-17T14:29:23Z (5 days ago)
- Branches:
- master
- Children:
- ae787807
- Parents:
- 65bf084
- git-author:
- Jiří Zárevúcky <zarevucky.jiri@…> (2025-04-17 11:01:00)
- git-committer:
- Jiří Zárevúcky <zarevucky.jiri@…> (2025-04-17 14:29:23)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
common/str.c
r65bf084 rb31323f 234 234 } 235 235 236 static bool _is_surrogate(const mbstate_t *mb, uint8_t b) 237 { 238 return (mb->state == 0b1111110000001101 && b >= 0xa0); 239 } 240 236 241 #define _likely(expr) __builtin_expect((expr), true) 237 242 #define _unlikely(expr) __builtin_expect((expr), false) … … 299 304 return CHAR_INVALID; 300 305 306 /* Reject surrogates */ 307 if (_unlikely(ch >= 0xD800 && ch < 0xE000)) 308 return CHAR_INVALID; 309 301 310 return ch; 302 311 } … … 323 332 return CHAR_INVALID; 324 333 334 /* Reject out-of-range characters. */ 335 if (_unlikely(ch >= 0x110000)) 336 return CHAR_INVALID; 337 325 338 return ch; 326 339 } … … 339 352 uint8_t b = s[*offset]; 340 353 341 if (!_is_continuation(b) || _is_non_shortest(mb, b) ) {354 if (!_is_continuation(b) || _is_non_shortest(mb, b) || _is_surrogate(mb, b)) { 342 355 mb->state = 0; 343 356 return CHAR_INVALID; … … 523 536 } 524 537 525 /* Convert in place any bytes that don't form a valid character into U_SPECIAL. */526 static void _sanitize_string(char *str, size_t n)538 /* Convert in place any bytes that don't form a valid character into replacement. */ 539 static size_t _str_sanitize(char *str, size_t n, uint8_t replacement) 527 540 { 528 541 uint8_t *b = (uint8_t *) str; 529 530 for (; *b && n > 0; b++, n--) { 542 size_t count = 0; 543 544 for (; n > 0 && b[0]; b++, n--) { 531 545 int cont = _continuation_bytes(b[0]); 532 546 if (__builtin_expect(cont, 0) == 0) … … 534 548 535 549 if (cont < 0 || n <= (size_t) cont) { 536 b[0] = U_SPECIAL; 550 b[0] = replacement; 551 count++; 537 552 continue; 538 553 } 539 554 540 555 /* Check continuation bytes. 
*/ 556 bool valid = true; 541 557 for (int i = 1; i <= cont; i++) { 542 558 if (!_is_continuation(b[i])) { 543 b[0] = U_SPECIAL;544 continue;559 valid = false; 560 break; 545 561 } 562 } 563 564 if (!valid) { 565 b[0] = replacement; 566 count++; 567 continue; 546 568 } 547 569 … … 551 573 */ 552 574 553 switch (cont) { 554 case 1: 555 /* 0b110!!!!x 0b10xxxxxx */ 556 if (!(b[0] & 0b00011110)) 557 b[0] = U_SPECIAL; 558 559 continue; 560 case 2: 561 /* 0b1110!!!! 0b10!xxxxx 0b10xxxxxx */ 562 if (!(b[0] & 0b00001111) && !(b[1] & 0b00100000)) 563 b[0] = U_SPECIAL; 564 565 continue; 566 case 3: 567 /* 0b11110!!! 0b10!!xxxx 0b10xxxxxx 0b10xxxxxx */ 568 if (!(b[0] & 0b00000111) && !(b[1] & 0b00110000)) 569 b[0] = U_SPECIAL; 570 575 /* 0b110!!!!x 0b10xxxxxx */ 576 if (cont == 1 && !(b[0] & 0b00011110)) { 577 b[0] = replacement; 578 count++; 571 579 continue; 572 580 } 573 } 581 582 /* 0b1110!!!! 0b10!xxxxx 0b10xxxxxx */ 583 if (cont == 2 && !(b[0] & 0b00001111) && !(b[1] & 0b00100000)) { 584 b[0] = replacement; 585 count++; 586 continue; 587 } 588 589 /* 0b11110!!! 0b10!!xxxx 0b10xxxxxx 0b10xxxxxx */ 590 if (cont == 3 && !(b[0] & 0b00000111) && !(b[1] & 0b00110000)) { 591 b[0] = replacement; 592 count++; 593 continue; 594 } 595 596 /* Check for surrogate character encoding. */ 597 if (cont == 2 && b[0] == 0xED && b[1] >= 0xA0) { 598 b[0] = replacement; 599 count++; 600 continue; 601 } 602 603 /* Check for out-of-range code points. */ 604 if (cont == 3 && (b[0] > 0xF4 || (b[0] == 0xF4 && b[1] >= 0x90))) { 605 b[0] = replacement; 606 count++; 607 continue; 608 } 609 610 b += cont; 611 n -= cont; 612 } 613 614 return count; 615 } 616 617 size_t str_sanitize(char *str, size_t n, uint8_t replacement) 618 { 619 return _str_sanitize(str, n, replacement); 574 620 } 575 621 … … 1130 1176 1131 1177 /* In-place translate invalid bytes to U_SPECIAL. 
*/ 1132 _sanitize_string(dest, size); 1178 _str_sanitize(dest, size, U_SPECIAL); 1133 1179 } 1134 1180 … … 1159 1205 1160 1206 /* In-place translate invalid bytes to U_SPECIAL. */ 1161 _sanitize_string(dest, size); 1207 _str_sanitize(dest, size, U_SPECIAL); 1162 1208 } 1163 1209 … … 1183 1229 if (dstr_size < size) { 1184 1230 _str_cpyn(dest + dstr_size, size - dstr_size, src); 1185 _sanitize_string(dest + dstr_size, size - dstr_size); 1231 _str_sanitize(dest + dstr_size, size - dstr_size, U_SPECIAL); 1186 1232 } 1187 1233 } … … 1762 1808 1763 1809 memcpy(dest, src, size); 1764 _sanitize_string(dest, size); 1810 _str_sanitize(dest, size, U_SPECIAL); 1765 1811 return dest; 1766 1812 } … … 1795 1841 1796 1842 memcpy(dest, src, size); 1797 _sanitize_string(dest, size); 1843 _str_sanitize(dest, size, U_SPECIAL); 1798 1844 dest[size] = 0; 1799 1845 return dest;
Note: See TracChangeset for help on using the changeset viewer.