Changes in common/str.c [fdfb24e:65bf084] in mainline
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
common/str.c
rfdfb24e r65bf084 5 5 * Copyright (c) 2011 Martin Sucha 6 6 * Copyright (c) 2011 Oleg Romanenko 7 * Copyright (c) 2025 Jiří Zárevúcky 7 8 * All rights reserved. 8 9 * … … 54 55 * are valid 55 56 * 57 * Note that Unicode characters do not match 58 * one-to-one with displayed characters or glyphs on 59 * screen. For that level of precision, look up 60 * Grapheme Clusters. 61 * 56 62 * ASCII character 7 bit encoded ASCII character, stored in char 57 63 * (usually signed 8 bit integer), code points 0 .. 127 … … 71 77 * [wide] string width number of display cells on a monospace display taken 72 78 * by a [wide] string, size_t 79 * 80 * This is virtually impossible to determine exactly for 81 * all strings without knowing specifics of the display 82 * device, due to various factors affecting text output. 83 * If you have the option to query the terminal for 84 * position change caused by outputting the string, 85 * it is preferable to determine width that way. 73 86 * 74 87 * … … 108 121 #include <str.h> 109 122 123 #include <align.h> 110 124 #include <assert.h> 111 125 #include <ctype.h> 112 126 #include <errno.h> 127 #include <limits.h> 128 #include <macros.h> 129 #include <mem.h> 113 130 #include <stdbool.h> 114 131 #include <stddef.h> 115 132 #include <stdint.h> 116 133 #include <stdlib.h> 117 118 #include <align.h> 119 #include <mem.h> 134 #include <uchar.h> 135 136 #if __STDC_HOSTED__ 137 #include <fibril.h> 138 #endif 139 140 static void _set_ilseq() 141 { 142 #ifdef errno 143 errno = EILSEQ; 144 #endif 145 } 120 146 121 147 /** Byte mask consisting of lowest @n bits (out of 8) */ … … 130 156 /** Number of data bits in a UTF-8 continuation byte */ 131 157 #define CONT_BITS 6 158 159 #define UTF8_MASK_INITIAL2 0b00011111 160 #define UTF8_MASK_INITIAL3 0b00001111 161 #define UTF8_MASK_INITIAL4 0b00000111 162 #define UTF8_MASK_CONT 0b00111111 163 164 #define CHAR_INVALID ((char32_t) UINT_MAX) 165 166 static inline bool _is_ascii(uint8_t b) 167 { 168 return b < 0x80; 
169 } 170 171 static inline bool _is_continuation(uint8_t b) 172 { 173 return (b & 0xC0) == 0x80; 174 } 175 176 static inline bool _is_2_byte(uint8_t c) 177 { 178 return (c & 0xE0) == 0xC0; 179 } 180 181 static inline bool _is_3_byte(uint8_t c) 182 { 183 return (c & 0xF0) == 0xE0; 184 } 185 186 static inline bool _is_4_byte(uint8_t c) 187 { 188 return (c & 0xF8) == 0xF0; 189 } 190 191 static inline int _char_continuation_bytes(char32_t c) 192 { 193 if ((c & ~LO_MASK_32(7)) == 0) 194 return 0; 195 196 if ((c & ~LO_MASK_32(11)) == 0) 197 return 1; 198 199 if ((c & ~LO_MASK_32(16)) == 0) 200 return 2; 201 202 if ((c & ~LO_MASK_32(21)) == 0) 203 return 3; 204 205 /* Codes longer than 21 bits are not supported */ 206 return -1; 207 } 208 209 static inline int _continuation_bytes(uint8_t b) 210 { 211 /* 0xxxxxxx */ 212 if (_is_ascii(b)) 213 return 0; 214 215 /* 110xxxxx 10xxxxxx */ 216 if (_is_2_byte(b)) 217 return 1; 218 219 /* 1110xxxx 10xxxxxx 10xxxxxx */ 220 if (_is_3_byte(b)) 221 return 2; 222 223 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ 224 if (_is_4_byte(b)) 225 return 3; 226 227 return -1; 228 } 229 230 static bool _is_non_shortest(const mbstate_t *mb, uint8_t b) 231 { 232 return (mb->state == 0b1111110000000000 && !(b & 0b00100000)) || 233 (mb->state == 0b1111111111110000 && !(b & 0b00110000)); 234 } 235 236 #define _likely(expr) __builtin_expect((expr), true) 237 #define _unlikely(expr) __builtin_expect((expr), false) 238 239 #define FAST_PATHS 1 240 241 static char32_t _str_decode(const char *s, size_t *offset, size_t size, mbstate_t *mb) 242 { 243 assert(s); 244 assert(offset); 245 assert(*offset <= size); 246 assert(size == STR_NO_LIMIT || s + size >= s); 247 assert(mb); 248 249 if (*offset == size) 250 return 0; 251 252 if (_likely(!mb->state)) { 253 /* Clean slate, read initial byte. */ 254 uint8_t b = s[(*offset)++]; 255 256 /* Fast exit for the most common case. 
*/ 257 if (_likely(_is_ascii(b))) 258 return b; 259 260 /* unexpected continuation byte */ 261 if (_unlikely(_is_continuation(b))) 262 return CHAR_INVALID; 263 264 /* 265 * The value stored into `continuation` is designed to have 266 * just enough leading ones that after shifting in one less than 267 * the expected number of continuation bytes, the most significant 268 * bit becomes zero. (The field is 16b wide.) 269 */ 270 271 if (_is_2_byte(b)) { 272 /* Reject non-shortest form. */ 273 if (_unlikely(!(b & 0b00011110))) 274 return CHAR_INVALID; 275 276 #if FAST_PATHS 277 /* We can usually take this exit. */ 278 if (_likely(*offset < size && _is_continuation(s[*offset]))) 279 return (b & UTF8_MASK_INITIAL2) << 6 | 280 (s[(*offset)++] & UTF8_MASK_CONT); 281 #endif 282 283 /* 2 byte continuation 110xxxxx */ 284 mb->state = b ^ 0b0000000011000000; 285 286 } else if (_is_3_byte(b)) { 287 #if FAST_PATHS 288 /* We can usually take this exit. */ 289 if (_likely(*offset + 1 < size && _is_continuation(s[*offset]) && _is_continuation(s[*offset + 1]))) { 290 291 char32_t ch = (b & UTF8_MASK_INITIAL3) << 12 | 292 (s[(*offset)] & UTF8_MASK_CONT) << 6 | 293 (s[(*offset) + 1] & UTF8_MASK_CONT); 294 295 *offset += 2; 296 297 /* Reject non-shortest form. */ 298 if (_unlikely(!(ch & 0xFFFFF800))) 299 return CHAR_INVALID; 300 301 return ch; 302 } 303 #endif 304 305 /* 3 byte continuation 1110xxxx */ 306 mb->state = b ^ 0b1111110011100000; 307 308 } else if (_is_4_byte(b)) { 309 #if FAST_PATHS 310 /* We can usually take this exit. */ 311 if (_likely(*offset + 2 < size && _is_continuation(s[*offset]) && 312 _is_continuation(s[*offset + 1]) && _is_continuation(s[*offset + 2]))) { 313 314 char32_t ch = (b & UTF8_MASK_INITIAL4) << 18 | 315 (s[(*offset)] & UTF8_MASK_CONT) << 12 | 316 (s[(*offset) + 1] & UTF8_MASK_CONT) << 6 | 317 (s[(*offset) + 2] & UTF8_MASK_CONT); 318 319 *offset += 3; 320 321 /* Reject non-shortest form. 
*/ 322 if (_unlikely(!(ch & 0xFFFF0000))) 323 return CHAR_INVALID; 324 325 return ch; 326 } 327 #endif 328 329 /* 4 byte continuation 11110xxx */ 330 mb->state = b ^ 0b1111111100000000; 331 } else { 332 return CHAR_INVALID; 333 } 334 } 335 336 /* Deal with the remaining edge and invalid cases. */ 337 for (; *offset < size; (*offset)++) { 338 /* Read continuation bytes. */ 339 uint8_t b = s[*offset]; 340 341 if (!_is_continuation(b) || _is_non_shortest(mb, b)) { 342 mb->state = 0; 343 return CHAR_INVALID; 344 } 345 346 /* Top bit becomes zero when shifting in the second to last byte. */ 347 if (!(mb->state & 0x8000)) { 348 char32_t c = ((char32_t) mb->state) << 6 | (b & UTF8_MASK_CONT); 349 mb->state = 0; 350 (*offset)++; 351 return c; 352 } 353 354 mb->state = mb->state << 6 | (b & UTF8_MASK_CONT); 355 } 356 357 /* Incomplete character. */ 358 assert(mb->state); 359 return 0; 360 } 361 362 /** Standard <uchar.h> function since C11. */ 363 size_t mbrtoc32(char32_t *c, const char *s, size_t n, mbstate_t *mb) 364 { 365 #if __STDC_HOSTED__ 366 static fibril_local mbstate_t global_state = { }; 367 368 if (!mb) 369 mb = &global_state; 370 #endif 371 372 if (!s) { 373 /* Equivalent to mbrtoc32(NULL, "", 1, mb); */ 374 c = NULL; 375 s = ""; 376 n = 1; 377 } 378 379 size_t offset = 0; 380 char32_t ret = _str_decode(s, &offset, n, mb); 381 if (ret == CHAR_INVALID) { 382 assert(!mb->state); 383 _set_ilseq(); 384 return UCHAR_ILSEQ; 385 } 386 if (mb->state) { 387 assert(ret == 0); 388 return UCHAR_INCOMPLETE; 389 } 390 391 if (c) 392 *c = ret; 393 return ret ? offset : 0; 394 } 132 395 133 396 /** Decode a single character from a string. 
… … 148 411 char32_t str_decode(const char *str, size_t *offset, size_t size) 149 412 { 150 if (*offset + 1 > size) 151 return 0; 152 153 /* First byte read from string */ 154 uint8_t b0 = (uint8_t) str[(*offset)++]; 155 156 /* Determine code length */ 157 158 unsigned int b0_bits; /* Data bits in first byte */ 159 unsigned int cbytes; /* Number of continuation bytes */ 160 161 if ((b0 & 0x80) == 0) { 162 /* 0xxxxxxx (Plain ASCII) */ 163 b0_bits = 7; 164 cbytes = 0; 165 } else if ((b0 & 0xe0) == 0xc0) { 166 /* 110xxxxx 10xxxxxx */ 167 b0_bits = 5; 168 cbytes = 1; 169 } else if ((b0 & 0xf0) == 0xe0) { 170 /* 1110xxxx 10xxxxxx 10xxxxxx */ 171 b0_bits = 4; 172 cbytes = 2; 173 } else if ((b0 & 0xf8) == 0xf0) { 174 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ 175 b0_bits = 3; 176 cbytes = 3; 177 } else { 178 /* 10xxxxxx -- unexpected continuation byte */ 413 mbstate_t mb = { }; 414 char32_t ch = _str_decode(str, offset, size, &mb); 415 416 if (ch == CHAR_INVALID) 179 417 return U_SPECIAL; 180 } 181 182 if (*offset + cbytes > size) 418 419 if (mb.state) 183 420 return U_SPECIAL; 184 185 char32_t ch = b0 & LO_MASK_8(b0_bits);186 187 /* Decode continuation bytes */188 while (cbytes > 0) {189 uint8_t b = (uint8_t) str[(*offset)++];190 191 /* Must be 10xxxxxx */192 if ((b & 0xc0) != 0x80)193 return U_SPECIAL;194 195 /* Shift data bits to ch */196 ch = (ch << CONT_BITS) | (char32_t) (b & LO_MASK_8(CONT_BITS));197 cbytes--;198 }199 421 200 422 return ch; … … 221 443 return 0; 222 444 223 size_t processed= 0;445 int cbytes = 0; 224 446 /* Continue while continuation bytes found */ 225 while (*offset > 0 && processed< 4) {447 while (*offset > 0 && cbytes < 4) { 226 448 uint8_t b = (uint8_t) str[--(*offset)]; 227 449 228 if (processed == 0 && (b & 0x80) == 0) { 229 /* 0xxxxxxx (Plain ASCII) */ 230 return b & 0x7f; 231 } else if ((b & 0xe0) == 0xc0 || (b & 0xf0) == 0xe0 || 232 (b & 0xf8) == 0xf0) { 233 /* Start byte */ 234 size_t start_offset = *offset; 235 return str_decode(str, 
&start_offset, size); 236 } else if ((b & 0xc0) != 0x80) { 237 /* Not a continuation byte */ 450 if (_is_continuation(b)) { 451 cbytes++; 452 continue; 453 } 454 455 /* Reject non-shortest form encoding. */ 456 if (cbytes != _continuation_bytes(b)) 238 457 return U_SPECIAL; 239 } 240 processed++; 241 } 458 459 /* Start byte */ 460 size_t start_offset = *offset; 461 return str_decode(str, &start_offset, size); 462 } 463 242 464 /* Too many continuation bytes */ 243 465 return U_SPECIAL; … … 259 481 * code was invalid. 260 482 */ 261 errno_t chr_encode(const char32_t ch, char *str, size_t *offset, size_t size) 262 { 483 errno_t chr_encode(char32_t ch, char *str, size_t *offset, size_t size) 484 { 485 // TODO: merge with c32rtomb() 486 263 487 if (*offset >= size) 264 488 return EOVERFLOW; 265 489 490 /* Fast exit for the most common case. */ 491 if (ch < 0x80) { 492 str[(*offset)++] = (char) ch; 493 return EOK; 494 } 495 496 /* Codes longer than 21 bits are not supported */ 266 497 if (!chr_check(ch)) 267 498 return EINVAL; 268 499 269 /*270 * Unsigned version of ch (bit operations should only be done271 * on unsigned types).272 */273 uint32_t cc = (uint32_t) ch;274 275 500 /* Determine how many continuation bytes are needed */ 276 501 277 unsigned int b0_bits; /* Data bits in first byte */ 278 unsigned int cbytes; /* Number of continuation bytes */ 279 280 if ((cc & ~LO_MASK_32(7)) == 0) { 281 b0_bits = 7; 282 cbytes = 0; 283 } else if ((cc & ~LO_MASK_32(11)) == 0) { 284 b0_bits = 5; 285 cbytes = 1; 286 } else if ((cc & ~LO_MASK_32(16)) == 0) { 287 b0_bits = 4; 288 cbytes = 2; 289 } else if ((cc & ~LO_MASK_32(21)) == 0) { 290 b0_bits = 3; 291 cbytes = 3; 292 } else { 293 /* Codes longer than 21 bits are not supported */ 294 return EINVAL; 295 } 502 unsigned int cbytes = _char_continuation_bytes(ch); 503 unsigned int b0_bits = 6 - cbytes; /* Data bits in first byte */ 296 504 297 505 /* Check for available space in buffer */ … … 302 510 unsigned int i; 303 511 for (i 
= cbytes; i > 0; i--) { 304 str[*offset + i] = 0x80 | (c c& LO_MASK_32(CONT_BITS));305 c c = cc >>CONT_BITS;512 str[*offset + i] = 0x80 | (ch & LO_MASK_32(CONT_BITS)); 513 ch >>= CONT_BITS; 306 514 } 307 515 308 516 /* Encode first byte */ 309 str[*offset] = (c c& LO_MASK_32(b0_bits)) | HI_MASK_8(8 - b0_bits - 1);517 str[*offset] = (ch & LO_MASK_32(b0_bits)) | HI_MASK_8(8 - b0_bits - 1); 310 518 311 519 /* Advance offset */ … … 315 523 } 316 524 525 /* Convert in place any bytes that don't form a valid character into U_SPECIAL. */ 526 static void _sanitize_string(char *str, size_t n) 527 { 528 uint8_t *b = (uint8_t *) str; 529 530 for (; *b && n > 0; b++, n--) { 531 int cont = _continuation_bytes(b[0]); 532 if (__builtin_expect(cont, 0) == 0) 533 continue; 534 535 if (cont < 0 || n <= (size_t) cont) { 536 b[0] = U_SPECIAL; 537 continue; 538 } 539 540 /* Check continuation bytes. */ 541 for (int i = 1; i <= cont; i++) { 542 if (!_is_continuation(b[i])) { 543 b[0] = U_SPECIAL; 544 continue; 545 } 546 } 547 548 /* 549 * Check for non-shortest form encoding. 550 * See https://www.unicode.org/versions/corrigendum1.html 551 */ 552 553 switch (cont) { 554 case 1: 555 /* 0b110!!!!x 0b10xxxxxx */ 556 if (!(b[0] & 0b00011110)) 557 b[0] = U_SPECIAL; 558 559 continue; 560 case 2: 561 /* 0b1110!!!! 0b10!xxxxx 0b10xxxxxx */ 562 if (!(b[0] & 0b00001111) && !(b[1] & 0b00100000)) 563 b[0] = U_SPECIAL; 564 565 continue; 566 case 3: 567 /* 0b11110!!! 0b10!!xxxx 0b10xxxxxx 0b10xxxxxx */ 568 if (!(b[0] & 0b00000111) && !(b[1] & 0b00110000)) 569 b[0] = U_SPECIAL; 570 571 continue; 572 } 573 } 574 } 575 576 static size_t _str_size(const char *str) 577 { 578 size_t size = 0; 579 580 while (*str++ != 0) 581 size++; 582 583 return size; 584 } 585 317 586 /** Get size of string. 
318 587 * … … 327 596 size_t str_size(const char *str) 328 597 { 329 size_t size = 0; 330 331 while (*str++ != 0) 332 size++; 333 334 return size; 598 return _str_size(str); 335 599 } 336 600 … … 378 642 } 379 643 644 static size_t _str_nsize(const char *str, size_t max_size) 645 { 646 size_t size = 0; 647 648 while ((*str++ != 0) && (size < max_size)) 649 size++; 650 651 return size; 652 } 653 380 654 /** Get size of string with size limit. 381 655 * … … 391 665 size_t str_nsize(const char *str, size_t max_size) 392 666 { 393 size_t size = 0; 394 395 while ((*str++ != 0) && (size < max_size)) 396 size++; 397 398 return size; 667 return _str_nsize(str, max_size); 399 668 } 400 669 … … 582 851 int str_cmp(const char *s1, const char *s2) 583 852 { 584 char32_t c1 = 0; 585 char32_t c2 = 0; 586 587 size_t off1 = 0; 588 size_t off2 = 0; 589 590 while (true) { 591 c1 = str_decode(s1, &off1, STR_NO_LIMIT); 592 c2 = str_decode(s2, &off2, STR_NO_LIMIT); 593 594 if (c1 < c2) 595 return -1; 596 597 if (c1 > c2) 598 return 1; 599 600 if (c1 == 0 || c2 == 0) 601 break; 602 } 603 604 return 0; 853 /* 854 * UTF-8 has the nice property that lexicographic ordering on bytes is 855 * the same as the lexicographic ordering of the character sequences. 856 */ 857 while (*s1 == *s2 && *s1 != 0) { 858 s1++; 859 s2++; 860 } 861 862 if (*s1 == *s2) 863 return 0; 864 865 return (*s1 < *s2) ? 
-1 : 1; 605 866 } 606 867 … … 681 942 int str_casecmp(const char *s1, const char *s2) 682 943 { 944 // FIXME: doesn't work for non-ASCII caseful characters 945 683 946 char32_t c1 = 0; 684 947 char32_t c2 = 0; … … 729 992 int str_lcasecmp(const char *s1, const char *s2, size_t max_len) 730 993 { 994 // FIXME: doesn't work for non-ASCII caseful characters 995 731 996 char32_t c1 = 0; 732 997 char32_t c2 = 0; … … 760 1025 } 761 1026 1027 static bool _test_prefix(const char *s, const char *p) 1028 { 1029 while (*s == *p && *s != 0) { 1030 s++; 1031 p++; 1032 } 1033 1034 return *p == 0; 1035 } 1036 762 1037 /** Test whether p is a prefix of s. 763 1038 * … … 773 1048 bool str_test_prefix(const char *s, const char *p) 774 1049 { 775 char32_t c1 = 0; 776 char32_t c2 = 0; 777 778 size_t off1 = 0; 779 size_t off2 = 0; 780 781 while (true) { 782 c1 = str_decode(s, &off1, STR_NO_LIMIT); 783 c2 = str_decode(p, &off2, STR_NO_LIMIT); 784 785 if (c2 == 0) 786 return true; 787 788 if (c1 != c2) 789 return false; 790 791 if (c1 == 0) 792 break; 793 } 794 795 return false; 1050 return _test_prefix(s, p); 796 1051 } 797 1052 … … 820 1075 821 1076 return s + off; 1077 } 1078 1079 /** Copy string as a sequence of bytes. */ 1080 static void _str_cpy(char *dest, const char *src) 1081 { 1082 while (*src) 1083 *(dest++) = *(src++); 1084 1085 *dest = 0; 1086 } 1087 1088 /** Copy string as a sequence of bytes. 
*/ 1089 static void _str_cpyn(char *dest, size_t size, const char *src) 1090 { 1091 assert(dest && src && size); 1092 1093 if (!dest || !src || !size) 1094 return; 1095 1096 if (size == STR_NO_LIMIT) 1097 return _str_cpy(dest, src); 1098 1099 char *dest_top = dest + size - 1; 1100 assert(size == 1 || dest < dest_top); 1101 1102 while (*src && dest < dest_top) 1103 *(dest++) = *(src++); 1104 1105 *dest = 0; 822 1106 } 823 1107 … … 839 1123 assert(size > 0); 840 1124 assert(src != NULL); 841 842 size_t src_off = 0; 843 size_t dest_off = 0; 844 845 char32_t ch; 846 while ((ch = str_decode(src, &src_off, STR_NO_LIMIT)) != 0) { 847 if (chr_encode(ch, dest, &dest_off, size - 1) != EOK) 848 break; 849 } 850 851 dest[dest_off] = '\0'; 1125 assert(dest != NULL); 1126 assert(size == STR_NO_LIMIT || dest + size > dest); 1127 1128 /* Copy data. */ 1129 _str_cpyn(dest, size, src); 1130 1131 /* In-place translate invalid bytes to U_SPECIAL. */ 1132 _sanitize_string(dest, size); 852 1133 } 853 1134 … … 872 1153 /* There must be space for a null terminator in the buffer. */ 873 1154 assert(size > 0); 874 875 size_t src_off = 0; 876 size_t dest_off = 0; 877 878 char32_t ch; 879 while ((ch = str_decode(src, &src_off, n)) != 0) { 880 if (chr_encode(ch, dest, &dest_off, size - 1) != EOK) 881 break; 882 } 883 884 dest[dest_off] = '\0'; 1155 assert(src != NULL); 1156 1157 /* Copy data. */ 1158 _str_cpyn(dest, min(size, n + 1), src); 1159 1160 /* In-place translate invalid bytes to U_SPECIAL. 
*/ 1161 _sanitize_string(dest, size); 885 1162 } 886 1163 … … 898 1175 void str_append(char *dest, size_t size, const char *src) 899 1176 { 900 size_t dstr_size; 901 902 dstr_size = str_size(dest); 903 if (dstr_size >= size) 904 return; 905 906 str_cpy(dest + dstr_size, size - dstr_size, src); 1177 assert(src != NULL); 1178 assert(dest != NULL); 1179 assert(size > 0); 1180 assert(size == STR_NO_LIMIT || dest + size > dest); 1181 1182 size_t dstr_size = _str_nsize(dest, size); 1183 if (dstr_size < size) { 1184 _str_cpyn(dest + dstr_size, size - dstr_size, src); 1185 _sanitize_string(dest + dstr_size, size - dstr_size); 1186 } 907 1187 } 908 1188 … … 933 1213 errno_t spascii_to_str(char *dest, size_t size, const uint8_t *src, size_t n) 934 1214 { 935 size_t sidx; 936 size_t didx; 937 size_t dlast; 938 uint8_t byte; 939 errno_t rc; 940 errno_t result; 941 942 /* There must be space for a null terminator in the buffer. */ 943 assert(size > 0); 944 result = EOK; 945 946 didx = 0; 947 dlast = 0; 948 for (sidx = 0; sidx < n; ++sidx) { 949 byte = src[sidx]; 950 if (!ascii_check(byte)) { 951 byte = U_SPECIAL; 1215 size_t len = 0; 1216 1217 /* Determine the length of the source string. 
*/ 1218 for (size_t i = 0; i < n; i++) { 1219 if (src[i] == 0) 1220 break; 1221 1222 if (src[i] != ' ') 1223 len = i + 1; 1224 } 1225 1226 errno_t result = EOK; 1227 size_t out_len = min(len, size - 1); 1228 1229 /* Copy characters */ 1230 for (size_t i = 0; i < out_len; i++) { 1231 dest[i] = src[i]; 1232 1233 if (dest[i] < 0) { 1234 dest[i] = U_SPECIAL; 952 1235 result = EIO; 953 1236 } 954 955 rc = chr_encode(byte, dest, &didx, size - 1); 956 if (rc != EOK) { 957 assert(rc == EOVERFLOW); 958 dest[didx] = '\0'; 959 return rc; 960 } 961 962 /* Remember dest index after last non-empty character */ 963 if (byte != 0x20) 964 dlast = didx; 965 } 966 967 /* Terminate string after last non-empty character */ 968 dest[dlast] = '\0'; 1237 } 1238 1239 dest[out_len] = 0; 1240 1241 if (out_len < len) 1242 return EOVERFLOW; 1243 969 1244 return result; 970 1245 } … … 1207 1482 } 1208 1483 1484 static char *_strchr(const char *str, char c) 1485 { 1486 while (*str != 0 && *str != c) 1487 str++; 1488 1489 return (*str == c) ? (char *) str : NULL; 1490 } 1491 1209 1492 /** Find first occurrence of character in string. 1210 1493 * … … 1216 1499 char *str_chr(const char *str, char32_t ch) 1217 1500 { 1218 char32_t acc; 1219 size_t off = 0; 1220 size_t last = 0; 1221 1222 while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) { 1223 if (acc == ch) 1224 return (char *) (str + last); 1225 last = off; 1501 /* Fast path for an ASCII character. */ 1502 if (ascii_check(ch)) 1503 return _strchr(str, ch); 1504 1505 /* Convert character to UTF-8. */ 1506 char utf8[STR_BOUNDS(1) + 1]; 1507 size_t offset = 0; 1508 1509 if (chr_encode(ch, utf8, &offset, sizeof(utf8)) != EOK || offset == 0) 1510 return NULL; 1511 1512 utf8[offset] = '\0'; 1513 1514 /* Find the first byte, then check if all of them are correct. 
*/ 1515 while (*str != 0) { 1516 str = _strchr(str, utf8[0]); 1517 if (!str) 1518 return NULL; 1519 1520 if (_test_prefix(str, utf8)) 1521 return (char *) str; 1522 1523 str++; 1226 1524 } 1227 1525 … … 1238 1536 char *str_str(const char *hs, const char *n) 1239 1537 { 1240 size_t off = 0; 1241 1242 if (str_lcmp(hs, n, str_length(n)) == 0) 1243 return (char *)hs; 1244 1245 while (str_decode(hs, &off, STR_NO_LIMIT) != 0) { 1246 if (str_lcmp(hs + off, n, str_length(n)) == 0) 1247 return (char *)(hs + off); 1538 size_t hsize = _str_size(hs); 1539 size_t nsize = _str_size(n); 1540 1541 while (hsize >= nsize) { 1542 if (_test_prefix(hs, n)) 1543 return (char *) hs; 1544 1545 hs++; 1546 hsize--; 1248 1547 } 1249 1548 1250 1549 return NULL; 1550 } 1551 1552 static void _str_rtrim(char *str, char c) 1553 { 1554 char *last = str; 1555 1556 while (*str) { 1557 if (*str != c) 1558 last = str; 1559 1560 str++; 1561 } 1562 1563 /* Truncate string. */ 1564 last[1] = 0; 1251 1565 } 1252 1566 … … 1258 1572 void str_rtrim(char *str, char32_t ch) 1259 1573 { 1574 /* Fast path for the ASCII case. */ 1575 if (ascii_check(ch)) { 1576 _str_rtrim(str, ch); 1577 return; 1578 } 1579 1260 1580 size_t off = 0; 1261 1581 size_t pos = 0; … … 1279 1599 } 1280 1600 1601 static void _str_ltrim(char *str, char c) 1602 { 1603 char *p = str; 1604 1605 while (*p == c) 1606 p++; 1607 1608 if (str != p) 1609 _str_cpy(str, p); 1610 } 1611 1281 1612 /** Removes specified leading characters from a string. 1282 1613 * … … 1286 1617 void str_ltrim(char *str, char32_t ch) 1287 1618 { 1619 /* Fast path for the ASCII case. 
*/ 1620 if (ascii_check(ch)) { 1621 _str_ltrim(str, ch); 1622 return; 1623 } 1624 1288 1625 char32_t acc; 1289 1626 size_t off = 0; … … 1305 1642 } 1306 1643 1644 static char *_str_rchr(const char *str, char c) 1645 { 1646 const char *last = NULL; 1647 1648 while (*str) { 1649 if (*str == c) 1650 last = str; 1651 1652 str++; 1653 } 1654 1655 return (char *) last; 1656 } 1657 1307 1658 /** Find last occurrence of character in string. 1308 1659 * … … 1314 1665 char *str_rchr(const char *str, char32_t ch) 1315 1666 { 1667 if (ascii_check(ch)) 1668 return _str_rchr(str, ch); 1669 1316 1670 char32_t acc; 1317 1671 size_t off = 0; … … 1402 1756 char *str_dup(const char *src) 1403 1757 { 1404 size_t size = str_size(src) + 1;1758 size_t size = _str_size(src) + 1; 1405 1759 char *dest = malloc(size); 1406 1760 if (!dest) 1407 1761 return NULL; 1408 1762 1409 str_cpy(dest, size, src); 1763 memcpy(dest, src, size); 1764 _sanitize_string(dest, size); 1410 1765 return dest; 1411 1766 } … … 1433 1788 char *str_ndup(const char *src, size_t n) 1434 1789 { 1435 size_t size = str_size(src); 1436 if (size > n) 1437 size = n; 1790 size_t size = _str_nsize(src, n); 1438 1791 1439 1792 char *dest = malloc(size + 1); … … 1441 1794 return NULL; 1442 1795 1443 str_ncpy(dest, size + 1, src, size); 1796 memcpy(dest, src, size); 1797 _sanitize_string(dest, size); 1798 dest[size] = 0; 1444 1799 return dest; 1445 1800 }
Note:
See TracChangeset
for help on using the changeset viewer.