Changeset 28c39f3 in mainline for common/str.c
- Timestamp:
- 2025-04-13T16:56:27Z (10 days ago)
- Branches:
- master
- Children:
- 240b2e4, 97f6b71
- Parents:
- f167c851
- git-author:
- Jiří Zárevúcky <zarevucky.jiri@…> (2025-04-13 16:32:33)
- git-committer:
- Jiří Zárevúcky <zarevucky.jiri@…> (2025-04-13 16:56:27)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
common/str.c
rf167c851 r28c39f3 54 54 * are valid 55 55 * 56 * Note that Unicode characters do not match 57 * one-to-one with displayed characters or glyphs on 58 * screen. For that level of precision, look up 59 * Grapheme Clusters. 60 * 56 61 * ASCII character 7 bit encoded ASCII character, stored in char 57 62 * (usually signed 8 bit integer), code points 0 .. 127 … … 71 76 * [wide] string width number of display cells on a monospace display taken 72 77 * by a [wide] string, size_t 78 * 79 * This is virtually impossible to determine exactly for 80 * all strings without knowing specifics of the display 81 * device, due to various factors affecting text output. 82 * If you have the option to query the terminal for 83 * position change caused by outputting the string, 84 * it is preferrable to determine width that way. 73 85 * 74 86 * … … 108 120 #include <str.h> 109 121 122 #include <align.h> 110 123 #include <assert.h> 111 124 #include <ctype.h> 112 125 #include <errno.h> 126 #include <macros.h> 127 #include <mem.h> 113 128 #include <stdbool.h> 114 129 #include <stddef.h> 115 130 #include <stdint.h> 116 131 #include <stdlib.h> 117 118 #include <align.h> 119 #include <mem.h> 132 #include <uchar.h> 120 133 121 134 /** Byte mask consisting of lowest @n bits (out of 8) */ … … 130 143 /** Number of data bits in a UTF-8 continuation byte */ 131 144 #define CONT_BITS 6 145 146 static inline bool _is_ascii(uint8_t b) 147 { 148 return b < 0x80; 149 } 150 151 static inline bool _is_continuation_byte(uint8_t b) 152 { 153 return (b & 0xc0) == 0x80; 154 } 155 156 static inline int _char_continuation_bytes(char32_t c) 157 { 158 if ((c & ~LO_MASK_32(11)) == 0) 159 return 1; 160 161 if ((c & ~LO_MASK_32(16)) == 0) 162 return 2; 163 164 if ((c & ~LO_MASK_32(21)) == 0) 165 return 3; 166 167 /* Codes longer than 21 bits are not supported */ 168 return -1; 169 } 170 171 static inline int _continuation_bytes(uint8_t b) 172 { 173 /* 0xxxxxxx */ 174 if (_is_ascii(b)) 175 return 0; 176 177 /* 110xxxxx 10xxxxxx */ 178 if ((b & 0xe0) == 0xc0) 179 return 1; 180 181 /* 1110xxxx 10xxxxxx 10xxxxxx */ 182 if ((b & 0xf0) == 0xe0) 183 return 2; 184 185 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ 186 if ((b & 0xf8) == 0xf0) 187 return 3; 188 189 return -1; 190 } 132 191 133 192 /** Decode a single character from a string. … … 154 213 uint8_t b0 = (uint8_t) str[(*offset)++]; 155 214 215 /* Fast exit for the most common case. */ 216 if (_is_ascii(b0)) 217 return b0; 218 219 /* 10xxxxxx -- unexpected continuation byte */ 220 if (_is_continuation_byte(b0)) 221 return U_SPECIAL; 222 156 223 /* Determine code length */ 157 224 158 unsigned int b0_bits; /* Data bits in first byte */ 159 unsigned int cbytes; /* Number of continuation bytes */ 160 161 if ((b0 & 0x80) == 0) { 162 /* 0xxxxxxx (Plain ASCII) */ 163 b0_bits = 7; 164 cbytes = 0; 165 } else if ((b0 & 0xe0) == 0xc0) { 166 /* 110xxxxx 10xxxxxx */ 167 b0_bits = 5; 168 cbytes = 1; 169 } else if ((b0 & 0xf0) == 0xe0) { 170 /* 1110xxxx 10xxxxxx 10xxxxxx */ 171 b0_bits = 4; 172 cbytes = 2; 173 } else if ((b0 & 0xf8) == 0xf0) { 174 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ 175 b0_bits = 3; 176 cbytes = 3; 177 } else { 178 /* 10xxxxxx -- unexpected continuation byte */ 179 return U_SPECIAL; 180 } 225 unsigned int cbytes = _continuation_bytes(b0); 226 unsigned int b0_bits = 6 - cbytes; /* Data bits in first byte */ 181 227 182 228 if (*offset + cbytes > size) … … 189 235 uint8_t b = (uint8_t) str[(*offset)++]; 190 236 191 /* Must be 10xxxxxx */ 192 if ((b & 0xc0) != 0x80) 237 if (!_is_continuation_byte(b)) 193 238 return U_SPECIAL; 194 239 … … 221 266 return 0; 222 267 223 size_t processed= 0;268 int cbytes = 0; 224 269 /* Continue while continuation bytes found */ 225 while (*offset > 0 && processed< 4) {270 while (*offset > 0 && cbytes < 4) { 226 271 uint8_t b = (uint8_t) str[--(*offset)]; 227 272 228 if (processed == 0 && (b & 0x80) == 0) { 229 /* 0xxxxxxx (Plain ASCII) */ 230 return b & 0x7f; 231 } else if ((b & 0xe0) == 0xc0 || (b & 0xf0) == 0xe0 || 232 (b & 0xf8) == 0xf0) { 233 /* Start byte */ 234 size_t start_offset = *offset; 235 return str_decode(str, &start_offset, size); 236 } else if ((b & 0xc0) != 0x80) { 237 /* Not a continuation byte */ 273 if (_is_continuation_byte(b)) { 274 cbytes++; 275 continue; 276 } 277 278 /* Invalid byte. */ 279 if (cbytes != _continuation_bytes(b)) 238 280 return U_SPECIAL; 239 } 240 processed++; 241 } 281 282 /* Start byte */ 283 size_t start_offset = *offset; 284 return str_decode(str, &start_offset, size); 285 } 286 242 287 /* Too many continuation bytes */ 243 288 return U_SPECIAL; … … 259 304 * code was invalid. 260 305 */ 261 errno_t chr_encode(c onst char32_t ch, char *str, size_t *offset, size_t size)306 errno_t chr_encode(char32_t ch, char *str, size_t *offset, size_t size) 262 307 { 263 308 if (*offset >= size) 264 309 return EOVERFLOW; 265 310 311 /* Fast exit for the most common case. */ 312 if (ch < 0x80) { 313 str[(*offset)++] = (char) ch; 314 return EOK; 315 } 316 317 /* Codes longer than 21 bits are not supported */ 266 318 if (!chr_check(ch)) 267 319 return EINVAL; 268 320 269 /*270 * Unsigned version of ch (bit operations should only be done271 * on unsigned types).272 */273 uint32_t cc = (uint32_t) ch;274 275 321 /* Determine how many continuation bytes are needed */ 276 322 277 unsigned int b0_bits; /* Data bits in first byte */ 278 unsigned int cbytes; /* Number of continuation bytes */ 279 280 if ((cc & ~LO_MASK_32(7)) == 0) { 281 b0_bits = 7; 282 cbytes = 0; 283 } else if ((cc & ~LO_MASK_32(11)) == 0) { 284 b0_bits = 5; 285 cbytes = 1; 286 } else if ((cc & ~LO_MASK_32(16)) == 0) { 287 b0_bits = 4; 288 cbytes = 2; 289 } else if ((cc & ~LO_MASK_32(21)) == 0) { 290 b0_bits = 3; 291 cbytes = 3; 292 } else { 293 /* Codes longer than 21 bits are not supported */ 294 return EINVAL; 295 } 323 unsigned int cbytes = _char_continuation_bytes(ch); 324 unsigned int b0_bits = 6 - cbytes; /* Data bits in first byte */ 296 325 297 326 /* Check for available space in buffer */ … … 302 331 unsigned int i; 303 332 for (i = cbytes; i > 0; i--) { 304 str[*offset + i] = 0x80 | (c c& LO_MASK_32(CONT_BITS));305 c c = cc >>CONT_BITS;333 str[*offset + i] = 0x80 | (ch & LO_MASK_32(CONT_BITS)); 334 ch >>= CONT_BITS; 306 335 } 307 336 308 337 /* Encode first byte */ 309 str[*offset] = (c c& LO_MASK_32(b0_bits)) | HI_MASK_8(8 - b0_bits - 1);338 str[*offset] = (ch & LO_MASK_32(b0_bits)) | HI_MASK_8(8 - b0_bits - 1); 310 339 311 340 /* Advance offset */ … … 315 344 } 316 345 346 /* Convert in place any bytes that don't form a valid character into U_SPECIAL. */ 347 static void _repair_string(char *str, size_t n) 348 { 349 for (; *str && n > 0; str++, n--) { 350 int cont = _continuation_bytes(*str); 351 if (cont == 0) 352 continue; 353 354 if (cont < 0 || n <= (size_t) cont) { 355 *str = U_SPECIAL; 356 continue; 357 } 358 359 for (int i = 1; i <= cont; i++) { 360 if (!_is_continuation_byte(str[i])) { 361 *str = U_SPECIAL; 362 continue; 363 } 364 } 365 } 366 } 367 368 static size_t _str_size(const char *str) 369 { 370 size_t size = 0; 371 372 while (*str++ != 0) 373 size++; 374 375 return size; 376 } 377 317 378 /** Get size of string. 318 379 * … … 327 388 size_t str_size(const char *str) 328 389 { 329 size_t size = 0; 330 331 while (*str++ != 0) 332 size++; 333 334 return size; 390 return _str_size(str); 335 391 } 336 392 … … 378 434 } 379 435 436 static size_t _str_nsize(const char *str, size_t max_size) 437 { 438 size_t size = 0; 439 440 while ((*str++ != 0) && (size < max_size)) 441 size++; 442 443 return size; 444 } 445 380 446 /** Get size of string with size limit. 381 447 * … … 391 457 size_t str_nsize(const char *str, size_t max_size) 392 458 { 393 size_t size = 0; 394 395 while ((*str++ != 0) && (size < max_size)) 396 size++; 397 398 return size; 459 return _str_nsize(str, max_size); 399 460 } 400 461 … … 582 643 int str_cmp(const char *s1, const char *s2) 583 644 { 584 char32_t c1 = 0; 585 char32_t c2 = 0; 586 587 size_t off1 = 0; 588 size_t off2 = 0; 589 590 while (true) { 591 c1 = str_decode(s1, &off1, STR_NO_LIMIT); 592 c2 = str_decode(s2, &off2, STR_NO_LIMIT); 593 594 if (c1 < c2) 595 return -1; 596 597 if (c1 > c2) 598 return 1; 599 600 if (c1 == 0 || c2 == 0) 601 break; 602 } 603 604 return 0; 645 /* 646 * UTF-8 has the nice property that lexicographic ordering on bytes is 647 * the same as the lexicographic ordering of the character sequences. 648 */ 649 while (*s1 == *s2 && *s1 != 0) { 650 s1++; 651 s2++; 652 } 653 654 if (*s1 == *s2) 655 return 0; 656 657 return (*s1 < *s2) ? -1 : 1; 605 658 } 606 659 … … 681 734 int str_casecmp(const char *s1, const char *s2) 682 735 { 736 // FIXME: doesn't work for non-ASCII caseful characters 737 683 738 char32_t c1 = 0; 684 739 char32_t c2 = 0; … … 729 784 int str_lcasecmp(const char *s1, const char *s2, size_t max_len) 730 785 { 786 // FIXME: doesn't work for non-ASCII caseful characters 787 731 788 char32_t c1 = 0; 732 789 char32_t c2 = 0; … … 760 817 } 761 818 819 static bool _test_prefix(const char *s, const char *p) 820 { 821 while (*s == *p && *s != 0) { 822 s++; 823 p++; 824 } 825 826 return *p == 0; 827 } 828 762 829 /** Test whether p is a prefix of s. 763 830 * … … 773 840 bool str_test_prefix(const char *s, const char *p) 774 841 { 775 char32_t c1 = 0; 776 char32_t c2 = 0; 777 778 size_t off1 = 0; 779 size_t off2 = 0; 780 781 while (true) { 782 c1 = str_decode(s, &off1, STR_NO_LIMIT); 783 c2 = str_decode(p, &off2, STR_NO_LIMIT); 784 785 if (c2 == 0) 786 return true; 787 788 if (c1 != c2) 789 return false; 790 791 if (c1 == 0) 792 break; 793 } 794 795 return false; 842 return _test_prefix(s, p); 796 843 } 797 844 … … 820 867 821 868 return s + off; 869 } 870 871 /** Copy string as a sequence of bytes. */ 872 static void _str_cpy(char *dest, const char *src) 873 { 874 while (*src) 875 *(dest++) = *(src++); 876 877 *dest = 0; 878 } 879 880 /** Copy string as a sequence of bytes. */ 881 static void _str_cpyn(char *dest, size_t size, const char *src) 882 { 883 char *dest_top = dest + size - 1; 884 885 while (*src && dest < dest_top) 886 *(dest++) = *(src++); 887 888 *dest = 0; 822 889 } 823 890 … … 839 906 assert(size > 0); 840 907 assert(src != NULL); 841 842 size_t src_off = 0; 843 size_t dest_off = 0; 844 845 char32_t ch; 846 while ((ch = str_decode(src, &src_off, STR_NO_LIMIT)) != 0) { 847 if (chr_encode(ch, dest, &dest_off, size - 1) != EOK) 848 break; 849 } 850 851 dest[dest_off] = '\0'; 908 assert(dest != NULL); 909 910 /* Copy data. */ 911 _str_cpyn(dest, size, src); 912 913 /* In-place translate invalid bytes to U_SPECIAL. */ 914 _repair_string(dest, size); 852 915 } 853 916 … … 872 935 /* There must be space for a null terminator in the buffer. */ 873 936 assert(size > 0); 874 875 size_t src_off = 0; 876 size_t dest_off = 0; 877 878 char32_t ch; 879 while ((ch = str_decode(src, &src_off, n)) != 0) { 880 if (chr_encode(ch, dest, &dest_off, size - 1) != EOK) 881 break; 882 } 883 884 dest[dest_off] = '\0'; 937 assert(src != NULL); 938 939 /* Copy data. */ 940 _str_cpyn(dest, min(size, n + 1), src); 941 942 /* In-place translate invalid bytes to U_SPECIAL. */ 943 _repair_string(dest, size); 885 944 } 886 945 … … 898 957 void str_append(char *dest, size_t size, const char *src) 899 958 { 900 size_t dstr_size;901 902 dstr_size = str_size(dest);903 if (dstr_size >= size) 904 return;905 906 str_cpy(dest + dstr_size, size - dstr_size, src);959 assert(src != NULL); 960 assert(dest != NULL); 961 assert(size > 0); 962 963 size_t dstr_size = _str_nsize(dest, size); 964 _str_cpyn(dest + dstr_size, size - dstr_size, src); 965 _repair_string(dest + dstr_size, size - dstr_size); 907 966 } 908 967 … … 933 992 errno_t spascii_to_str(char *dest, size_t size, const uint8_t *src, size_t n) 934 993 { 935 size_t sidx; 936 size_t didx; 937 size_t dlast; 938 uint8_t byte; 939 errno_t rc; 940 errno_t result; 941 942 /* There must be space for a null terminator in the buffer. */ 943 assert(size > 0); 944 result = EOK; 945 946 didx = 0; 947 dlast = 0; 948 for (sidx = 0; sidx < n; ++sidx) { 949 byte = src[sidx]; 950 if (!ascii_check(byte)) { 951 byte = U_SPECIAL; 994 size_t len = 0; 995 996 /* Determine the length of the source string. */ 997 for (size_t i = 0; i < n; i++) { 998 if (src[i] == 0) 999 break; 1000 1001 if (src[i] != ' ') 1002 len = i + 1; 1003 } 1004 1005 errno_t result = EOK; 1006 size_t out_len = min(len, size - 1); 1007 1008 /* Copy characters */ 1009 for (size_t i = 0; i < out_len; i++) { 1010 dest[i] = src[i]; 1011 1012 if (dest[i] < 0) { 1013 dest[i] = U_SPECIAL; 952 1014 result = EIO; 953 1015 } 954 955 rc = chr_encode(byte, dest, &didx, size - 1); 956 if (rc != EOK) { 957 assert(rc == EOVERFLOW); 958 dest[didx] = '\0'; 959 return rc; 960 } 961 962 /* Remember dest index after last non-empty character */ 963 if (byte != 0x20) 964 dlast = didx; 965 } 966 967 /* Terminate string after last non-empty character */ 968 dest[dlast] = '\0'; 1016 } 1017 1018 dest[out_len] = 0; 1019 1020 if (out_len < len) 1021 return EOVERFLOW; 1022 969 1023 return result; 970 1024 } … … 1207 1261 } 1208 1262 1263 static char *_strchr(const char *str, char c) 1264 { 1265 while (*str != 0 && *str != c) 1266 str++; 1267 1268 return (*str == c) ? (char *) str : NULL; 1269 } 1270 1209 1271 /** Find first occurence of character in string. 1210 1272 * … … 1216 1278 char *str_chr(const char *str, char32_t ch) 1217 1279 { 1218 char32_t acc; 1219 size_t off = 0; 1220 size_t last = 0; 1221 1222 while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) { 1223 if (acc == ch) 1224 return (char *) (str + last); 1225 last = off; 1280 /* Fast path for an ASCII character. */ 1281 if (ascii_check(ch)) 1282 return _strchr(str, ch); 1283 1284 /* Convert character to UTF-8. */ 1285 char utf8[STR_BOUNDS(1) + 1]; 1286 size_t offset = 0; 1287 1288 if (chr_encode(ch, utf8, &offset, sizeof(utf8)) != EOK || offset == 0) 1289 return NULL; 1290 1291 utf8[offset] = '\0'; 1292 1293 /* Find the first byte, then check if all of them are correct. */ 1294 while (*str != 0) { 1295 str = _strchr(str, utf8[0]); 1296 if (!str) 1297 return NULL; 1298 1299 if (_test_prefix(str, utf8)) 1300 return (char *) str; 1301 1302 str++; 1226 1303 } 1227 1304 … … 1238 1315 char *str_str(const char *hs, const char *n) 1239 1316 { 1240 size_t off = 0; 1241 1242 if (str_lcmp(hs, n, str_length(n)) == 0) 1243 return (char *)hs; 1244 1245 while (str_decode(hs, &off, STR_NO_LIMIT) != 0) { 1246 if (str_lcmp(hs + off, n, str_length(n)) == 0) 1247 return (char *)(hs + off); 1317 size_t hsize = _str_size(hs); 1318 size_t nsize = _str_size(n); 1319 1320 while (hsize >= nsize) { 1321 if (_test_prefix(hs, n)) 1322 return (char *) hs; 1323 1324 hs++; 1325 hsize--; 1248 1326 } 1249 1327 1250 1328 return NULL; 1329 } 1330 1331 static void _str_rtrim(char *str, char c) 1332 { 1333 char *last = str; 1334 1335 while (*str) { 1336 if (*str != c) 1337 last = str; 1338 1339 str++; 1340 } 1341 1342 /* Truncate string. */ 1343 last[1] = 0; 1251 1344 } 1252 1345 … … 1258 1351 void str_rtrim(char *str, char32_t ch) 1259 1352 { 1353 /* Fast path for the ASCII case. */ 1354 if (ascii_check(ch)) { 1355 _str_rtrim(str, ch); 1356 return; 1357 } 1358 1260 1359 size_t off = 0; 1261 1360 size_t pos = 0; … … 1279 1378 } 1280 1379 1380 static void _str_ltrim(char *str, char c) 1381 { 1382 char *p = str; 1383 1384 while (*p == c) 1385 p++; 1386 1387 if (str != p) 1388 _str_cpy(str, p); 1389 } 1390 1281 1391 /** Removes specified leading characters from a string. 1282 1392 * … … 1286 1396 void str_ltrim(char *str, char32_t ch) 1287 1397 { 1398 /* Fast path for the ASCII case. */ 1399 if (ascii_check(ch)) { 1400 _str_ltrim(str, ch); 1401 return; 1402 } 1403 1288 1404 char32_t acc; 1289 1405 size_t off = 0; … … 1305 1421 } 1306 1422 1423 static char *_str_rchr(const char *str, char c) 1424 { 1425 const char *last = NULL; 1426 1427 while (*str) { 1428 if (*str == c) 1429 last = str; 1430 1431 str++; 1432 } 1433 1434 return (char *) last; 1435 } 1436 1307 1437 /** Find last occurence of character in string. 1308 1438 * … … 1314 1444 char *str_rchr(const char *str, char32_t ch) 1315 1445 { 1446 if (ascii_check(ch)) 1447 return _str_rchr(str, ch); 1448 1316 1449 char32_t acc; 1317 1450 size_t off = 0; … … 1402 1535 char *str_dup(const char *src) 1403 1536 { 1404 size_t size = str_size(src) + 1;1537 size_t size = _str_size(src) + 1; 1405 1538 char *dest = malloc(size); 1406 1539 if (!dest) 1407 1540 return NULL; 1408 1541 1409 str_cpy(dest, size, src); 1542 _str_cpy(dest, src); 1543 _repair_string(dest, size); 1410 1544 return dest; 1411 1545 } … … 1433 1567 char *str_ndup(const char *src, size_t n) 1434 1568 { 1435 size_t size = str_size(src); 1436 if (size > n) 1437 size = n; 1438 1439 char *dest = malloc(size + 1); 1569 size_t size = _str_nsize(src, n) + 1; 1570 1571 char *dest = malloc(size); 1440 1572 if (!dest) 1441 1573 return NULL; 1442 1574 1443 str_ncpy(dest, size + 1, src, size); 1575 _str_cpyn(dest, size, src); 1576 _repair_string(dest, size); 1444 1577 return dest; 1445 1578 }
Note:
See TracChangeset
for help on using the changeset viewer.