Changeset 65bf084 in mainline for common/stdc/uchar.c
- Timestamp:
- 2025-04-15T18:53:31Z (19 hours ago)
- Branches:
- master
- Children:
- bfee444, f444633
- Parents:
- f94a11f
- git-author:
- Jiří Zárevúcky <zarevucky.jiri@…> (2025-04-15 18:42:32)
- git-committer:
- Jiří Zárevúcky <zarevucky.jiri@…> (2025-04-15 18:53:31)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
common/stdc/uchar.c
rf94a11f r65bf084 59 59 } 60 60 61 static bool _is_continuation(uint8_t c)62 {63 return (c & 0xC0) == 0x80;64 }65 66 static bool _is_1_byte(uint8_t c)67 {68 return (c & 0x80) == 0;69 }70 71 static bool _is_2_byte(uint8_t c)72 {73 return (c & 0xE0) == 0xC0;74 }75 76 static bool _is_3_byte(uint8_t c)77 {78 return (c & 0xF0) == 0xE0;79 }80 81 static bool _is_4_byte(uint8_t c)82 {83 return (c & 0xF8) == 0xF0;84 }85 86 static bool _is_non_shortest(unsigned short cont, uint8_t b)87 {88 return (cont == 0b1111110000000000 && !(b & 0b00100000)) ||89 (cont == 0b1111111111110000 && !(b & 0b00110000));90 }91 92 size_t mbrtoc32(char32_t *c, const char *s, size_t n, mbstate_t *mb)93 {94 #if __STDC_HOSTED__95 static fibril_local mbstate_t global_state = { };96 97 if (!mb)98 mb = &global_state;99 #else100 assert(mb);101 #endif102 103 if (n == 0)104 return UCHAR_INCOMPLETE;105 106 char32_t dummy;107 108 if (!c)109 c = &dummy;110 111 if (!s) {112 // Equivalent to mbrtoc32(NULL, "", 1, mb).113 if (mb->state) {114 _set_ilseq();115 return UCHAR_ILSEQ;116 } else {117 return 0;118 }119 }120 121 size_t i = 0;122 123 if (!mb->state) {124 /* Clean slate, read initial byte. */125 126 uint8_t b = s[i++];127 128 if (_is_1_byte(b)) {129 *c = b;130 return b == 0 ? 0 : 1;131 }132 133 if (_is_continuation(b)) {134 /* unexpected continuation byte */135 _set_ilseq();136 return UCHAR_ILSEQ;137 }138 139 /*140 * The value stored into `continuation` is designed to have141 * just enough leading ones that after shifting in one less than142 * the expected number of continuation bytes, the most significant143 * bit becomes zero. (The field is 16b wide.)144 */145 146 if (_is_2_byte(b)) {147 /* Reject non-shortest form. */148 if (!(b & 0b00011110)) {149 _set_ilseq();150 return UCHAR_ILSEQ;151 }152 153 /* 2 byte encoding 110xxxxx */154 mb->state = b ^ 0b0000000011000000;155 156 } else if (_is_3_byte(b)) {157 /* 3 byte encoding 1110xxxx */158 mb->state = b ^ 0b1111110011100000;159 160 } else if (_is_4_byte(b)) {161 /* 4 byte encoding 11110xxx */162 mb->state = b ^ 0b1111111100000000;163 }164 }165 166 for (; i < n; i++) {167 /* Read continuation bytes. */168 uint8_t b = s[i];169 170 if (!_is_continuation(b) || _is_non_shortest(mb->state, b)) {171 _set_ilseq();172 return UCHAR_ILSEQ;173 }174 175 /* Top bit becomes zero just before the last byte is shifted in. */176 if (!(mb->state & 0x8000)) {177 *c = ((char32_t) mb->state) << 6 | (b & 0x3f);178 mb->state = 0;179 return ++i;180 }181 182 mb->state = mb->state << 6 | (b & 0x3f);183 }184 185 return UCHAR_INCOMPLETE;186 }187 188 61 #define UTF8_CONT(c, shift) (0x80 | (((c) >> (shift)) & 0x3F)) 189 62
Note:
See TracChangeset
for help on using the changeset viewer.