Changeset 74c8da2c in mainline
- Timestamp:
- 2009-03-24T14:41:31Z (16 years ago)
- Branches:
- lfn, master, serial, ticket/834-toolchain-update, topic/msim-upgrade, topic/simplify-dev-export
- Children:
- eec616b
- Parents:
- 4ccdcf6
- Location:
- kernel/generic
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
kernel/generic/include/string.h
r4ccdcf6 r74c8da2c 38 38 #include <typedefs.h> 39 39 40 extern wchar_t utf8_decode(const char *str, index_t *index); 40 #define UTF8_NO_LIMIT ((index_t) -1) 41 42 extern char invalch; 43 44 extern wchar_t utf8_decode(const char *str, index_t *index, index_t limit); 45 extern bool utf8_encode(const wchar_t ch, char *str, index_t *index, index_t limit); 46 extern size_t utf8_count_bytes(const char *str, count_t count); 47 extern bool ascii_check(const wchar_t ch); 48 extern bool unicode_check(const wchar_t ch); 49 41 50 extern size_t strlen(const char *str); 51 extern size_t strlen_utf8(const char *str); 52 extern size_t strlen_utf32(const wchar_t *str); 53 42 54 extern int strcmp(const char *src, const char *dst); 43 55 extern int strncmp(const char *src, const char *dst, size_t len); -
kernel/generic/src/lib/string.c
r4ccdcf6 r74c8da2c 43 43 #include <console/kconsole.h> 44 44 45 char invalch = '?'; 46 45 47 /** Decode a single UTF-8 character from a NULL-terminated string. 46 48 * … … 52 54 * @param index Index (counted in plain characters) where to start 53 55 * the decoding. 56 * @param limit Maximal allowed value of index. 54 57 * 55 58 * @return Decoded character in UTF-32 or '?' if the encoding is wrong. 56 59 * 57 60 */ 58 wchar_t utf8_decode(const char *str, index_t *index )61 wchar_t utf8_decode(const char *str, index_t *index, index_t limit) 59 62 { 60 63 uint8_t c1; /* First plain character from str */ … … 63 66 uint8_t c4; /* Fourth plain character from str */ 64 67 68 if (*index > limit) 69 return invalch; 70 65 71 c1 = (uint8_t) str[*index]; 66 72 … … 68 74 /* Plain ASCII (code points 0 .. 127) */ 69 75 return (wchar_t) c1; 70 } else if ((c1 & 0xe0) == 0xc0) { 76 } 77 78 if ((c1 & 0xe0) == 0xc0) { 71 79 /* Code points 128 .. 2047 */ 80 if (*index + 1 > limit) 81 return invalch; 82 72 83 c2 = (uint8_t) str[*index + 1]; 73 84 if ((c2 & 0xc0) == 0x80) { … … 75 86 return ((wchar_t) ((c1 & 0x1f) << 6) | (c2 & 0x3f)); 76 87 } else 77 return ((wchar_t) '?'); 78 } else if ((c1 & 0xf0) == 0xe0) { 88 return invalch; 89 } 90 91 if ((c1 & 0xf0) == 0xe0) { 79 92 /* Code points 2048 .. 65535 */ 93 if (*index + 2 > limit) 94 return invalch; 95 80 96 c2 = (uint8_t) str[*index + 1]; 81 97 if ((c2 & 0xc0) == 0x80) { … … 86 102 return ((wchar_t) ((c1 & 0x0f) << 12) | ((c2 & 0x3f) << 6) | (c3 & 0x3f)); 87 103 } else 88 return ((wchar_t) '?');104 return invalch; 89 105 } else 90 return ((wchar_t) '?'); 91 } else if ((c1 & 0xf8) == 0xf0) { 106 return invalch; 107 } 108 109 if ((c1 & 0xf8) == 0xf0) { 92 110 /* Code points 65536 .. 1114111 */ 111 if (*index + 3 > limit) 112 return invalch; 113 93 114 c2 = (uint8_t) str[*index + 1]; 94 115 if ((c2 & 0xc0) == 0x80) { … … 102 123 return ((wchar_t) ((c1 & 0x07) << 18) | ((c2 & 0x3f) << 12) | ((c3 & 0x3f) << 6) | (c4 & 0x3f)); 103 124 } else 104 return ((wchar_t) '?');125 return invalch; 105 126 } else 106 return ((wchar_t) '?');127 return invalch; 107 128 } else 108 return ((wchar_t) '?'); 109 } 110 111 return ((wchar_t) '?'); 112 } 113 114 /** Return number of characters in a string. 115 * 116 * @param str NULL terminated string. 129 return invalch; 130 } 131 132 return invalch; 133 } 134 135 /** Encode a single UTF-32 character as UTF-8 136 * 137 * Encode a single UTF-32 character as UTF-8 and store it into 138 * the given buffer at @index. Encoding starts at @index and 139 * this index is incremented if the UTF-8 character takes 140 * more than a single byte. 141 * 142 * @param ch Input UTF-32 character. 143 * @param str Output buffer. 144 * @param index Index (counted in plain characters) where to start 145 * the encoding 146 * @param limit Maximal allowed value of index. 147 * 148 * @return True if the character was encoded or false if there is not 149 * enought space in the output buffer or the character is invalid 150 * Unicode code point. 151 * 152 */ 153 bool utf8_encode(const wchar_t ch, char *str, index_t *index, index_t limit) 154 { 155 if (*index > limit) 156 return false; 157 158 if ((ch >= 0) && (ch <= 127)) { 159 /* Plain ASCII (code points 0 .. 127) */ 160 str[*index] = ch & 0x7f; 161 return true; 162 } 163 164 if ((ch >= 128) && (ch <= 2047)) { 165 /* Code points 128 .. 2047 */ 166 if (*index + 1 > limit) 167 return false; 168 169 str[*index] = 0xc0 | ((ch >> 6) & 0x1f); 170 (*index)++; 171 str[*index] = 0x80 | (ch & 0x3f); 172 return true; 173 } 174 175 if ((ch >= 2048) && (ch <= 65535)) { 176 /* Code points 2048 .. 65535 */ 177 if (*index + 2 > limit) 178 return false; 179 180 str[*index] = 0xe0 | ((ch >> 12) & 0x0f); 181 (*index)++; 182 str[*index] = 0x80 | ((ch >> 6) & 0x3f); 183 (*index)++; 184 str[*index] = 0x80 | (ch & 0x3f); 185 return true; 186 } 187 188 if ((ch >= 65536) && (ch <= 1114111)) { 189 /* Code points 65536 .. 1114111 */ 190 if (*index + 3 > limit) 191 return false; 192 193 str[*index] = 0xf0 | ((ch >> 18) & 0x07); 194 (*index)++; 195 str[*index] = 0x80 | ((ch >> 12) & 0x3f); 196 (*index)++; 197 str[*index] = 0x80 | ((ch >> 6) & 0x3f); 198 (*index)++; 199 str[*index] = 0x80 | (ch & 0x3f); 200 return true; 201 } 202 203 return false; 204 } 205 206 /** Get bytes used by UTF-8 characters. 207 * 208 * Get the number of bytes (count of plain characters) which 209 * are used by a given count of UTF-8 characters in a string. 210 * As UTF-8 encoding is multibyte, there is no constant 211 * correspondence between number of characters and used bytes. 212 * 213 * @param str UTF-8 string to consider. 214 * @param count Number of UTF-8 characters to count. 215 * 216 * @return Number of bytes used by the characters. 217 * 218 */ 219 size_t utf8_count_bytes(const char *str, count_t count) 220 { 221 size_t size = 0; 222 index_t index = 0; 223 224 while ((utf8_decode(str, &index, UTF8_NO_LIMIT) != 0) && (size < count)) { 225 size++; 226 index++; 227 } 228 229 return index; 230 } 231 232 /** Check whether character is plain ASCII. 233 * 234 * @return True if character is plain ASCII. 235 * 236 */ 237 bool ascii_check(const wchar_t ch) 238 { 239 if ((ch >= 0) && (ch <= 127)) 240 return true; 241 242 return false; 243 } 244 245 /** Check whether character is Unicode. 246 * 247 * @return True if character is valid Unicode code point. 248 * 249 */ 250 bool unicode_check(const wchar_t ch) 251 { 252 if ((ch >= 0) && (ch <= 1114111)) 253 return true; 254 255 return false; 256 } 257 258 /** Return number of plain characters in a string. 259 * 260 * @param str NULL-terminated string. 117 261 * 118 262 * @return Number of characters in str. … … 121 265 size_t strlen(const char *str) 122 266 { 123 int i; 124 125 for (i = 0; str[i]; i++); 126 127 return i; 267 size_t size; 268 for (size = 0; str[size]; size++); 269 270 return size; 271 } 272 273 /** Return number of UTF-8 characters in a string. 274 * 275 * @param str NULL-terminated UTF-8 string. 276 * 277 * @return Number of UTF-8 characters in str. 278 * 279 */ 280 size_t strlen_utf8(const char *str) 281 { 282 size_t size = 0; 283 index_t index = 0; 284 285 while (utf8_decode(str, &index, UTF8_NO_LIMIT) != 0) { 286 size++; 287 index++; 288 } 289 290 return size; 291 } 292 293 /** Return number of UTF-32 characters in a string. 294 * 295 * @param str NULL-terminated UTF-32 string. 296 * 297 * @return Number of UTF-32 characters in str. 298 * 299 */ 300 size_t strlen_utf32(const wchar_t *str) 301 { 302 size_t size; 303 for (size = 0; str[size]; size++); 304 305 return size; 128 306 } 129 307
Note:
See TracChangeset
for help on using the changeset viewer.