Diff [1d3ae6643d47a622f6361b85cfb0ca0797779519:f444633cb532b1998590464404848c30ac6c6c32] for / – HelenOS

common/include/str.h

r1d3ae66	rf444633
162	162	extern void bin_order_suffix(const uint64_t, uint64_t , const char *, bool);
163	163
164		~~extern size_t str_sanitize(char *str, size_t n, uint8_t replacement);~~
165
166	164	/*
167	165	* TODO: Get rid of this.

common/str.c

-              r1d3ae66
+              rf444633
+}
-static bool _is_surrogate(const mbstate_t *mb, uint8_t b)
+{
-        return (mb->state == 0b1111110000001101 && b >= 0xa0);
+}
 #define _likely(expr) __builtin_expect((expr), true)
 #define _unlikely(expr) __builtin_expect((expr), false)
 …
                                         return CHAR_INVALID;
-                                /* Reject surrogates */
-                                if (_unlikely(ch >= 0xD800 && ch < 0xE000))
-                                        return CHAR_INVALID;
                                 return ch;
+                        }
 …
                                         return CHAR_INVALID;
-                                /* Reject out-of-range characters. */
-                                if (_unlikely(ch >= 0x110000))
-                                        return CHAR_INVALID;
                                 return ch;
+                        }
 …
                 uint8_t b = s[*offset];
                 if (!_is_continuation(b) || _is_non_shortest(mb, b) || _is_surrogate(mb, b)) {
+                if (!_is_continuation(b) || _is_non_shortest(mb, b)) {
                         mb->state = 0;
                         return CHAR_INVALID;
 …
+}
 /* Convert in place any bytes that don't form a valid character into replacement. */
 static size_t _str_sanitize(char *str, size_t n, uint8_t replacement)
+/* Convert in place any bytes that don't form a valid character into U_SPECIAL. */
+static void _sanitize_string(char *str, size_t n)
+{
         uint8_t *b = (uint8_t *) str;
+        size_t count = 0;
+        for (; n > 0 && b[0]; b++, n--) {
+        for (; *b && n > 0; b++, n--) {
                 int cont = _continuation_bytes(b[0]);
                 if (__builtin_expect(cont, 0) == 0)
 …
                 if (cont < 0 || n <= (size_t) cont) {
+                        b[0] = replacement;
+                        count++;
+                        b[0] = U_SPECIAL;
                         continue;
+                }
                 /* Check continuation bytes. */
-                bool valid = true;
                 for (int i = 1; i <= cont; i++) {
                         if (!_is_continuation(b[i])) {
                                 valid = false;
                                 break;
+                                b[0] = U_SPECIAL;
+                                continue;
+                        }
+                }
-                if (!valid) {
-                        b[0] = replacement;
-                        count++;
-                        continue;
+                }
 …
                  */
+                /* 0b110!!!!x 0b10xxxxxx */
+                if (cont == 1 && !(b[0] & 0b00011110)) {
+                        b[0] = replacement;
+                        count++;
+                switch (cont) {
+                case 1:
+                        /* 0b110!!!!x 0b10xxxxxx */
+                        if (!(b[0] & 0b00011110))
+                                b[0] = U_SPECIAL;
+                        continue;
+                case 2:
+                        /* 0b1110!!!! 0b10!xxxxx 0b10xxxxxx */
+                        if (!(b[0] & 0b00001111) && !(b[1] & 0b00100000))
+                                b[0] = U_SPECIAL;
+                        continue;
+                case 3:
+                        /* 0b11110!!! 0b10!!xxxx 0b10xxxxxx 0b10xxxxxx */
+                        if (!(b[0] & 0b00000111) && !(b[1] & 0b00110000))
+                                b[0] = U_SPECIAL;
                         continue;
+                }
+                /* 0b1110!!!! 0b10!xxxxx 0b10xxxxxx */
+                if (cont == 2 && !(b[0] & 0b00001111) && !(b[1] & 0b00100000)) {
+                        b[0] = replacement;
+                        count++;
+                        continue;
+                }
+                /* 0b11110!!! 0b10!!xxxx 0b10xxxxxx 0b10xxxxxx */
+                if (cont == 3 && !(b[0] & 0b00000111) && !(b[1] & 0b00110000)) {
+                        b[0] = replacement;
+                        count++;
+                        continue;
+                }
+                /* Check for surrogate character encoding. */
+                if (cont == 2 && b[0] == 0xED && b[1] >= 0xA0) {
+                        b[0] = replacement;
+                        count++;
+                        continue;
+                }
+                /* Check for out-of-range code points. */
+                if (cont == 3 && (b[0] > 0xF4 || (b[0] == 0xF4 && b[1] >= 0x90))) {
+                        b[0] = replacement;
+                        count++;
+                        continue;
+                }
+                b += cont;
+                n -= cont;
+        }
+        return count;
+}
+size_t str_sanitize(char *str, size_t n, uint8_t replacement)
+{
+        return _str_sanitize(str, n, replacement);
+        }
+}
 …
         /* In-place translate invalid bytes to U_SPECIAL. */
         _str_sanitize(dest, size, U_SPECIAL);
+        _sanitize_string(dest, size);
+}
 …
         /* In-place translate invalid bytes to U_SPECIAL. */
         _str_sanitize(dest, size, U_SPECIAL);
+        _sanitize_string(dest, size);
+}
 …
         if (dstr_size < size) {
                 _str_cpyn(dest + dstr_size, size - dstr_size, src);
                 _str_sanitize(dest + dstr_size, size - dstr_size, U_SPECIAL);
+                _sanitize_string(dest + dstr_size, size - dstr_size);
+        }
+}
 …
         memcpy(dest, src, size);
         _str_sanitize(dest, size, U_SPECIAL);
+        _sanitize_string(dest, size);
         return dest;
+}
 …
         memcpy(dest, src, size);
         _str_sanitize(dest, size, U_SPECIAL);
+        _sanitize_string(dest, size);
         dest[size] = 0;
         return dest;

uspace/lib/c/arch/arm32/src/atomic.c

-              r1d3ae66
+              rf444633
 volatile unsigned *ras_page;
-unsigned long long __atomic_load_8(const volatile void *mem0, int model)
+{
-        const volatile unsigned long long *mem = mem0;
-        (void) model;
-        unsigned long long ret;
-        /*
-         * The following instructions between labels 1 and 2 constitute a
-         * Restartable Atomic Sequence. Should the sequence be non-atomic,
-         * the kernel will restart it.
-         */
-        asm volatile (
-            "1:\n"
-            "   adr %[ret], 1b\n"
-            "   str %[ret], %[rp0]\n"
-            "   adr %[ret], 2f\n"
-            "   str %[ret], %[rp1]\n"
-            "   ldrd %[ret], %[addr]\n"
-            "2:\n"
-            : [ret] "=&r" (ret),
-              [rp0] "=m" (ras_page[0]),
-              [rp1] "=m" (ras_page[1])
-            : [addr] "m" (*mem)
-        );
-        ras_page[0] = 0;
-        ras_page[1] = 0xffffffff;
-        return ret;
+}
-void __atomic_store_8(volatile void *mem0, unsigned long long val, int model)
+{
-        volatile unsigned long long *mem = mem0;
-        (void) model;
-        /* scratch register */
-        unsigned tmp;
-        /*
-         * The following instructions between labels 1 and 2 constitute a
-         * Restartable Atomic Sequence. Should the sequence be non-atomic,
-         * the kernel will restart it.
-         */
-        asm volatile (
-            "1:\n"
-            "   adr %[tmp], 1b\n"
-            "   str %[tmp], %[rp0]\n"
-            "   adr %[tmp], 2f\n"
-            "   str %[tmp], %[rp1]\n"
-            "   strd %[imm], %[addr]\n"
-            "2:\n"
-            : [tmp] "=&r" (tmp),
-              [rp0] "=m" (ras_page[0]),
-              [rp1] "=m" (ras_page[1]),
-              [addr] "=m" (*mem)
-            : [imm] "r" (val)
-        );
-        ras_page[0] = 0;
-        ras_page[1] = 0xffffffff;
+}
 bool __atomic_compare_exchange_4(volatile void *mem0, void *expected0,
     unsigned desired, bool weak, int success, int failure)

uspace/lib/c/test/str.c

-              r1d3ae66
+              rf444633
 #include "pcut/asserts.h"
-#include <assert.h>
-#include <stdint.h>
 #include <stdio.h>
 #include <str.h>
 …
+{
         memset(buffer, 0, BUFFER_SIZE);
+}
-/* Helper to display string contents for debugging */
-static void print_string_hex(char *out, const char *s, size_t len)
+{
-        *out++ = '"';
-        for (size_t i = 0; i < len && s[i]; i++) {
-                if (s[i] >= 32 && s[i] <= 126)
-                        *out++ = s[i];
-                else
-                        out += snprintf(out, 5, "\\x%02x", (uint8_t) s[i]);
+        }
-        *out++ = '"';
-        *out++ = 0;
+}
 …
+{
         /* Overlong zero. */
         const char overlong1[] = "\xC0\x80";
         const char overlong2[] = "\xE0\x80\x80";
         const char overlong3[] = "\xF0\x80\x80\x80";
+        const char overlong1[] = { 0b11000000, 0b10000000, 0 };
+        const char overlong2[] = { 0b11100000, 0b10000000, 0 };
+        const char overlong3[] = { 0b11110000, 0b10000000, 0 };
         const char overlong4[] = "\xC1\xBF";
         const char overlong5[] = "\xE0\x9F\xBF";
         const char overlong6[] = "\xF0\x8F\xBF\xBF";
+        const char overlong4[] = { 0b11000001, 0b10111111, 0 };
+        const char overlong5[] = { 0b11100000, 0b10011111, 0b10111111, 0 };
+        const char overlong6[] = { 0b11110000, 0b10001111, 0b10111111, 0b10111111, 0 };
         size_t offset = 0;
 …
         offset = 0;
         PCUT_ASSERT_INT_EQUALS(U_SPECIAL, str_decode(overlong6, &offset, sizeof(overlong6)));
+}
+struct sanitize_test {
+        const char *input;
+        const char *output;
+};
+static const struct sanitize_test sanitize_tests[] = {
+        // Empty string
+        { "", "" },
+        // ASCII only
+        { "Hello, world!", "Hello, world!" },
+        // Valid multi-byte sequences
+        { "Aπ你🐱", "Aπ你🐱" },
+        // U+D7FF is last valid before surrogates
+        { "A\xED\x9F\xBFZ", "A\xED\x9F\xBFZ" },
+        // 0x10FFFF is the highest legal code point
+        { "A\xF4\x8F\xBF\xBFZ", "A\xF4\x8F\xBF\xBFZ" },
+        // Missing continuation byte
+        { "A\xC2Z", "A?Z" },
+        // Truncated multi-byte at buffer end
+        { "A\xE2\x82", "A??" },
+        // Continuation byte without leading byte (0x80-0xBF are never valid first bytes)
+        { "A\x80Y\xBFZ", "A?Y?Z" },
+        // 'A' (U+0041) normally encoded as 0x41
+        // Overlong 2-byte encoding: 0xC1 0x81
+        { "\xC1\x81X", "??X" },
+        // ¢ (U+00A2) normally encoded as 0xC2 0xA2
+        // Overlong 3-byte encoding: 0xE0 0x82 0xA2
+        { "\xE0\x82\xA2X", "???X" },
+        // ¢ (U+00A2) normally encoded as 0xC2 0xA2
+        // Overlong 4-byte encoding: 0xF0 0x80 0x82 0xA2
+        { "\xF0\x80\x82\xA2X", "????X" },
+        // € (U+20AC) normally encoded as 0xE2 0x82 0xAC
+        // Overlong 4-byte encoding: 0xF0 0x82 0x82 0xAC
+        { "\xF0\x82\x82\xACX", "????X" },
+        // Using 0xC0 0x80 as overlong encoding for NUL (which should be just 0x00)
+        { "\xC0\x80X", "??X" },
+        // 0xED 0xA0 0x80 encodes a surrogate half (U+D800), not allowed in UTF-8
+        { "A\xED\xA0\x80Z", "A???Z" },
+        // 0x110000 is not a legal code point
+        { "A\xF4\x90\x80\x80Z", "A????Z" },
+        // Mix of valid and invalid sequences
+        { "A\xC2\xA9\xE2\x28\xA1\xF0\x9F\x98\x81\x80Z", "A©?(?😁?Z" },
+};
+static size_t count_diff(const char *a, const char *b, size_t n)
+{
+        size_t count = 0;
+        for (size_t i = 0; i < n; i++) {
+                if (a[i] != b[i])
+                        count++;
+        }
+        return count;
+}
+PCUT_TEST(str_sanitize)
+{
+        char replacement = '?';
+        char buffer2[255];
+        for (size_t i = 0; i < sizeof(sanitize_tests) / sizeof(sanitize_tests[0]); i++) {
+                const char *in = sanitize_tests[i].input;
+                const char *out = sanitize_tests[i].output;
+                size_t n = str_size(in) + 1;
+                assert(str_size(out) + 1 == n);
+                memcpy(buffer, in, n);
+                size_t replaced = str_sanitize(buffer, n, replacement);
+                if (memcmp(buffer, out, n) != 0) {
+                        print_string_hex(buffer2, buffer, n);
+                        print_string_hex(buffer, out, n);
+                        PCUT_ASSERTION_FAILED("Expected %s, got %s", buffer, buffer2);
+                }
+                size_t expect_replaced = count_diff(buffer, in, n);
+                PCUT_ASSERT_INT_EQUALS(expect_replaced, replaced);
+        }
+        // Test with n smaller than string length - truncated valid encoding for €
+        const char *in = "ABC€";
+        const char *out = "ABC??\xAC";
+        size_t n = str_size(in) + 1;
+        memcpy(buffer, in, n);
+        size_t replaced = str_sanitize(buffer, 5, replacement);
+        if (memcmp(buffer, out, n) != 0) {
+                print_string_hex(buffer2, buffer, n);
+                print_string_hex(buffer, out, n);
+                PCUT_ASSERTION_FAILED("Expected %s, got %s", buffer, buffer2);
+        }
+        PCUT_ASSERT_INT_EQUALS(2, replaced);
+        char sanitized[sizeof(overlong6)];
+        str_cpy(sanitized, STR_NO_LIMIT, overlong1);
+        PCUT_ASSERT_INT_EQUALS(U_SPECIAL, sanitized[0]);
+        str_cpy(sanitized, STR_NO_LIMIT, overlong2);
+        PCUT_ASSERT_INT_EQUALS(U_SPECIAL, sanitized[0]);
+        str_cpy(sanitized, STR_NO_LIMIT, overlong3);
+        PCUT_ASSERT_INT_EQUALS(U_SPECIAL, sanitized[0]);
+        str_cpy(sanitized, STR_NO_LIMIT, overlong4);
+        PCUT_ASSERT_INT_EQUALS(U_SPECIAL, sanitized[0]);
+        str_cpy(sanitized, STR_NO_LIMIT, overlong5);
+        PCUT_ASSERT_INT_EQUALS(U_SPECIAL, sanitized[0]);
+        str_cpy(sanitized, STR_NO_LIMIT, overlong6);
+        PCUT_ASSERT_INT_EQUALS(U_SPECIAL, sanitized[0]);
+}

Context Navigation

Changes in / [1d3ae66:f444633] in mainline

Legend:

common/include/str.h

common/str.c

uspace/lib/c/arch/arm32/src/atomic.c

uspace/lib/c/test/str.c

Download in other formats: