Add Unicode tests, fix whitespace, fix docs

Added Unicode tests (and fixed a few bugs), changed the whitespace script to process Markdown files, and added a new guide showing how to work with Unicode.
2025-04-06 14:11:38 +03:00
parent 6aee5a83aa
commit dd15b42b44
11 changed files with 472 additions and 11 deletions
--- a/README.md
+++ b/README.md
@@ -9,6 +9,9 @@ Here is a short list of implemented features:
 - Abstraction over input/output
 - Basic data structures and algorithms (hashmap, queue, heaps, partitions)
 - Geometric primitives (vectors, matrices, quaternions, rays, boxes)
 - Thread support functions and structures (thread, mutex, cv, atomics, etc.)
 - Unicode support functions (BMP, UTF-8, UTF-16, UTF-32)
 - String conversion from/to numbers
 For more information about currently implemented and planned features checkout
 [this page](doc/Features.md)
--- a/doc/Examples/CMakeLists.txt
+++ b/doc/Examples/CMakeLists.txt
@@ -1,3 +1,6 @@
 # PakReader
 add_executable(PakReader PakReader.c)
 target_link_libraries(PakReader BHLib)
 add_executable(Utf8Test Utf8Test.c)
 target_link_libraries(Utf8Test BHLib)
--- a/doc/Examples/UTF-8-test.txt
+++ b/doc/Examples/UTF-8-test.txt
--- a/doc/Examples/Utf8Test.c
+++ b/doc/Examples/Utf8Test.c
@@ -0,0 +1,66 @@
 #include <BH/IO.h>
 #include <BH/String.h>
 #include <stdlib.h>
 #include <stdio.h>
 /* Writes the expected command line to stdout and exits with status 1. */
 void printUsage(void)
 {
    fputs("Utf8Test <input> <output>\n", stdout);
    exit(1);
 }
 /*
  * Transcodes the UTF-8 file named by argv[1] into the file named by argv[2],
  * writing U+FFFD in place of every sequence the decoder reports as invalid
  * and in place of a truncated sequence left at the end of the input.
  *
  * Returns 0 on success and -1 if either file could not be created or opened.
  * Calls printUsage() (which exits) when fewer than two file names are given.
  */
 int main(int argc, char **argv)
 {
    BH_IO *inFile, *outFile;
    char inBuffer[8], outBuffer[8];
    uint32_t unit;
    size_t i, inSize, outSize;
    /* Both argv[1] and argv[2] are used below, so two arguments are required */
    if (argc < 3)
        printUsage();
    inFile = BH_FileNew(argv[1]);
    outFile = BH_FileNew(argv[2]);
    if (!inFile || BH_IOOpen(inFile, BH_IO_READ | BH_IO_EXIST) ||
        !outFile || BH_IOOpen(outFile, BH_IO_WRITE | BH_IO_TRUNCATE))
    {
        /* Release whatever was created before reporting the failure */
        if (inFile)
            BH_IOFree(inFile);
        if (outFile)
            BH_IOFree(outFile);
        return -1;
    }
    inSize = 0;
    while (!(BH_IOFlags(inFile) & BH_IO_FLAG_EOF))
    {
        /* Read one byte and try to decode; a zero result from the decoder
         * means the buffered bytes do not form a complete sequence yet */
        if (!inSize || !(outSize = BH_UnicodeDecodeUtf8(inBuffer, inSize, &unit)))
        {
            /* Start from zero so inSize is unchanged if the read stores nothing */
            outSize = 0;
            BH_IORead(inFile, inBuffer + inSize, 1, &outSize);
            inSize += outSize;
            continue;
        }
        /* Drop the consumed bytes from the front of the buffer */
        for (i = 0; i < inSize - outSize; i++)
            inBuffer[i] = inBuffer[i + outSize];
        inSize -= outSize;
        /* The decoder reports an invalid sequence as -1: substitute U+FFFD */
        if (unit == (uint32_t)-1)
            unit = 0xFFFD;
        outSize = BH_UnicodeEncodeUtf8(unit, outBuffer);
        BH_IOWrite(outFile, outBuffer, outSize, NULL);
    }
    /* Incomplete UTF-8 sequence left over at end of input */
    if (inSize)
    {
        outSize = BH_UnicodeEncodeUtf8(0xFFFD, outBuffer);
        BH_IOWrite(outFile, outBuffer, outSize, NULL);
    }
    BH_IOFree(inFile);
    BH_IOFree(outFile);
    return 0;
 }
--- a/doc/Features.md
+++ b/doc/Features.md
@@ -11,15 +11,15 @@ Currently implemented features:
 - Intersection calculation (ray, boxes, segments, lines, planes, triangles)
 - Unit testing library (for internal usage)
 - Command-line interface utilities
 - Thread support (thread, mutex, cv, atomics, etc.)
 - Unicode support functions (BMP, UTF-8, UTF-16, UTF-32)
 - String conversion functions from/to numbers
 ## Planned features
 Currently planned features:
 - Thread support (thread, mutex, cv, atomics, etc.)
 - Image loading/processing/saving support
 - Polygon rasterization (possibly canvas)
 - UTF8 string support (BMP handling)
 - Font rendering
 - GUI (Windows GUI, X11)
 - Deflate/Inflate implementation
--- a/doc/HowTo.md
+++ b/doc/HowTo.md
@@ -4,3 +4,5 @@ For the time being there is only one HowTo guide:
 - [Writing PACK reader utility](HowTo/PakReader.md) which covers the basics of
  using IO, Args and Utils modules.
 - [Basic UTF-8 to UTF-8 transcoder](HowTo/Utf8Test.md) which covers the basics
  of using IO and String modules.
--- a/doc/HowTo/Utf8Test.md
+++ b/doc/HowTo/Utf8Test.md
@@ -0,0 +1,153 @@
 # HowTo: Transcoding UTF-8 to UTF-8
 ## Prerequisites
 We want to implement a simple command-line utility that can transcode a UTF-8
 file into a UTF-8 file (or in other words replace any incorrect UTF-8 sequences).
 To do this we would run the following command:
 ```sh
 ./Utf8Test UTF-8-test.txt UTF-8-out.txt
 ```
 ## Includes
 To implement this utility, we are going to need to include the following headers:
 - `BH/IO.h` to work with files (or input/output devices)
 - `BH/String.h` to work with UTF-8 sequences
 ## Working with Files
 Working with files in BHLib is based around the IO device (called `BH_IO`).
 Firstly, you need to create an IO device with the `BH_FileNew` function.
 Secondly, you need to open the IO device with the `BH_IOOpen` function. While
 opening the IO device, you can specify in which mode it will work: reading
 (`BH_IO_READ`) or writing (`BH_IO_WRITE`). Additionally, we can specify whether
 the IO device (or in our case, the file) should exist before opening
 (`BH_IO_EXIST`), be truncated before opening (`BH_IO_TRUNCATE`), should it be
 created (`BH_IO_CREATE`), or opened in append mode (`BH_IO_APPEND`).
 Here is an example for opening an existing file in read-only mode:
 ```c
 BH_IO *io = BH_FileNew("coolfile.dat");
 if (BH_IOOpen(io, BH_IO_READ | BH_IO_EXIST))
 {
    printf("Can't open file 'coolfile.dat'\n");
    BH_IOFree(io);
    return -1;
 }
 ```
 ## Working with UTF-8
 Reading UTF-8/UTF-16/UTF-32 is based around a simple loop:
 1. Read bytes from input (IO or memory) to some buffer.
 2. Call `BH_UnicodeDecodeUtf*`. If the return value is 0, we don't have enough data, so go to step 1. Otherwise remove the consumed bytes from the front of the buffer.
 3. If the decoded codepoint equals -1, we encountered an error, so replace it with the code 0xFFFD.
 Writing UTF-8/UTF-16/UTF-32 is straightforward:
 1. Call `BH_UnicodeEncodeUtf*`. If the return value is 0, the codepoint can't be encoded (it is either a surrogate or outside the valid range).
 2. Write data (to IO or memory).
 The loop below applies these steps using `BH_UnicodeDecodeUtf8(inBuffer, inSize, &unit)`:
 ```c
 while (...)
 {
    /* Read one byte and try to decode */
    if (!inSize || !(outSize = BH_UnicodeDecodeUtf8(inBuffer, inSize, &unit)))
    {
        BH_IORead(inFile, inBuffer + inSize, 1, &outSize);
        inSize += outSize;
        continue;
    }
    /* Remove readed amount */
    for (i = 0; i < inSize - outSize; i++)
        inBuffer[i] = inBuffer[i + outSize];
    inSize -= outSize;
    /* Change unit if incorrect and write to output */
    if (unit == -1)
        unit = 0xFFFD;
    outSize = BH_UnicodeEncodeUtf8(unit, outBuffer);
    BH_IOWrite(outFile, outBuffer, outSize, NULL);
 }
 ```
 ## Putting Everything Together
 ```c
 #include <BH/IO.h>
 #include <BH/String.h>
 #include <stdlib.h>
 #include <stdio.h>
 void printUsage(void)
 {
    printf("Utf8Test <input> <output>\n");
    exit(1);
 }
 int main(int argc, char **argv)
 {
    BH_IO *inFile, *outFile;
    char inBuffer[8], outBuffer[8];
    uint32_t unit;
    size_t i, inSize, outSize;
    if (argc < 3)
        printUsage();
    inFile = BH_FileNew(argv[1]);
    outFile = BH_FileNew(argv[2]);
    if (!inFile || BH_IOOpen(inFile, BH_IO_READ | BH_IO_EXIST))
        return -1;
    if (!outFile || BH_IOOpen(outFile, BH_IO_WRITE | BH_IO_TRUNCATE))
        return -1;
    inSize = 0;
    while (!(BH_IOFlags(inFile) & BH_IO_FLAG_EOF))
    {
        /* Read one byte and try to decode */
        if (!inSize || !(outSize = BH_UnicodeDecodeUtf8(inBuffer, inSize, &unit)))
        {
            BH_IORead(inFile, inBuffer + inSize, 1, &outSize);
            inSize += outSize;
            continue;
        }
        /* Remove readed amount */
        for (i = 0; i < inSize - outSize; i++)
            inBuffer[i] = inBuffer[i + outSize];
        inSize -= outSize;
        /* Change unit if incorrect and write to output */
        if (unit == -1)
            unit = 0xFFFD;
        outSize = BH_UnicodeEncodeUtf8(unit, outBuffer);
        BH_IOWrite(outFile, outBuffer, outSize, NULL);
    }
    /* Incomplete UTF-8 sequence */
    if (inSize)
    {
        outSize = BH_UnicodeEncodeUtf8(0xFFFD, outBuffer);
        BH_IOWrite(outFile, outBuffer, outSize, NULL);
    }
    BH_IOFree(inFile);
    BH_IOFree(outFile);
    return 0;
 }
 ```
--- a/src/String/Unicode.c
+++ b/src/String/Unicode.c
@@ -92,6 +92,9 @@ size_t BH_UnicodeEncodeUtf8(uint32_t unit,
 {
    size_t result;
    if (unit > 0xD7FF && unit < 0xE000)
        return 0;
    result = 0;
    if (unit < 0x80ul)
    {
@@ -199,6 +202,9 @@ size_t BH_UnicodeDecodeUtf16BE(const char *string,
 size_t BH_UnicodeEncodeUtf16LE(uint32_t unit,
                               char *string)
 {
    if (unit > 0xD7FF && unit < 0xE000)
        return 0;
    if (unit < 0x10000)
    {
        BH_Write16LEu(string, unit);
@@ -218,6 +224,9 @@ size_t BH_UnicodeEncodeUtf16LE(uint32_t unit,
 size_t BH_UnicodeEncodeUtf16BE(uint32_t unit,
                               char *string)
 {
    if (unit > 0xD7FF && unit < 0xE000)
        return 0;
    if (unit < 0x10000)
    {
        BH_Write16BEu(string, unit);
@@ -261,7 +270,7 @@ size_t BH_UnicodeDecodeUtf32BE(const char *string,
 size_t BH_UnicodeEncodeUtf32LE(uint32_t unit,
                               char *string)
 {
-    if (unit > 0x1FFFFF)
+    if (unit > 0x1FFFFF || (unit > 0xD7FF && unit < 0xE000))
        return 0;
    BH_Write32LEu(string, unit);
@@ -272,7 +281,7 @@ size_t BH_UnicodeEncodeUtf32LE(uint32_t unit,
 size_t BH_UnicodeEncodeUtf32BE(uint32_t unit,
                               char *string)
 {
-    if (unit > 0x1FFFFF)
+    if (unit > 0x1FFFFF || (unit > 0xD7FF && unit < 0xE000))
        return 0;
    BH_Write32BEu(string, unit);
--- a/test/src/TestUnicode.c
+++ b/test/src/TestUnicode.c
@@ -0,0 +1,225 @@
 #include <BH/Unit.h>
 #include <BH/String.h>
 #include <BH/IO.h>
 #include <stdlib.h>
 /* One hand-written decoder test vector. */
 struct TestCase
 {
    const char *input;  /* Bytes handed to the decoder (string literal) */
    size_t size;        /* Number of bytes of input made available */
    size_t read;        /* Expected return value; (size_t)-1 accepts any non-zero */
    uint32_t result;    /* Expected decoded unit; (uint32_t)-1 marks an error */
 };
 /*
  * Checks that case conversion round-trips for every value below 0x110000:
  * values changed by BH_UnicodeLower must come back through BH_UnicodeUpper,
  * and unchanged values must come back through upper-then-lower.
  */
 BH_UNIT_TEST(Case)
 {
    size_t i, j;
    i = 0;
    while (i < 0x110000)
    {
        j = BH_UnicodeLower(i);
        if (j != i)
            j = BH_UnicodeUpper(j);
        else
        {
            j = BH_UnicodeUpper(i);
            j = BH_UnicodeLower(j);
        }
        /* Known code points whose mapping does not round-trip are skipped */
        if (!((i == 0x130 && j == 0x49) ||
              (i == 0x131 && j == 0x69) ||
              (i == 0x1C5 && j == 0x1C4) ||
              (i == 0x1C8 && j == 0x1C7) ||
              (i == 0x1CB && j == 0x1CA)))
        {
            BH_VERIFY(i == j);
        }
        i++;
    }
    return 0;
 }
 /*
  * Exercises the UTF-8 encoder and decoder: first a round trip over every
  * value below 0x110000 (surrogates must be rejected by the encoder), then a
  * table of hand-written byte sequences with their expected decode results.
  */
 BH_UNIT_TEST(Utf8)
 {
    const struct TestCase cases[] =
    {
        /* Normal cases */
        {"\x00", 1, 1, 0},
        {"\xC2\x80", 2, 2, 0x80},
        {"\xE0\xA0\x80", 3, 3, 0x800},
        {"\xF0\x90\x80\x80", 4, 4, 0x10000},
        {"\x7F", 1, 1, 0x7F},
        {"\xDF\xBF", 2, 2, 0x7FF},
        {"\xEF\xBF\xBF", 3, 3, 0xFFFF},
        {"\xED\x9F\xBF", 3, 3, 0xD7FF},
        {"\xEE\x80\x80", 3, 3, 0xE000},
        {"\xEF\xBF\xBD", 3, 3, 0xFFFD},
        {"H", 1, 1, 'H'},
        {"\xCE\xBA", 2, 2, 0x3BA},
        /* Lonely start characters */
        {"\xC0 ", 2, 1, -1},
        {"\xC1 ", 2, 1, -1},
        {"\xC2 ", 2, 1, -1},
        {"\xC3 ", 2, 1, -1},
        {"\xC4 ", 2, 1, -1},
        /* Malformed sequences */
        {"\x80", 1, -1, -1},
        {"\xBF", 1, -1, -1},
        {"\xFE", 1, -1, -1},
        {"\xFF", 1, -1, -1},
        /* Overlong sequences */
        {"\xC0\xAF", 2, 1, -1},
        /* UTF-16 surrogate pairs */
        {"\xED\xA0\x80", 3, -1, -1},
        {"\xED\xAE\x80", 3, -1, -1},
        {"\xED\xB0\x80", 3, -1, -1},
        {"\xED\xBF\xBF", 3, -1, -1},
        {NULL, 0, 0, 0}
    };
    const struct TestCase *current;
    char buffer[8];
    uint32_t unit;
    size_t i, outSize, inSize, index;
    /* Round trip: encode each value, then decode the produced bytes */
    i = 0;
    while (i < 0x110000)
    {
        inSize = BH_UnicodeEncodeUtf8(i, buffer);
        if (i >= 0xD800 && i <= 0xDFFF)
        {
            /* Surrogates must not be encodable */
            BH_VERIFY(inSize == 0);
        }
        else
        {
            BH_VERIFY(inSize > 0);
            outSize = BH_UnicodeDecodeUtf8(buffer, inSize, &unit);
            BH_VERIFY(inSize == outSize);
            BH_VERIFY(unit == i);
        }
        i++;
    }
    /* Table-driven special cases; the table ends at the NULL input entry */
    for (index = 0; cases[index].input; index++)
    {
        current = &cases[index];
        i = BH_UnicodeDecodeUtf8(current->input, current->size, &unit);
        /* An expected read of -1 accepts any non-zero consumed length */
        if (i && current->read == (size_t)-1)
            i = -1;
        if (i == current->read && unit == current->result)
            continue;
        /* Report the failing table index before the checks fire */
        printf("\tcase %d\n", (int)(current - cases));
        BH_VERIFY(i == current->read);
        BH_VERIFY(unit == current->result);
    }
    return 0;
 }
 /*
  * Round-trips every value below 0x110000 through the UTF-16 encoders and
  * decoders, in both little-endian and big-endian byte order. Surrogate
  * values (0xD800..0xDFFF) must be rejected by BOTH encoders; each byte
  * order is checked independently so neither check can mask the other.
  */
 BH_UNIT_TEST(Utf16)
 {
    char buffer[8];
    uint32_t unit;
    size_t i, outSize, inSize;
    int surrogate;
    /* Encode and decode all characters in the valid Unicode range */
    for (i = 0; i < 0x110000; i++)
    {
        surrogate = (i > 0xD7FF && i < 0xE000);
        /* Check for little endian */
        inSize = BH_UnicodeEncodeUtf16LE(i, buffer);
        if (surrogate)
        {
            BH_VERIFY(inSize == 0);
        }
        else
        {
            BH_VERIFY(inSize > 0);
            outSize = BH_UnicodeDecodeUtf16LE(buffer, inSize, &unit);
            BH_VERIFY(inSize == outSize);
            BH_VERIFY(unit == i);
        }
        /* Check for big endian (also reached for surrogates) */
        inSize = BH_UnicodeEncodeUtf16BE(i, buffer);
        if (surrogate)
        {
            BH_VERIFY(inSize == 0);
        }
        else
        {
            BH_VERIFY(inSize > 0);
            outSize = BH_UnicodeDecodeUtf16BE(buffer, inSize, &unit);
            BH_VERIFY(inSize == outSize);
            BH_VERIFY(unit == i);
        }
    }
    return 0;
 }
 /*
  * Round-trips every value below 0x110000 through the UTF-32 encoders and
  * decoders, in both little-endian and big-endian byte order. Surrogate
  * values (0xD800..0xDFFF) must be rejected by BOTH encoders; each byte
  * order is checked independently so neither check can mask the other.
  */
 BH_UNIT_TEST(Utf32)
 {
    char buffer[8];
    uint32_t unit;
    size_t i, outSize, inSize;
    int surrogate;
    /* Encode and decode all characters in the valid Unicode range */
    for (i = 0; i < 0x110000; i++)
    {
        surrogate = (i > 0xD7FF && i < 0xE000);
        /* Check for little endian */
        inSize = BH_UnicodeEncodeUtf32LE(i, buffer);
        if (surrogate)
        {
            BH_VERIFY(inSize == 0);
        }
        else
        {
            BH_VERIFY(inSize > 0);
            outSize = BH_UnicodeDecodeUtf32LE(buffer, inSize, &unit);
            BH_VERIFY(inSize == outSize);
            BH_VERIFY(unit == i);
        }
        /* Check for big endian (also reached for surrogates) */
        inSize = BH_UnicodeEncodeUtf32BE(i, buffer);
        if (surrogate)
        {
            BH_VERIFY(inSize == 0);
        }
        else
        {
            BH_VERIFY(inSize > 0);
            outSize = BH_UnicodeDecodeUtf32BE(buffer, inSize, &unit);
            BH_VERIFY(inSize == outSize);
            BH_VERIFY(unit == i);
        }
    }
    return 0;
 }
 /*
  * Test runner entry point: registers the Case, Utf8, Utf16 and Utf32 tests
  * and returns whatever BH_UnitRun() reports. Command-line arguments are
  * not used.
  */
 int main(int argc, char **argv)
 {
    int status;
    BH_UNUSED(argv);
    BH_UNUSED(argc);
    BH_UNIT_ADD(Case);
    BH_UNIT_ADD(Utf8);
    BH_UNIT_ADD(Utf16);
    BH_UNIT_ADD(Utf32);
    status = BH_UnitRun();
    return status;
 }
--- a/util/whitespace.sh
+++ b/util/whitespace.sh
@@ -1,3 +1,3 @@
 #!/bin/sh
-find . \( -iname "*.h" -o -iname "*.c" \) -exec sed -i .bak "s/[ ]*$//" {} \;
+find . \( -iname "*.h" -o -iname "*.c" -o -iname "*.md" \) -exec sed -i .bak "s/[ ]*$//" {} \;
 find . -iname "*.bak" -exec rm {} \;