Add Unicode tests, fix whitespace, fix docs

Added Unicode tests (and fixed a few bugs), changed the whitespace script to process markdown files, and added a new guide showing how to work with Unicode.
author: Mikhail Romanko <me@blankhex.com> 2025-04-06 14:11:38 +0300
committer: Mikhail Romanko <me@blankhex.com> 2025-04-06 14:11:38 +0300
commit: dd15b42b447a2f668849f38991eeed71594bb395 (patch)
tree: 4023bdf7e1c1275c6574071d3b536ecf3cf6d1f2
parent: 6aee5a83aa009c5e2cd5be5278c0b3b1fdb1325d (diff)
download: bhlib-dd15b42b447a2f668849f38991eeed71594bb395.tar.gz
11 files changed, 472 insertions, 11 deletions
diff --git a/README.md b/README.md
index f283547..7560ae6 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,9 @@ Here is a short list of implemented features:
 - Abstraction over input/output
 - Basic data structures and algorithms (hashmap, queue, heaps, partitions)
 - Geomtric primitives (vectors, matrices, quaternions, rays, boxes)
+- Thread support functions and structures (thread, mutex, cv, atomics, etc.)
+- Unicode support functions (BMP, UTF-8, UTF-16, UTF-32)
+- String conversion from/to numbers
 
 For more information about currently implemented and planned features checkout
 [this page](doc/Features.md)
diff --git a/doc/Examples/CMakeLists.txt b/doc/Examples/CMakeLists.txt
index 7adcf05..5a31dc3 100644
--- a/doc/Examples/CMakeLists.txt
+++ b/doc/Examples/CMakeLists.txt
@@ -1,3 +1,6 @@
 # PakReader
 add_executable(PakReader PakReader.c)
-target_link_libraries(PakReader BHLib)
-\ No newline at end of file
+target_link_libraries(PakReader BHLib)
+
+add_executable(Utf8Test Utf8Test.c)
+target_link_libraries(Utf8Test BHLib)
+\ No newline at end of file
diff --git a/doc/Examples/UTF-8-test.txt b/doc/Examples/UTF-8-test.txt
new file mode 100644
index 0000000..a5b5d50
--- /dev/null
+++ b/doc/Examples/UTF-8-test.txt
diff --git a/doc/Examples/Utf8Test.c b/doc/Examples/Utf8Test.c
new file mode 100644
index 0000000..e6d9e56
--- /dev/null
+++ b/doc/Examples/Utf8Test.c
@@ -0,0 +1,66 @@
+#include <BH/IO.h>
+#include <BH/String.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+/* Write the command-line synopsis to stdout and terminate with status 1. */
+void printUsage(void)
+{
+    fputs("Utf8Test <input> <output>\n", stdout);
+    exit(1);
+}
+
+/* Transcode argv[1] into argv[2], replacing malformed UTF-8 with U+FFFD. */
+int main(int argc, char **argv)
+{
+    BH_IO *inFile, *outFile;
+    char inBuffer[8], outBuffer[8];
+    uint32_t unit;
+    size_t i, inSize, outSize;
+
+    /* Both the input (argv[1]) and the output (argv[2]) path are required */
+    if (argc < 3)
+        printUsage();
+
+    inFile = BH_FileNew(argv[1]);
+    outFile = BH_FileNew(argv[2]);
+    if (!inFile || BH_IOOpen(inFile, BH_IO_READ | BH_IO_EXIST))
+        return -1;
+    if (!outFile || BH_IOOpen(outFile, BH_IO_WRITE | BH_IO_TRUNCATE))
+        return -1;
+
+    inSize = 0;
+    while (!(BH_IOFlags(inFile) & BH_IO_FLAG_EOF))
+    {
+        /* Buffer is empty or the decoder needs more data: read one byte */
+        if (!inSize || !(outSize = BH_UnicodeDecodeUtf8(inBuffer, inSize, &unit)))
+        {
+            if (BH_IORead(inFile, inBuffer + inSize, 1, &outSize))
+                break; /* Read failed: do not consume outSize */
+            inSize += outSize;
+            continue;
+        }
+
+        /* Remove the consumed bytes from the front of the buffer */
+        for (i = 0; i < inSize - outSize; i++)
+            inBuffer[i] = inBuffer[i + outSize];
+        inSize -= outSize;
+
+        /* Replace a decoding error (-1) with U+FFFD and write to output */
+        if (unit == (uint32_t)-1)
+            unit = 0xFFFD;
+        outSize = BH_UnicodeEncodeUtf8(unit, outBuffer);
+        BH_IOWrite(outFile, outBuffer, outSize, NULL);
+    }
+
+    /* Bytes left in the buffer form an incomplete UTF-8 sequence */
+    if (inSize)
+    {
+        outSize = BH_UnicodeEncodeUtf8(0xFFFD, outBuffer);
+        BH_IOWrite(outFile, outBuffer, outSize, NULL);
+    }
+
+    BH_IOFree(inFile);
+    BH_IOFree(outFile);
+    return 0;
+}
diff --git a/doc/Features.md b/doc/Features.md
index 8dd7926..31da745 100644
--- a/doc/Features.md
+++ b/doc/Features.md
@@ -11,15 +11,15 @@ Currently implemented features:
 - Intersection calculation (ray, boxes, segments, lines, planes, triangles)
 - Unit testing library (for internal usage)
 - Command-line interface utilities
+- Thread support (thread, mutex, cv, atomics, etc.)
+- Unicode support functions (BMP, UTF-8, UTF-16, UTF-32)
+- String conversion functions from/to numbers
 
 ## Planned features
 
 Currently planned features:
-
-- Thread support (thread, mutex, cv, atomics, etc.)
 - Image loading/processing/saving support
 - Polygon rasterization (possibly canvas)
-- UTF8 string support (BMP handling)
 - Font rendering
 - GUI (Windows GUI, X11)
 - Deflate/Inflate implementation
diff --git a/doc/HowTo.md b/doc/HowTo.md
index 37dd9a6..8fcafd5 100644
--- a/doc/HowTo.md
+++ b/doc/HowTo.md
@@ -4,3 +4,5 @@ For the time being there is only one HowTo guide:
 
 - [Writing PACK reader utility](HowTo/PakReader.md) which covers the basics of
   using IO, Args and Utils modules.
+- [Basic UTF-8 to UTF-8 transcoder](HowTo/Utf8Test.md) which covers the basics
+  of using IO and String modules.
diff --git a/doc/HowTo/PakReader.md b/doc/HowTo/PakReader.md
index e7071c3..4f75399 100644
--- a/doc/HowTo/PakReader.md
+++ b/doc/HowTo/PakReader.md
@@ -323,7 +323,7 @@ static int CopyData(BH_IO *from,
 
         if (BH_IORead(from, tmp, length, &actual) || length != actual)
             return BH_ERROR;
-        
+
         if (BH_IOWrite(to, tmp, length, &actual) || length != actual)
             return BH_ERROR;
     }
@@ -344,7 +344,7 @@ static int ProcessPack(Config *config,
     /* Read header and seek to begging of the file table */
     if (ParseHeader(io, &header))
         return BH_ERROR;
-    
+
     if (BH_IOSeek(io, header.offset, BH_IO_SEEK_SET))
         return BH_ERROR;
 
@@ -362,7 +362,7 @@ static int ProcessPack(Config *config,
                 continue;
 
             output = BH_FileNew(config->output);
-            if (BH_IOOpen(output, BH_IO_WRITE) || 
+            if (BH_IOOpen(output, BH_IO_WRITE) ||
                 BH_IOSeek(io, entry.offset, BH_IO_SEEK_SET) ||
                 CopyData(io, output, entry.size))
             {
@@ -374,7 +374,7 @@ static int ProcessPack(Config *config,
             return BH_OK;
         }
     }
-    
+
     if (config->list)
         return BH_OK;
     return BH_ERROR;
diff --git a/doc/HowTo/Utf8Test.md b/doc/HowTo/Utf8Test.md
new file mode 100644
index 0000000..7e82ef0
--- /dev/null
+++ b/doc/HowTo/Utf8Test.md
@@ -0,0 +1,153 @@
+# HowTo: Transcoding UTF-8 to UTF-8
+
+## Prerequisites
+
+We want to implement a simple command-line utility that can transcode a UTF-8
+file into a UTF-8 file (in other words, replace any incorrect UTF-8 sequences).
+
+To do this we would run the following command:
+
+```sh
+./Utf8Test UTF-8-test.txt UTF-8-out.txt
+```
+
+## Includes
+
+To implement this utility, we are going to need to include the following headers:
+
+- `BH/IO.h` to work with files (or input/output devices)
+- `BH/String.h` to work with UTF-8 sequences
+
+## Working with Files
+
+Working with files in BHLib is based around the IO device (called `BH_IO`).
+Firstly, you need to create an IO device with the `BH_FileNew` function.
+Secondly, you need to open the IO device with the `BH_IOOpen` function. While
+opening the IO device, you can specify in which mode it will work: reading
+(`BH_IO_READ`) or writing (`BH_IO_WRITE`). Additionally, we can specify whether
+the IO device (or in our case, the file) should exist before opening
+(`BH_IO_EXIST`), be truncated before opening (`BH_IO_TRUNCATE`), be
+created (`BH_IO_CREATE`), or be opened in append mode (`BH_IO_APPEND`).
+
+Here is an example for opening an existing file in read-only mode:
+
+```c
+BH_IO *io = BH_FileNew("coolfile.dat");
+if (BH_IOOpen(io, BH_IO_READ | BH_IO_EXIST)) /* non-zero: open failed */
+{
+    printf("Can't open file 'coolfile.dat'\n");
+    BH_IOFree(io);
+    return -1;
+}
+```
+
+## Working with UTF-8
+
+Reading UTF-8/UTF-16/UTF-32 is based around a simple loop:
+
+1. Read bytes from the input (IO or memory) into a buffer.
+2. Call `BH_UnicodeDecodeUtf*`. If the return value is 0, we don't have enough data yet, so go back to step 1. Otherwise, remove the returned number of bytes from the front of the buffer.
+3. If the decoded codepoint equals -1, we encountered an error, so replace it with the code 0xFFFD.
+
+Writing UTF-8/UTF-16/UTF-32 is straightforward:
+
+1. Call `BH_UnicodeEncodeUtf*`. If the return value is 0, the codepoint can't be encoded (it is either a surrogate or outside the valid range).
+2. Write the encoded data (to IO or memory).
+
+Here is the decoding loop built around `BH_UnicodeDecodeUtf8(inBuffer, inSize, &unit)`:
+
+```c
+
+while (...)
+{
+    /* Buffer is empty or the decoder needs more data: read one byte */
+    if (!inSize || !(outSize = BH_UnicodeDecodeUtf8(inBuffer, inSize, &unit)))
+    {
+        BH_IORead(inFile, inBuffer + inSize, 1, &outSize);
+        inSize += outSize;
+        continue;
+    }
+
+    /* Remove the consumed bytes from the front of the buffer */
+    for (i = 0; i < inSize - outSize; i++)
+        inBuffer[i] = inBuffer[i + outSize];
+    inSize -= outSize;
+
+    /* Replace a decoding error (-1) with U+FFFD and write to output */
+    if (unit == -1)
+        unit = 0xFFFD;
+    outSize = BH_UnicodeEncodeUtf8(unit, outBuffer);
+    BH_IOWrite(outFile, outBuffer, outSize, NULL);
+}
+```
+
+## Putting Everything Together
+
+```c
+#include <BH/IO.h>
+#include <BH/String.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+/* Write the command-line synopsis to stdout and terminate with status 1. */
+void printUsage(void)
+{
+    fputs("Utf8Test <input> <output>\n", stdout);
+    exit(1);
+}
+
+/* Transcode argv[1] into argv[2], replacing malformed UTF-8 with U+FFFD. */
+int main(int argc, char **argv)
+{
+    BH_IO *inFile, *outFile;
+    char inBuffer[8], outBuffer[8];
+    uint32_t unit;
+    size_t i, inSize, outSize;
+
+    /* Both the input (argv[1]) and the output (argv[2]) path are required */
+    if (argc < 3)
+        printUsage();
+
+    inFile = BH_FileNew(argv[1]);
+    outFile = BH_FileNew(argv[2]);
+    if (!inFile || BH_IOOpen(inFile, BH_IO_READ | BH_IO_EXIST))
+        return -1;
+    if (!outFile || BH_IOOpen(outFile, BH_IO_WRITE | BH_IO_TRUNCATE))
+        return -1;
+
+    inSize = 0;
+    while (!(BH_IOFlags(inFile) & BH_IO_FLAG_EOF))
+    {
+        /* Buffer is empty or the decoder needs more data: read one byte */
+        if (!inSize || !(outSize = BH_UnicodeDecodeUtf8(inBuffer, inSize, &unit)))
+        {
+            if (BH_IORead(inFile, inBuffer + inSize, 1, &outSize))
+                break; /* Read failed: do not consume outSize */
+            inSize += outSize;
+            continue;
+        }
+
+        /* Remove the consumed bytes from the front of the buffer */
+        for (i = 0; i < inSize - outSize; i++)
+            inBuffer[i] = inBuffer[i + outSize];
+        inSize -= outSize;
+
+        /* Replace a decoding error (-1) with U+FFFD and write to output */
+        if (unit == (uint32_t)-1)
+            unit = 0xFFFD;
+        outSize = BH_UnicodeEncodeUtf8(unit, outBuffer);
+        BH_IOWrite(outFile, outBuffer, outSize, NULL);
+    }
+
+    /* Bytes left in the buffer form an incomplete UTF-8 sequence */
+    if (inSize)
+    {
+        outSize = BH_UnicodeEncodeUtf8(0xFFFD, outBuffer);
+        BH_IOWrite(outFile, outBuffer, outSize, NULL);
+    }
+
+    BH_IOFree(inFile);
+    BH_IOFree(outFile);
+    return 0;
+}
+```
diff --git a/src/String/Unicode.c b/src/String/Unicode.c
index 1f0eaf0..26b9670 100644
--- a/src/String/Unicode.c
+++ b/src/String/Unicode.c
@@ -92,6 +92,9 @@ size_t BH_UnicodeEncodeUtf8(uint32_t unit,
 {
     size_t result;
 
+    if (unit > 0xD7FF && unit < 0xE000)
+        return 0;
+
     result = 0;
     if (unit < 0x80ul)
     {
@@ -199,6 +202,9 @@ size_t BH_UnicodeDecodeUtf16BE(const char *string,
 size_t BH_UnicodeEncodeUtf16LE(uint32_t unit,
                                char *string)
 {
+    if (unit > 0xD7FF && unit < 0xE000)
+        return 0;
+
     if (unit < 0x10000)
     {
         BH_Write16LEu(string, unit);
@@ -218,6 +224,9 @@ size_t BH_UnicodeEncodeUtf16LE(uint32_t unit,
 size_t BH_UnicodeEncodeUtf16BE(uint32_t unit,
                                char *string)
 {
+    if (unit > 0xD7FF && unit < 0xE000)
+        return 0;
+
     if (unit < 0x10000)
     {
         BH_Write16BEu(string, unit);
@@ -261,7 +270,7 @@ size_t BH_UnicodeDecodeUtf32BE(const char *string,
 size_t BH_UnicodeEncodeUtf32LE(uint32_t unit,
                                char *string)
 {
-    if (unit > 0x1FFFFF)
+    if (unit > 0x1FFFFF || (unit > 0xD7FF && unit < 0xE000))
         return 0;
 
     BH_Write32LEu(string, unit);
@@ -272,7 +281,7 @@ size_t BH_UnicodeEncodeUtf32LE(uint32_t unit,
 size_t BH_UnicodeEncodeUtf32BE(uint32_t unit,
                                char *string)
 {
-    if (unit > 0x1FFFFF)
+    if (unit > 0x1FFFFF || (unit > 0xD7FF && unit < 0xE000))
         return 0;
 
     BH_Write32BEu(string, unit);
diff --git a/test/src/TestUnicode.c b/test/src/TestUnicode.c
new file mode 100644
index 0000000..8ee23b7
--- /dev/null
+++ b/test/src/TestUnicode.c
@@ -0,0 +1,225 @@
+#include <BH/Unit.h>
+#include <BH/String.h>
+#include <BH/IO.h>
+#include <stdlib.h>
+
+/* One decoder test vector; field roles as used by the Utf8 test below. */
+struct TestCase
+{
+    char *input;        /* Bytes passed to the decoder; NULL ends the table */
+    size_t size;        /* Number of input bytes handed to the decoder */
+    size_t read;        /* Expected return; (size_t)-1 accepts any non-zero */
+    uint32_t result;    /* Expected decoded unit; -1 in the error cases */
+};
+
+/*
+ * Check that case conversion round-trips for every code point below
+ * 0x110000. Five (input, result) pairs are accepted as exceptions.
+ */
+BH_UNIT_TEST(Case)
+{
+    size_t i, j;
+
+    for (i = 0; i < 0x110000; i++)
+    {
+        /* Lowering changed the value: upper-casing must restore it. */
+        /* Otherwise upper-case first and lower-case the result.     */
+        j = BH_UnicodeLower(i);
+        if (j != i)
+            j = BH_UnicodeUpper(j);
+        else
+            j = BH_UnicodeLower(BH_UnicodeUpper(i));
+
+        /* Skip the known pairs that do not map back to the input */
+        if ((i == 0x130 && j == 0x49) ||
+            (i == 0x131 && j == 0x69) ||
+            (i == 0x1C5 && j == 0x1C4) ||
+            (i == 0x1C8 && j == 0x1C7) ||
+            (i == 0x1CB && j == 0x1CA))
+        {
+            continue;
+        }
+
+        BH_VERIFY(i == j);
+    }
+
+    return 0;
+}
+
+/* Round-trip every code point through UTF-8, then decode a table of edge cases. */
+BH_UNIT_TEST(Utf8)
+{
+    const struct TestCase *current;
+    const struct TestCase cases[] =
+    {
+        /* Well-formed sequences: whole input is read, listed unit is decoded */
+        {"\x00", 1, 1, 0},
+        {"\xC2\x80", 2, 2, 0x80},
+        {"\xE0\xA0\x80", 3, 3, 0x800},
+        {"\xF0\x90\x80\x80", 4, 4, 0x10000},
+        {"\x7F", 1, 1, 0x7F},
+        {"\xDF\xBF", 2, 2, 0x7FF},
+        {"\xEF\xBF\xBF", 3, 3, 0xFFFF},
+        {"\xED\x9F\xBF", 3, 3, 0xD7FF},
+        {"\xEE\x80\x80", 3, 3, 0xE000},
+        {"\xEF\xBF\xBD", 3, 3, 0xFFFD},
+        {"H", 1, 1, 'H'},
+        {"\xCE\xBA", 2, 2, 0x3BA},
+
+        /* Start byte followed by a space: one byte read, error reported */
+        {"\xC0 ", 2, 1, -1},
+        {"\xC1 ", 2, 1, -1},
+        {"\xC2 ", 2, 1, -1},
+        {"\xC3 ", 2, 1, -1},
+        {"\xC4 ", 2, 1, -1},
+
+        /* Malformed sequences (read of -1: any non-zero length is accepted) */
+        {"\x80", 1, -1, -1},
+        {"\xBF", 1, -1, -1},
+        {"\xFE", 1, -1, -1},
+        {"\xFF", 1, -1, -1},
+
+        /* Overlong sequences */
+        {"\xC0\xAF", 2, 1, -1},
+
+        /* UTF-16 surrogates (0xD800-0xDFFF) encoded as UTF-8 */
+        {"\xED\xA0\x80", 3, -1, -1},
+        {"\xED\xAE\x80", 3, -1, -1},
+        {"\xED\xB0\x80", 3, -1, -1},
+        {"\xED\xBF\xBF", 3, -1, -1},
+
+        {NULL, 0, 0, 0} /* Sentinel: a NULL input ends the loop below */
+    };
+    char buffer[8];
+    uint32_t unit;
+    size_t i, outSize, inSize;
+
+    /* Encode and decode every code point in the Unicode range */
+    for (i = 0; i < 0x110000; i++)
+    {
+        inSize = BH_UnicodeEncodeUtf8(i, buffer);
+
+        /* Surrogates must be rejected by the encoder */
+        if (i > 0xD7FF && i < 0xE000)
+        {
+            BH_VERIFY(inSize == 0);
+            continue;
+        }
+
+        BH_VERIFY(inSize > 0);
+        outSize = BH_UnicodeDecodeUtf8(buffer, inSize, &unit);
+        BH_VERIFY(inSize == outSize);
+        BH_VERIFY(unit == i);
+    }
+
+    /* Test special cases; an expected read of -1 matches any non-zero result */
+    for (current = cases; current->input; current++)
+    {
+        i = BH_UnicodeDecodeUtf8(current->input, current->size, &unit);
+        if (current->read == (size_t)-1 && i)
+            i = -1;
+
+        if (i != current->read || unit != current->result)
+        {
+            printf("\tcase %d\n", (int)(current - cases));
+            BH_VERIFY(i == current->read);
+            BH_VERIFY(unit == current->result);
+        }
+    }
+
+    return 0;
+}
+
+/* Round-trip every code point through the UTF-16 LE and BE codecs. */
+BH_UNIT_TEST(Utf16)
+{
+    char buffer[8];
+    uint32_t unit;
+    size_t i, outSize, inSize;
+
+    /* Encode and decode every code point in the Unicode range */
+    for (i = 0; i < 0x110000; i++)
+    {
+        /* Surrogates must be rejected by the LE and the BE encoder alike */
+        if (i > 0xD7FF && i < 0xE000)
+        {
+            inSize = BH_UnicodeEncodeUtf16LE(i, buffer);
+            BH_VERIFY(inSize == 0);
+            inSize = BH_UnicodeEncodeUtf16BE(i, buffer);
+            BH_VERIFY(inSize == 0);
+            continue;
+        }
+
+        /* Check for little endian */
+        inSize = BH_UnicodeEncodeUtf16LE(i, buffer);
+        BH_VERIFY(inSize > 0);
+
+        outSize = BH_UnicodeDecodeUtf16LE(buffer, inSize, &unit);
+        BH_VERIFY(inSize == outSize);
+        BH_VERIFY(unit == i);
+
+        /* Check for big endian */
+        inSize = BH_UnicodeEncodeUtf16BE(i, buffer);
+        BH_VERIFY(inSize > 0);
+
+        outSize = BH_UnicodeDecodeUtf16BE(buffer, inSize, &unit);
+        BH_VERIFY(inSize == outSize);
+        BH_VERIFY(unit == i);
+    }
+
+    return 0;
+}
+
+/* Round-trip every code point through the UTF-32 LE and BE codecs. */
+BH_UNIT_TEST(Utf32)
+{
+    char buffer[8];
+    uint32_t unit;
+    size_t i, outSize, inSize;
+
+    /* Encode and decode every code point in the Unicode range */
+    for (i = 0; i < 0x110000; i++)
+    {
+        /* Surrogates must be rejected by the LE and the BE encoder alike */
+        if (i > 0xD7FF && i < 0xE000)
+        {
+            inSize = BH_UnicodeEncodeUtf32LE(i, buffer);
+            BH_VERIFY(inSize == 0);
+            inSize = BH_UnicodeEncodeUtf32BE(i, buffer);
+            BH_VERIFY(inSize == 0);
+            continue;
+        }
+
+        /* Check for little endian */
+        inSize = BH_UnicodeEncodeUtf32LE(i, buffer);
+        BH_VERIFY(inSize > 0);
+
+        outSize = BH_UnicodeDecodeUtf32LE(buffer, inSize, &unit);
+        BH_VERIFY(inSize == outSize);
+        BH_VERIFY(unit == i);
+
+        /* Check for big endian */
+        inSize = BH_UnicodeEncodeUtf32BE(i, buffer);
+        BH_VERIFY(inSize > 0);
+
+        outSize = BH_UnicodeDecodeUtf32BE(buffer, inSize, &unit);
+        BH_VERIFY(inSize == outSize);
+        BH_VERIFY(unit == i);
+    }
+
+    return 0;
+}
+
+/* Add the four test cases with BH_UNIT_ADD and return the BH_UnitRun() result. */
+int main(int argc, char **argv)
+{
+    BH_UNUSED(argc);
+    BH_UNUSED(argv);
+
+    BH_UNIT_ADD(Case);
+    BH_UNIT_ADD(Utf8);
+    BH_UNIT_ADD(Utf16);
+    BH_UNIT_ADD(Utf32);
+
+    return BH_UnitRun();
+}
diff --git a/util/whitespace.sh b/util/whitespace.sh
index f661aa3..325683e 100755
--- a/util/whitespace.sh
+++ b/util/whitespace.sh
@@ -1,3 +1,3 @@
 #!/bin/sh
-find . \( -iname "*.h" -o -iname "*.c" \) -exec sed -i .bak "s/[ ]*$//" {} \;
+find . \( -iname "*.h" -o -iname "*.c" -o -iname "*.md" \) -exec sed -i.bak "s/[ ]*$//" {} \;
 find . -iname "*.bak" -exec rm {} \;
author	Mikhail Romanko <me@blankhex.com>	2025-04-06 14:11:38 +0300
committer	Mikhail Romanko <me@blankhex.com>	2025-04-06 14:11:38 +0300
commit	dd15b42b447a2f668849f38991eeed71594bb395 (patch)
tree	4023bdf7e1c1275c6574071d3b536ecf3cf6d1f2
parent	6aee5a83aa009c5e2cd5be5278c0b3b1fdb1325d (diff)
download	bhlib-dd15b42b447a2f668849f38991eeed71594bb395.tar.gz