11 files changed, 472 insertions, 11 deletions
diff --git a/README.md b/README.md
index f283547..7560ae6 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,9 @@ Here is a short list of implemented features:
 - Abstraction over input/output
 - Basic data structures and algorithms (hashmap, queue, heaps, partitions)
 - Geomtric primitives (vectors, matrices, quaternions, rays, boxes)
+- Thread support functions and structures (thread, mutex, cv, atomics, etc.)
+- Unicode support functions (BMP, UTF-8, UTF-16, UTF-32)
+- String conversion from/to numbers
 
 For more information about currently implemented and planned features checkout
 [this page](doc/Features.md)
diff --git a/doc/Examples/CMakeLists.txt b/doc/Examples/CMakeLists.txt
index 7adcf05..5a31dc3 100644
--- a/doc/Examples/CMakeLists.txt
+++ b/doc/Examples/CMakeLists.txt
@@ -1,3 +1,6 @@
 # PakReader
 add_executable(PakReader PakReader.c)
-target_link_libraries(PakReader BHLib)
-\ No newline at end of file
+target_link_libraries(PakReader BHLib)
+
+add_executable(Utf8Test Utf8Test.c)
+target_link_libraries(Utf8Test BHLib)
+\ No newline at end of file
diff --git a/doc/Examples/UTF-8-test.txt b/doc/Examples/UTF-8-test.txt
new file mode 100644
index 0000000..a5b5d50
--- /dev/null
+++ b/doc/Examples/UTF-8-test.txt
diff --git a/doc/Examples/Utf8Test.c b/doc/Examples/Utf8Test.c
new file mode 100644
index 0000000..e6d9e56
--- /dev/null
+++ b/doc/Examples/Utf8Test.c
@@ -0,0 +1,66 @@
+#include <BH/IO.h>
+#include <BH/String.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+
+void printUsage(void)
+{
+    fputs("Utf8Test <input> <output>\n", stdout);   /* expected command line */
+    exit(1);                                        /* does not return */
+}
+
+
+int main(int argc, char **argv)
+{
+    BH_IO *inFile, *outFile;
+    char inBuffer[8], outBuffer[8];
+    uint32_t unit;
+    size_t i, inSize, outSize;
+
+    /* Both paths are required: argv[2] is opened as the output file */
+    if (argc < 3)
+        printUsage();
+
+    inFile = BH_FileNew(argv[1]);
+    outFile = BH_FileNew(argv[2]);
+    if (!inFile || BH_IOOpen(inFile, BH_IO_READ | BH_IO_EXIST))
+        return -1;
+    if (!outFile || BH_IOOpen(outFile, BH_IO_WRITE | BH_IO_TRUNCATE))
+        return -1;
+
+    inSize = 0;
+    while (!(BH_IOFlags(inFile) & BH_IO_FLAG_EOF))
+    {
+        /* Need more input: read one byte, leave the loop on a read error */
+        if (!inSize || !(outSize = BH_UnicodeDecodeUtf8(inBuffer, inSize, &unit)))
+        {
+            if (BH_IORead(inFile, inBuffer + inSize, 1, &outSize))
+                break;
+            inSize += outSize;
+            continue;
+        }
+
+        /* Drop the consumed bytes from the front of the buffer */
+        for (i = 0; i < inSize - outSize; i++)
+            inBuffer[i] = inBuffer[i + outSize];
+        inSize -= outSize;
+
+        /* Replace an undecodable unit with U+FFFD, then write it out */
+        if (unit == (uint32_t)-1)
+            unit = 0xFFFD;
+        outSize = BH_UnicodeEncodeUtf8(unit, outBuffer);
+        BH_IOWrite(outFile, outBuffer, outSize, NULL);
+    }
+
+    /* Leftover bytes are an incomplete UTF-8 sequence */
+    if (inSize)
+    {
+        outSize = BH_UnicodeEncodeUtf8(0xFFFD, outBuffer);
+        BH_IOWrite(outFile, outBuffer, outSize, NULL);
+    }
+
+    BH_IOFree(inFile);
+    BH_IOFree(outFile);
+    return 0;
+}
diff --git a/doc/Features.md b/doc/Features.md
index 8dd7926..31da745 100644
--- a/doc/Features.md
+++ b/doc/Features.md
@@ -11,15 +11,15 @@ Currently implemented features:
 - Intersection calculation (ray, boxes, segments, lines, planes, triangles)
 - Unit testing library (for internal usage)
 - Command-line interface utilities
+- Thread support (thread, mutex, cv, atomics, etc.)
+- Unicode support functions (BMP, UTF-8, UTF-16, UTF-32)
+- String conversion functions from/to numbers
 
 ## Planned features
 
 Currently planned features:
-
-- Thread support (thread, mutex, cv, atomics, etc.)
 - Image loading/processing/saving support
 - Polygon rasterization (possibly canvas)
-- UTF8 string support (BMP handling)
 - Font rendering
 - GUI (Windows GUI, X11)
 - Deflate/Inflate implementation
diff --git a/doc/HowTo.md b/doc/HowTo.md
index 37dd9a6..8fcafd5 100644
--- a/doc/HowTo.md
+++ b/doc/HowTo.md
@@ -4,3 +4,5 @@ For the time being there is only one HowTo guide:
 
 - [Writing PACK reader utility](HowTo/PakReader.md) which covers the basics of
   using IO, Args and Utils modules.
+- [Basic UTF-8 to UTF-8 transcoder](HowTo/Utf8Test.md) which covers the basics
+  of using IO and String modules.
diff --git a/doc/HowTo/PakReader.md b/doc/HowTo/PakReader.md
index e7071c3..4f75399 100644
--- a/doc/HowTo/PakReader.md
+++ b/doc/HowTo/PakReader.md
@@ -323,7 +323,7 @@ static int CopyData(BH_IO *from,
 
         if (BH_IORead(from, tmp, length, &actual) || length != actual)
             return BH_ERROR;
-        
+
         if (BH_IOWrite(to, tmp, length, &actual) || length != actual)
             return BH_ERROR;
     }
@@ -344,7 +344,7 @@ static int ProcessPack(Config *config,
     /* Read header and seek to begging of the file table */
     if (ParseHeader(io, &header))
         return BH_ERROR;
-    
+
     if (BH_IOSeek(io, header.offset, BH_IO_SEEK_SET))
         return BH_ERROR;
 
@@ -362,7 +362,7 @@ static int ProcessPack(Config *config,
                 continue;
 
             output = BH_FileNew(config->output);
-            if (BH_IOOpen(output, BH_IO_WRITE) || 
+            if (BH_IOOpen(output, BH_IO_WRITE) ||
                 BH_IOSeek(io, entry.offset, BH_IO_SEEK_SET) ||
                 CopyData(io, output, entry.size))
             {
@@ -374,7 +374,7 @@ static int ProcessPack(Config *config,
             return BH_OK;
         }
     }
-    
+
     if (config->list)
         return BH_OK;
     return BH_ERROR;
diff --git a/doc/HowTo/Utf8Test.md b/doc/HowTo/Utf8Test.md
new file mode 100644
index 0000000..7e82ef0
--- /dev/null
+++ b/doc/HowTo/Utf8Test.md
@@ -0,0 +1,153 @@
+# HowTo: Transcoding UTF-8 to UTF-8
+
+## Prerequisites
+
+We want to implement a simple command-line utility that transcodes a UTF-8 file
+into another UTF-8 file, replacing any invalid UTF-8 sequences with U+FFFD.
+
+To do this we would run the following command:
+
+```sh
+./Utf8Test UTF-8-test.txt UTF-8-out.txt
+```
+
+## Includes
+
+To implement this utility, we are going to need to include the following headers:
+
+- `BH/IO.h` to work with files (or input/output devices)
+- `BH/String.h` to work with UTF-8 sequences
+
+## Working with Files
+
+Working with files in BHLib is based around the IO device (called `BH_IO`).
+Firstly, you need to create an IO device with the `BH_FileNew` function.
+Secondly, you need to open the IO device with the `BH_IOOpen` function. While
+opening the IO device, you can specify in which mode it will work: reading
+(`BH_IO_READ`) or writing (`BH_IO_WRITE`). Additionally, we can specify whether
+the IO device (or in our case, the file) should exist before opening
+(`BH_IO_EXIST`), be truncated before opening (`BH_IO_TRUNCATE`), be created
+(`BH_IO_CREATE`), or be opened in append mode (`BH_IO_APPEND`).
+
+Here is an example for opening an existing file in read-only mode:
+
+```c
+BH_IO *io = BH_FileNew("coolfile.dat");
+if (BH_IOOpen(io, BH_IO_READ | BH_IO_EXIST))   /* non-zero: open failed */
+{
+    printf("Can't open file 'coolfile.dat'\n");
+    BH_IOFree(io);
+    return -1;
+}
+```
+
+## Working with UTF-8
+
+Reading UTF-8/UTF-16/UTF-32 is based around a simple loop:
+
+1. Read bytes from input (IO or memory) to some buffer.
+2. Call `BH_UnicodeDecodeUtf*`. If the return value is 0, there is not enough data yet, so go back to step 1. Otherwise, remove the consumed bytes from the front of the buffer.
+3. If the decoded code point equals -1, an error was encountered, so replace it with the code point 0xFFFD.
+
+Writing UTF-8/UTF-16/UTF-32 is straightforward:
+
+1. Call `BH_UnicodeEncodeUtf*`. If the return value is 0, the code point can't be encoded (it is either a surrogate or outside the valid range).
+2. Write data (to IO or memory).
+
+The decoding loop below is built around `BH_UnicodeDecodeUtf8(inBuffer, inSize, &unit)`:
+
+```c
+while (...)
+{
+    /* Need more input: read one byte, leave the loop on a read error */
+    if (!inSize || !(outSize = BH_UnicodeDecodeUtf8(inBuffer, inSize, &unit)))
+    {
+        if (BH_IORead(inFile, inBuffer + inSize, 1, &outSize))
+            break;
+        inSize += outSize;
+        continue;
+    }
+
+    /* Drop the consumed bytes from the front of the buffer */
+    for (i = 0; i < inSize - outSize; i++)
+        inBuffer[i] = inBuffer[i + outSize];
+    inSize -= outSize;
+
+    /* Replace an undecodable unit with U+FFFD, then write it out */
+    if (unit == (uint32_t)-1)
+        unit = 0xFFFD;
+    outSize = BH_UnicodeEncodeUtf8(unit, outBuffer);
+    BH_IOWrite(outFile, outBuffer, outSize, NULL);
+}
+```
+
+## Putting Everything Together
+
+```c
+#include <BH/IO.h>
+#include <BH/String.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+
+void printUsage(void)
+{
+    fputs("Utf8Test <input> <output>\n", stdout);   /* expected command line */
+    exit(1);                                        /* does not return */
+}
+
+
+int main(int argc, char **argv)
+{
+    BH_IO *inFile, *outFile;
+    char inBuffer[8], outBuffer[8];
+    uint32_t unit;
+    size_t i, inSize, outSize;
+
+    /* Both paths are required: argv[2] is opened as the output file */
+    if (argc < 3)
+        printUsage();
+
+    inFile = BH_FileNew(argv[1]);
+    outFile = BH_FileNew(argv[2]);
+    if (!inFile || BH_IOOpen(inFile, BH_IO_READ | BH_IO_EXIST))
+        return -1;
+    if (!outFile || BH_IOOpen(outFile, BH_IO_WRITE | BH_IO_TRUNCATE))
+        return -1;
+
+    inSize = 0;
+    while (!(BH_IOFlags(inFile) & BH_IO_FLAG_EOF))
+    {
+        /* Need more input: read one byte, leave the loop on a read error */
+        if (!inSize || !(outSize = BH_UnicodeDecodeUtf8(inBuffer, inSize, &unit)))
+        {
+            if (BH_IORead(inFile, inBuffer + inSize, 1, &outSize))
+                break;
+            inSize += outSize;
+            continue;
+        }
+
+        /* Drop the consumed bytes from the front of the buffer */
+        for (i = 0; i < inSize - outSize; i++)
+            inBuffer[i] = inBuffer[i + outSize];
+        inSize -= outSize;
+
+        /* Replace an undecodable unit with U+FFFD, then write it out */
+        if (unit == (uint32_t)-1)
+            unit = 0xFFFD;
+        outSize = BH_UnicodeEncodeUtf8(unit, outBuffer);
+        BH_IOWrite(outFile, outBuffer, outSize, NULL);
+    }
+
+    /* Leftover bytes are an incomplete UTF-8 sequence */
+    if (inSize)
+    {
+        outSize = BH_UnicodeEncodeUtf8(0xFFFD, outBuffer);
+        BH_IOWrite(outFile, outBuffer, outSize, NULL);
+    }
+
+    BH_IOFree(inFile);
+    BH_IOFree(outFile);
+    return 0;
+}
+```
diff --git a/src/String/Unicode.c b/src/String/Unicode.c
index 1f0eaf0..26b9670 100644
--- a/src/String/Unicode.c
+++ b/src/String/Unicode.c
@@ -92,6 +92,9 @@ size_t BH_UnicodeEncodeUtf8(uint32_t unit,
 {
     size_t result;
 
+    if (unit > 0xD7FF && unit < 0xE000)
+        return 0;
+
     result = 0;
     if (unit < 0x80ul)
     {
@@ -199,6 +202,9 @@ size_t BH_UnicodeDecodeUtf16BE(const char *string,
 size_t BH_UnicodeEncodeUtf16LE(uint32_t unit,
                                char *string)
 {
+    if (unit > 0xD7FF && unit < 0xE000)
+        return 0;
+
     if (unit < 0x10000)
     {
         BH_Write16LEu(string, unit);
@@ -218,6 +224,9 @@ size_t BH_UnicodeEncodeUtf16LE(uint32_t unit,
 size_t BH_UnicodeEncodeUtf16BE(uint32_t unit,
                                char *string)
 {
+    if (unit > 0xD7FF && unit < 0xE000)
+        return 0;
+
     if (unit < 0x10000)
     {
         BH_Write16BEu(string, unit);
@@ -261,7 +270,7 @@ size_t BH_UnicodeDecodeUtf32BE(const char *string,
 size_t BH_UnicodeEncodeUtf32LE(uint32_t unit,
                                char *string)
 {
-    if (unit > 0x1FFFFF)
+    if (unit > 0x1FFFFF || (unit > 0xD7FF && unit < 0xE000))
         return 0;
 
     BH_Write32LEu(string, unit);
@@ -272,7 +281,7 @@ size_t BH_UnicodeEncodeUtf32LE(uint32_t unit,
 size_t BH_UnicodeEncodeUtf32BE(uint32_t unit,
                                char *string)
 {
-    if (unit > 0x1FFFFF)
+    if (unit > 0x1FFFFF || (unit > 0xD7FF && unit < 0xE000))
         return 0;
 
     BH_Write32BEu(string, unit);
diff --git a/test/src/TestUnicode.c b/test/src/TestUnicode.c
new file mode 100644
index 0000000..8ee23b7
--- /dev/null
+++ b/test/src/TestUnicode.c
@@ -0,0 +1,225 @@
+#include <BH/Unit.h>
+#include <BH/String.h>
+#include <BH/IO.h>
+#include <stdlib.h>
+
+
+struct TestCase         /* One entry of a table-driven decoder test */
+{
+    char *input;        /* Bytes handed to the decoder under test */
+    size_t size;        /* Number of input bytes made available */
+    size_t read;        /* Expected return value; (size_t)-1 = any non-zero */
+    uint32_t result;    /* Expected decoded unit; -1 for rejected input */
+};
+
+
+BH_UNIT_TEST(Case)  /* Case-mapping round trip over every code point */
+{
+    size_t i, j;
+
+    for (i = 0; i < 0x110000; i++)
+    {
+        j = BH_UnicodeLower(i);
+        if (j == i)             /* no distinct lower case form */
+        {
+            j = BH_UnicodeUpper(i);     /* go up, then back down */
+            j = BH_UnicodeLower(j);
+        }
+        else                    /* has a lower case form: map it back up */
+            j = BH_UnicodeUpper(j);
+
+        /* Known (start, end) pairs where the round trip does not return */
+        if (i == 0x130 && j == 0x49)
+            continue;
+        else if (i == 0x131 && j == 0x69)
+            continue;
+        else if (i == 0x1C5 && j == 0x1C4)
+            continue;
+        else if (i == 0x1C8 && j == 0x1C7)
+            continue;
+        else if (i == 0x1CB && j == 0x1CA)
+            continue;
+
+        BH_VERIFY(i == j);
+    }
+
+    return 0;
+}
+
+
+BH_UNIT_TEST(Utf8)  /* UTF-8 encoder/decoder round trip plus decode table */
+{
+    const struct TestCase *current;
+    const struct TestCase cases[] =
+    {
+        /* Well-formed sequences, including length and range boundaries */
+        {"\x00", 1, 1, 0},
+        {"\xC2\x80", 2, 2, 0x80},
+        {"\xE0\xA0\x80", 3, 3, 0x800},
+        {"\xF0\x90\x80\x80", 4, 4, 0x10000},
+        {"\x7F", 1, 1, 0x7F},
+        {"\xDF\xBF", 2, 2, 0x7FF},
+        {"\xEF\xBF\xBF", 3, 3, 0xFFFF},
+        {"\xED\x9F\xBF", 3, 3, 0xD7FF},
+        {"\xEE\x80\x80", 3, 3, 0xE000},
+        {"\xEF\xBF\xBD", 3, 3, 0xFFFD},
+        {"H", 1, 1, 'H'},
+        {"\xCE\xBA", 2, 2, 0x3BA},
+
+        /* Start byte followed by a non-continuation byte (a space) */
+        {"\xC0 ", 2, 1, -1},
+        {"\xC1 ", 2, 1, -1},
+        {"\xC2 ", 2, 1, -1},
+        {"\xC3 ", 2, 1, -1},
+        {"\xC4 ", 2, 1, -1},
+
+        /* Stray continuation bytes and invalid start bytes */
+        {"\x80", 1, -1, -1},
+        {"\xBF", 1, -1, -1},
+        {"\xFE", 1, -1, -1},
+        {"\xFF", 1, -1, -1},
+
+        /* Overlong sequences */
+        {"\xC0\xAF", 2, 1, -1},
+
+        /* UTF-8 encoded UTF-16 surrogates (0xD800-0xDFFF) */
+        {"\xED\xA0\x80", 3, -1, -1},
+        {"\xED\xAE\x80", 3, -1, -1},
+        {"\xED\xB0\x80", 3, -1, -1},
+        {"\xED\xBF\xBF", 3, -1, -1},
+
+        {NULL, 0, 0, 0}     /* sentinel: a NULL input ends the table */
+    };
+    char buffer[8];
+    uint32_t unit;
+    size_t i, outSize, inSize;
+
+    /* Encode and decode every code point below 0x110000 */
+    for (i = 0; i < 0x110000; i++)
+    {
+        inSize = BH_UnicodeEncodeUtf8(i, buffer);
+
+        /* Surrogates must be rejected by the encoder */
+        if (i > 0xD7FF && i < 0xE000)
+        {
+            BH_VERIFY(inSize == 0);
+            continue;
+        }
+
+        BH_VERIFY(inSize > 0);
+        outSize = BH_UnicodeDecodeUtf8(buffer, inSize, &unit);
+        BH_VERIFY(inSize == outSize);
+        BH_VERIFY(unit == i);
+    }
+
+    /* Decode table; read == (size_t)-1 accepts any non-zero return value */
+    for (current = cases; current->input; current++)
+    {
+        i = BH_UnicodeDecodeUtf8(current->input, current->size, &unit);
+        if (current->read == (size_t)-1 && i)
+            i = -1;
+
+        if (i != current->read || unit != current->result)
+        {
+            printf("\tcase %d\n", (int)(current - cases));  /* failing index */
+            BH_VERIFY(i == current->read);
+            BH_VERIFY(unit == current->result);
+        }
+    }
+
+    return 0;
+}
+
+
+BH_UNIT_TEST(Utf16)
+{
+    char buffer[8];
+    uint32_t unit;
+    size_t i, outSize, inSize;
+
+    /* Encode and decode every code point below 0x110000 as UTF-16 */
+    for (i = 0; i < 0x110000; i++)
+    {
+        /* Surrogates must be rejected by both encoders. Check them up
+         * front: an early continue after the little endian check would
+         * leave the big endian encoder untested. */
+        if (i > 0xD7FF && i < 0xE000)
+        {
+            inSize = BH_UnicodeEncodeUtf16LE(i, buffer);
+            BH_VERIFY(inSize == 0);
+            inSize = BH_UnicodeEncodeUtf16BE(i, buffer);
+            BH_VERIFY(inSize == 0);
+            continue;
+        }
+
+        /* Check for little endian */
+        inSize = BH_UnicodeEncodeUtf16LE(i, buffer);
+        BH_VERIFY(inSize > 0);
+        outSize = BH_UnicodeDecodeUtf16LE(buffer, inSize, &unit);
+        BH_VERIFY(inSize == outSize);
+        BH_VERIFY(unit == i);
+
+        /* Check for big endian */
+        inSize = BH_UnicodeEncodeUtf16BE(i, buffer);
+        BH_VERIFY(inSize > 0);
+        outSize = BH_UnicodeDecodeUtf16BE(buffer, inSize, &unit);
+        BH_VERIFY(inSize == outSize);
+        BH_VERIFY(unit == i);
+    }
+
+    return 0;
+}
+
+
+BH_UNIT_TEST(Utf32)
+{
+    char buffer[8];
+    uint32_t unit;
+    size_t i, outSize, inSize;
+
+    /* Encode and decode every code point below 0x110000 as UTF-32 */
+    for (i = 0; i < 0x110000; i++)
+    {
+        /* Surrogates must be rejected by both encoders. Check them up
+         * front: an early continue after the little endian check would
+         * leave the big endian encoder untested. */
+        if (i > 0xD7FF && i < 0xE000)
+        {
+            inSize = BH_UnicodeEncodeUtf32LE(i, buffer);
+            BH_VERIFY(inSize == 0);
+            inSize = BH_UnicodeEncodeUtf32BE(i, buffer);
+            BH_VERIFY(inSize == 0);
+            continue;
+        }
+
+        /* Check for little endian */
+        inSize = BH_UnicodeEncodeUtf32LE(i, buffer);
+        BH_VERIFY(inSize > 0);
+        outSize = BH_UnicodeDecodeUtf32LE(buffer, inSize, &unit);
+        BH_VERIFY(inSize == outSize);
+        BH_VERIFY(unit == i);
+
+        /* Check for big endian */
+        inSize = BH_UnicodeEncodeUtf32BE(i, buffer);
+        BH_VERIFY(inSize > 0);
+        outSize = BH_UnicodeDecodeUtf32BE(buffer, inSize, &unit);
+        BH_VERIFY(inSize == outSize);
+        BH_VERIFY(unit == i);
+    }
+
+    return 0;
+}
+
+
+int main(int argc, char **argv)
+{
+    BH_UNUSED(argc);    /* command-line arguments are not used */
+    BH_UNUSED(argv);
+
+    BH_UNIT_ADD(Case);  /* add the tests defined in this file */
+    BH_UNIT_ADD(Utf8);
+    BH_UNIT_ADD(Utf16);
+    BH_UNIT_ADD(Utf32);
+
+    return BH_UnitRun();    /* runner result becomes the exit status */
+}
diff --git a/util/whitespace.sh b/util/whitespace.sh
index f661aa3..325683e 100755
--- a/util/whitespace.sh
+++ b/util/whitespace.sh
@@ -1,3 +1,3 @@
 #!/bin/sh
-find . \( -iname "*.h" -o -iname "*.c" \) -exec sed -i .bak "s/[ ]*$//" {} \;
+find . \( -iname "*.h" -o -iname "*.c" -o -iname "*.md" \) -exec sed -i.bak "s/[ ]*$//" {} \;
 find . -iname "*.bak" -exec rm {} \;