aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--README.md3
-rw-r--r--doc/Examples/CMakeLists.txt5
-rw-r--r--doc/Examples/UTF-8-test.txtbin0 -> 22781 bytes
-rw-r--r--doc/Examples/Utf8Test.c66
-rw-r--r--doc/Features.md6
-rw-r--r--doc/HowTo.md2
-rw-r--r--doc/HowTo/PakReader.md8
-rw-r--r--doc/HowTo/Utf8Test.md153
-rw-r--r--src/String/Unicode.c13
-rw-r--r--test/src/TestUnicode.c225
-rwxr-xr-xutil/whitespace.sh2
11 files changed, 472 insertions, 11 deletions
diff --git a/README.md b/README.md
index f283547..7560ae6 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,9 @@ Here is a short list of implemented features:
- Abstraction over input/output
- Basic data structures and algorithms (hashmap, queue, heaps, partitions)
- Geomtric primitives (vectors, matrices, quaternions, rays, boxes)
+- Thread support functions and structures (thread, mutex, cv, atomics, etc.)
+- Unicode support functions (BMP, UTF-8, UTF-16, UTF-32)
+- String conversion from/to numbers
For more information about currently implemented and planned features checkout
[this page](doc/Features.md)
diff --git a/doc/Examples/CMakeLists.txt b/doc/Examples/CMakeLists.txt
index 7adcf05..5a31dc3 100644
--- a/doc/Examples/CMakeLists.txt
+++ b/doc/Examples/CMakeLists.txt
@@ -1,3 +1,6 @@
# PakReader
add_executable(PakReader PakReader.c)
-target_link_libraries(PakReader BHLib) \ No newline at end of file
+target_link_libraries(PakReader BHLib)
+
+add_executable(Utf8Test Utf8Test.c)
+target_link_libraries(Utf8Test BHLib) \ No newline at end of file
diff --git a/doc/Examples/UTF-8-test.txt b/doc/Examples/UTF-8-test.txt
new file mode 100644
index 0000000..a5b5d50
--- /dev/null
+++ b/doc/Examples/UTF-8-test.txt
Binary files differ
diff --git a/doc/Examples/Utf8Test.c b/doc/Examples/Utf8Test.c
new file mode 100644
index 0000000..e6d9e56
--- /dev/null
+++ b/doc/Examples/Utf8Test.c
@@ -0,0 +1,66 @@
+#include <BH/IO.h>
+#include <BH/String.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+
+/* Prints the expected command line and terminates with exit status 1 */
+void printUsage(void)
+{
+    fputs("Utf8Test <input> <output>\n", stdout);
+    exit(1);
+}
+
+
+/*
+ * Transcodes the UTF-8 file named by argv[1] into the file named by argv[2],
+ * writing U+FFFD in place of every sequence the decoder reports as invalid.
+ *
+ * Returns 0 on success and -1 if either file can't be created or opened.
+ * Prints the usage line and exits with status 1 when an argument is missing.
+ */
+int main(int argc, char **argv)
+{
+    BH_IO *inFile, *outFile;
+    char inBuffer[8], outBuffer[8];
+    uint32_t unit;
+    size_t i, inSize, outSize;
+    int result;
+
+    /* argv[1] and argv[2] are both used below, so both must be present */
+    if (argc < 3)
+        printUsage();
+
+    result = -1;
+    inFile = BH_FileNew(argv[1]);
+    outFile = BH_FileNew(argv[2]);
+
+    if (!inFile || BH_IOOpen(inFile, BH_IO_READ | BH_IO_EXIST))
+        goto cleanup;
+
+    if (!outFile || BH_IOOpen(outFile, BH_IO_WRITE | BH_IO_TRUNCATE))
+        goto cleanup;
+
+    inSize = 0;
+    while (!(BH_IOFlags(inFile) & BH_IO_FLAG_EOF))
+    {
+        /* Nothing buffered, or the sequence is incomplete: read one more byte */
+        if (!inSize || !(outSize = BH_UnicodeDecodeUtf8(inBuffer, inSize, &unit)))
+        {
+            /* Start from zero so a read that stores nothing adds nothing */
+            outSize = 0;
+            BH_IORead(inFile, inBuffer + inSize, 1, &outSize);
+            inSize += outSize;
+            continue;
+        }
+
+        /* Drop the consumed bytes from the front of the buffer */
+        for (i = 0; i < inSize - outSize; i++)
+            inBuffer[i] = inBuffer[i + outSize];
+        inSize -= outSize;
+
+        /* Replace an invalid unit with U+FFFD and write it to the output */
+        if (unit == (uint32_t)-1)
+            unit = 0xFFFD;
+        outSize = BH_UnicodeEncodeUtf8(unit, outBuffer);
+        BH_IOWrite(outFile, outBuffer, outSize, NULL);
+    }
+
+    /* Incomplete UTF-8 sequence left in the buffer at end of input */
+    if (inSize)
+    {
+        outSize = BH_UnicodeEncodeUtf8(0xFFFD, outBuffer);
+        BH_IOWrite(outFile, outBuffer, outSize, NULL);
+    }
+
+    result = 0;
+
+cleanup:
+    /* Release whichever devices were created, on success and failure alike */
+    if (inFile)
+        BH_IOFree(inFile);
+    if (outFile)
+        BH_IOFree(outFile);
+    return result;
+}
diff --git a/doc/Features.md b/doc/Features.md
index 8dd7926..31da745 100644
--- a/doc/Features.md
+++ b/doc/Features.md
@@ -11,15 +11,15 @@ Currently implemented features:
- Intersection calculation (ray, boxes, segments, lines, planes, triangles)
- Unit testing library (for internal usage)
- Command-line interface utilities
+- Thread support (thread, mutex, cv, atomics, etc.)
+- Unicode support functions (BMP, UTF-8, UTF-16, UTF-32)
+- String conversion functions from/to numbers
## Planned features
Currently planned features:
-
-- Thread support (thread, mutex, cv, atomics, etc.)
- Image loading/processing/saving support
- Polygon rasterization (possibly canvas)
-- UTF8 string support (BMP handling)
- Font rendering
- GUI (Windows GUI, X11)
- Deflate/Inflate implementation
diff --git a/doc/HowTo.md b/doc/HowTo.md
index 37dd9a6..8fcafd5 100644
--- a/doc/HowTo.md
+++ b/doc/HowTo.md
@@ -4,3 +4,5 @@ For the time being there is only one HowTo guide:
- [Writing PACK reader utility](HowTo/PakReader.md) which covers the basics of
using IO, Args and Utils modules.
+- [Basic UTF-8 to UTF-8 transcoder](HowTo/Utf8Test.md) which covers the basics
+ of using IO and String modules.
diff --git a/doc/HowTo/PakReader.md b/doc/HowTo/PakReader.md
index e7071c3..4f75399 100644
--- a/doc/HowTo/PakReader.md
+++ b/doc/HowTo/PakReader.md
@@ -323,7 +323,7 @@ static int CopyData(BH_IO *from,
if (BH_IORead(from, tmp, length, &actual) || length != actual)
return BH_ERROR;
-
+
if (BH_IOWrite(to, tmp, length, &actual) || length != actual)
return BH_ERROR;
}
@@ -344,7 +344,7 @@ static int ProcessPack(Config *config,
/* Read header and seek to begging of the file table */
if (ParseHeader(io, &header))
return BH_ERROR;
-
+
if (BH_IOSeek(io, header.offset, BH_IO_SEEK_SET))
return BH_ERROR;
@@ -362,7 +362,7 @@ static int ProcessPack(Config *config,
continue;
output = BH_FileNew(config->output);
- if (BH_IOOpen(output, BH_IO_WRITE) ||
+ if (BH_IOOpen(output, BH_IO_WRITE) ||
BH_IOSeek(io, entry.offset, BH_IO_SEEK_SET) ||
CopyData(io, output, entry.size))
{
@@ -374,7 +374,7 @@ static int ProcessPack(Config *config,
return BH_OK;
}
}
-
+
if (config->list)
return BH_OK;
return BH_ERROR;
diff --git a/doc/HowTo/Utf8Test.md b/doc/HowTo/Utf8Test.md
new file mode 100644
index 0000000..7e82ef0
--- /dev/null
+++ b/doc/HowTo/Utf8Test.md
@@ -0,0 +1,153 @@
+# HowTo: Transcoding UTF-8 to UTF-8
+
+## Prerequisites
+
+We want to implement a simple command-line utility that transcodes a UTF-8
+file into a UTF-8 file (in other words, replaces any incorrect UTF-8 sequences).
+
+To do this we would run the following command:
+
+```sh
+./Utf8Test UTF-8-test.txt UTF-8-out.txt
+```
+
+## Includes
+
+To implement this utility, we are going to need to include the following headers:
+
+- `BH/IO.h` to work with files (or input/output devices)
+- `BH/String.h` to work with UTF-8 sequences
+
+## Working with Files
+
+Working with files in BHLib is based around the IO device (called `BH_IO`).
+Firstly, you need to create an IO device with the `BH_FileNew` function.
+Secondly, you need to open the IO device with the `BH_IOOpen` function. While
+opening the IO device, you can specify in which mode it will work: reading
+(`BH_IO_READ`) or writing (`BH_IO_WRITE`). Additionally, we can specify whether
+the IO device (or in our case, the file) should exist before opening
+(`BH_IO_EXIST`), be truncated before opening (`BH_IO_TRUNCATE`), be
+created (`BH_IO_CREATE`), or be opened in append mode (`BH_IO_APPEND`).
+
+Here is an example for opening an existing file in read-only mode:
+
+```c
+BH_IO *io = BH_FileNew("coolfile.dat");
+/* A non-zero result from BH_IOOpen is treated as a failure to open */
+if (BH_IOOpen(io, BH_IO_READ | BH_IO_EXIST))
+{
+    printf("Can't open file 'coolfile.dat'\n");
+    BH_IOFree(io);
+    return -1;
+}
+```
+
+## Working with UTF-8
+
+Reading UTF-8/UTF-16/UTF-32 is based around a simple loop:
+
+1. Read bytes from the input (IO or memory) into a buffer.
+2. Call `BH_UnicodeDecodeUtf*`. If the return value is 0, there is not enough data yet, so go back to step 1. Otherwise, remove that many bytes from the front of the buffer.
+3. If the decoded code point equals -1, an error was encountered, so replace it with the code point 0xFFFD.
+
+Writing UTF-8/UTF-16/UTF-32 is straightforward:
+
+1. Call `BH_UnicodeEncodeUtf*`. If the return value is 0, the code point can't be encoded (it is either a surrogate or outside the valid range).
+2. Write the data (to IO or memory).
+
+The decoding loop below is built around the call `BH_UnicodeDecodeUtf8(inBuffer, inSize, &unit)`:
+
+```c
+
+/* Excerpt: decode buffered bytes one unit at a time and re-encode them */
+while (...)
+{
+ /* Nothing buffered, or the sequence is incomplete: read one more byte */
+ if (!inSize || !(outSize = BH_UnicodeDecodeUtf8(inBuffer, inSize, &unit)))
+ {
+ BH_IORead(inFile, inBuffer + inSize, 1, &outSize);
+ inSize += outSize;
+ continue;
+ }
+
+ /* Drop the consumed bytes from the front of the buffer */
+ for (i = 0; i < inSize - outSize; i++)
+ inBuffer[i] = inBuffer[i + outSize];
+ inSize -= outSize;
+
+ /* Replace an invalid unit with 0xFFFD and write it to the output */
+ if (unit == -1)
+ unit = 0xFFFD;
+ outSize = BH_UnicodeEncodeUtf8(unit, outBuffer);
+ BH_IOWrite(outFile, outBuffer, outSize, NULL);
+}
+```
+
+## Putting Everything Together
+
+```c
+#include <BH/IO.h>
+#include <BH/String.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+
+/* Prints the expected command line and terminates with exit status 1 */
+void printUsage(void)
+{
+    fputs("Utf8Test <input> <output>\n", stdout);
+    exit(1);
+}
+
+
+/*
+ * Transcodes the UTF-8 file named by argv[1] into the file named by argv[2],
+ * writing U+FFFD in place of every sequence the decoder reports as invalid.
+ *
+ * Returns 0 on success and -1 if either file can't be created or opened.
+ * Prints the usage line and exits with status 1 when an argument is missing.
+ */
+int main(int argc, char **argv)
+{
+    BH_IO *inFile, *outFile;
+    char inBuffer[8], outBuffer[8];
+    uint32_t unit;
+    size_t i, inSize, outSize;
+    int result;
+
+    /* argv[1] and argv[2] are both used below, so both must be present */
+    if (argc < 3)
+        printUsage();
+
+    result = -1;
+    inFile = BH_FileNew(argv[1]);
+    outFile = BH_FileNew(argv[2]);
+
+    if (!inFile || BH_IOOpen(inFile, BH_IO_READ | BH_IO_EXIST))
+        goto cleanup;
+
+    if (!outFile || BH_IOOpen(outFile, BH_IO_WRITE | BH_IO_TRUNCATE))
+        goto cleanup;
+
+    inSize = 0;
+    while (!(BH_IOFlags(inFile) & BH_IO_FLAG_EOF))
+    {
+        /* Nothing buffered, or the sequence is incomplete: read one more byte */
+        if (!inSize || !(outSize = BH_UnicodeDecodeUtf8(inBuffer, inSize, &unit)))
+        {
+            /* Start from zero so a read that stores nothing adds nothing */
+            outSize = 0;
+            BH_IORead(inFile, inBuffer + inSize, 1, &outSize);
+            inSize += outSize;
+            continue;
+        }
+
+        /* Drop the consumed bytes from the front of the buffer */
+        for (i = 0; i < inSize - outSize; i++)
+            inBuffer[i] = inBuffer[i + outSize];
+        inSize -= outSize;
+
+        /* Replace an invalid unit with U+FFFD and write it to the output */
+        if (unit == (uint32_t)-1)
+            unit = 0xFFFD;
+        outSize = BH_UnicodeEncodeUtf8(unit, outBuffer);
+        BH_IOWrite(outFile, outBuffer, outSize, NULL);
+    }
+
+    /* Incomplete UTF-8 sequence left in the buffer at end of input */
+    if (inSize)
+    {
+        outSize = BH_UnicodeEncodeUtf8(0xFFFD, outBuffer);
+        BH_IOWrite(outFile, outBuffer, outSize, NULL);
+    }
+
+    result = 0;
+
+cleanup:
+    /* Release whichever devices were created, on success and failure alike */
+    if (inFile)
+        BH_IOFree(inFile);
+    if (outFile)
+        BH_IOFree(outFile);
+    return result;
+}
+```
diff --git a/src/String/Unicode.c b/src/String/Unicode.c
index 1f0eaf0..26b9670 100644
--- a/src/String/Unicode.c
+++ b/src/String/Unicode.c
@@ -92,6 +92,9 @@ size_t BH_UnicodeEncodeUtf8(uint32_t unit,
{
size_t result;
+ if (unit > 0xD7FF && unit < 0xE000)
+ return 0;
+
result = 0;
if (unit < 0x80ul)
{
@@ -199,6 +202,9 @@ size_t BH_UnicodeDecodeUtf16BE(const char *string,
size_t BH_UnicodeEncodeUtf16LE(uint32_t unit,
char *string)
{
+ if (unit > 0xD7FF && unit < 0xE000)
+ return 0;
+
if (unit < 0x10000)
{
BH_Write16LEu(string, unit);
@@ -218,6 +224,9 @@ size_t BH_UnicodeEncodeUtf16LE(uint32_t unit,
size_t BH_UnicodeEncodeUtf16BE(uint32_t unit,
char *string)
{
+ if (unit > 0xD7FF && unit < 0xE000)
+ return 0;
+
if (unit < 0x10000)
{
BH_Write16BEu(string, unit);
@@ -261,7 +270,7 @@ size_t BH_UnicodeDecodeUtf32BE(const char *string,
size_t BH_UnicodeEncodeUtf32LE(uint32_t unit,
char *string)
{
- if (unit > 0x1FFFFF)
+ if (unit > 0x1FFFFF || (unit > 0xD7FF && unit < 0xE000))
return 0;
BH_Write32LEu(string, unit);
@@ -272,7 +281,7 @@ size_t BH_UnicodeEncodeUtf32LE(uint32_t unit,
size_t BH_UnicodeEncodeUtf32BE(uint32_t unit,
char *string)
{
- if (unit > 0x1FFFFF)
+ if (unit > 0x1FFFFF || (unit > 0xD7FF && unit < 0xE000))
return 0;
BH_Write32BEu(string, unit);
diff --git a/test/src/TestUnicode.c b/test/src/TestUnicode.c
new file mode 100644
index 0000000..8ee23b7
--- /dev/null
+++ b/test/src/TestUnicode.c
@@ -0,0 +1,225 @@
+#include <BH/Unit.h>
+#include <BH/String.h>
+#include <BH/IO.h>
+#include <stdlib.h>
+
+
+/* One hand-written decoder case, consumed by the Utf8 test */
+struct TestCase
+{
+ char *input; /* bytes handed to the decoder; NULL ends the table */
+ size_t size; /* number of input bytes offered to the decoder */
+ size_t read; /* expected return value; (size_t)-1 accepts any non-zero one */
+ uint32_t result; /* expected decoded unit; -1 is used for the invalid cases */
+};
+
+
+/*
+ * Case-mapping round trip for every value below 0x110000: a value changed by
+ * lowercasing must come back after uppercasing, and a value unchanged by
+ * lowercasing must come back after uppercasing and then lowercasing.
+ */
+BH_UNIT_TEST(Case)
+{
+    size_t i, j;
+
+    for (i = 0; i < 0x110000; i++)
+    {
+        j = BH_UnicodeLower(i);
+        if (j != i)
+        {
+            j = BH_UnicodeUpper(j);
+        }
+        else
+        {
+            j = BH_UnicodeUpper(i);
+            j = BH_UnicodeLower(j);
+        }
+
+        /* Listed pairs that are accepted even though they don't round-trip */
+        if ((i == 0x130 && j == 0x49) ||
+            (i == 0x131 && j == 0x69) ||
+            (i == 0x1C5 && j == 0x1C4) ||
+            (i == 0x1C8 && j == 0x1C7) ||
+            (i == 0x1CB && j == 0x1CA))
+            continue;
+
+        BH_VERIFY(i == j);
+    }
+
+    return 0;
+}
+
+
+/*
+ * UTF-8 checks: an encode/decode round trip for every value below 0x110000,
+ * followed by a table of hand-picked byte sequences fed to the decoder.
+ */
+BH_UNIT_TEST(Utf8)
+{
+ const struct TestCase *current;
+ /* Fields: input bytes, input size, expected return, expected unit */
+ const struct TestCase cases[] =
+ {
+ /* Normal cases */
+ {"\x00", 1, 1, 0},
+ {"\xC2\x80", 2, 2, 0x80},
+ {"\xE0\xA0\x80", 3, 3, 0x800},
+ {"\xF0\x90\x80\x80", 4, 4, 0x10000},
+ {"\x7F", 1, 1, 0x7F},
+ {"\xDF\xBF", 2, 2, 0x7FF},
+ {"\xEF\xBF\xBF", 3, 3, 0xFFFF},
+ {"\xED\x9F\xBF", 3, 3, 0xD7FF},
+ {"\xEE\x80\x80", 3, 3, 0xE000},
+ {"\xEF\xBF\xBD", 3, 3, 0xFFFD},
+ {"H", 1, 1, 'H'},
+ {"\xCE\xBA", 2, 2, 0x3BA},
+
+ /* Lonely start characters */
+ {"\xC0 ", 2, 1, -1},
+ {"\xC1 ", 2, 1, -1},
+ {"\xC2 ", 2, 1, -1},
+ {"\xC3 ", 2, 1, -1},
+ {"\xC4 ", 2, 1, -1},
+
+ /* Malformed sequences */
+ {"\x80", 1, -1, -1},
+ {"\xBF", 1, -1, -1},
+ {"\xFE", 1, -1, -1},
+ {"\xFF", 1, -1, -1},
+
+ /* Overlong sequences */
+ {"\xC0\xAF", 2, 1, -1},
+
+ /* UTF-16 surrogates encoded as UTF-8 */
+ {"\xED\xA0\x80", 3, -1, -1},
+ {"\xED\xAE\x80", 3, -1, -1},
+ {"\xED\xB0\x80", 3, -1, -1},
+ {"\xED\xBF\xBF", 3, -1, -1},
+
+ /* Terminator: the loop below stops at the first NULL input */
+ {NULL, 0, 0, 0}
+ };
+ char buffer[8];
+ uint32_t unit;
+ size_t i, outSize, inSize;
+
+ /* Encode and decode every value below 0x110000 */
+ for (i = 0; i < 0x110000; i++)
+ {
+ inSize = BH_UnicodeEncodeUtf8(i, buffer);
+
+ /* Surrogates (0xD800-0xDFFF) must be refused by the encoder */
+ if (i > 0xD7FF && i < 0xE000)
+ {
+ BH_VERIFY(inSize == 0);
+ continue;
+ }
+
+ BH_VERIFY(inSize > 0);
+ outSize = BH_UnicodeDecodeUtf8(buffer, inSize, &unit);
+ BH_VERIFY(inSize == outSize);
+ BH_VERIFY(unit == i);
+ }
+
+ /* Test special cases */
+ for (current = cases; current->input; current++)
+ {
+ i = BH_UnicodeDecodeUtf8(current->input, current->size, &unit);
+ /* An expected length of -1 matches any non-zero return value */
+ if (current->read == (size_t)-1 && i)
+ i = -1;
+
+ /* Print the index of the failing case before the checks report it */
+ if (i != current->read || unit != current->result)
+ {
+ printf("\tcase %d\n", (int)(current - cases));
+ BH_VERIFY(i == current->read);
+ BH_VERIFY(unit == current->result);
+ }
+ }
+
+ return 0;
+}
+
+
+/*
+ * UTF-16 checks: every value below 0x110000 must survive an encode/decode
+ * round trip in both byte orders, except surrogates (0xD800-0xDFFF), which
+ * both encoders must refuse by returning 0.
+ */
+BH_UNIT_TEST(Utf16)
+{
+    char buffer[8];
+    uint32_t unit;
+    size_t i, outSize, inSize;
+
+    for (i = 0; i < 0x110000; i++)
+    {
+        /*
+         * Surrogates: check both encoders before moving on. Skipping to the
+         * next value right after the little endian check would leave the big
+         * endian encoder untested for this range.
+         */
+        if (i > 0xD7FF && i < 0xE000)
+        {
+            inSize = BH_UnicodeEncodeUtf16LE(i, buffer);
+            BH_VERIFY(inSize == 0);
+            inSize = BH_UnicodeEncodeUtf16BE(i, buffer);
+            BH_VERIFY(inSize == 0);
+            continue;
+        }
+
+        /* Check for little endian */
+        inSize = BH_UnicodeEncodeUtf16LE(i, buffer);
+        BH_VERIFY(inSize > 0);
+
+        outSize = BH_UnicodeDecodeUtf16LE(buffer, inSize, &unit);
+        BH_VERIFY(inSize == outSize);
+        BH_VERIFY(unit == i);
+
+        /* Check for big endian */
+        inSize = BH_UnicodeEncodeUtf16BE(i, buffer);
+        BH_VERIFY(inSize > 0);
+
+        outSize = BH_UnicodeDecodeUtf16BE(buffer, inSize, &unit);
+        BH_VERIFY(inSize == outSize);
+        BH_VERIFY(unit == i);
+    }
+
+    return 0;
+}
+
+
+/*
+ * UTF-32 checks: every value below 0x110000 must survive an encode/decode
+ * round trip in both byte orders, except surrogates (0xD800-0xDFFF), which
+ * both encoders must refuse by returning 0.
+ */
+BH_UNIT_TEST(Utf32)
+{
+    char buffer[8];
+    uint32_t unit;
+    size_t i, outSize, inSize;
+
+    for (i = 0; i < 0x110000; i++)
+    {
+        /*
+         * Surrogates: check both encoders before moving on. Skipping to the
+         * next value right after the little endian check would leave the big
+         * endian encoder untested for this range.
+         */
+        if (i > 0xD7FF && i < 0xE000)
+        {
+            inSize = BH_UnicodeEncodeUtf32LE(i, buffer);
+            BH_VERIFY(inSize == 0);
+            inSize = BH_UnicodeEncodeUtf32BE(i, buffer);
+            BH_VERIFY(inSize == 0);
+            continue;
+        }
+
+        /* Check for little endian */
+        inSize = BH_UnicodeEncodeUtf32LE(i, buffer);
+        BH_VERIFY(inSize > 0);
+
+        outSize = BH_UnicodeDecodeUtf32LE(buffer, inSize, &unit);
+        BH_VERIFY(inSize == outSize);
+        BH_VERIFY(unit == i);
+
+        /* Check for big endian */
+        inSize = BH_UnicodeEncodeUtf32BE(i, buffer);
+        BH_VERIFY(inSize > 0);
+
+        outSize = BH_UnicodeDecodeUtf32BE(buffer, inSize, &unit);
+        BH_VERIFY(inSize == outSize);
+        BH_VERIFY(unit == i);
+    }
+
+    return 0;
+}
+
+
+/*
+ * Test entry point: adds the four tests defined in this file, in the order
+ * listed, and returns whatever BH_UnitRun() returns.
+ */
+int main(int argc, char **argv)
+{
+ /* Command-line arguments are not used by this test program */
+ BH_UNUSED(argc);
+ BH_UNUSED(argv);
+
+ BH_UNIT_ADD(Case);
+ BH_UNIT_ADD(Utf8);
+ BH_UNIT_ADD(Utf16);
+ BH_UNIT_ADD(Utf32);
+
+ /* NOTE(review): presumably runs the added tests and reports failure as non-zero - confirm in BH/Unit.h */
+ return BH_UnitRun();
+}
diff --git a/util/whitespace.sh b/util/whitespace.sh
index f661aa3..325683e 100755
--- a/util/whitespace.sh
+++ b/util/whitespace.sh
@@ -1,3 +1,3 @@
#!/bin/sh
-find . \( -iname "*.h" -o -iname "*.c" \) -exec sed -i .bak "s/[ ]*$//" {} \;
+# Strip trailing spaces in place; the attached -i.bak form is accepted by both GNU and BSD sed
+find . \( -iname "*.h" -o -iname "*.c" -o -iname "*.md" \) -exec sed -i.bak "s/[ ]*$//" {} \;
find . -iname "*.bak" -exec rm {} \;