diff options
| -rw-r--r-- | README.md | 3 | ||||
| -rw-r--r-- | doc/Examples/CMakeLists.txt | 5 | ||||
| -rw-r--r-- | doc/Examples/UTF-8-test.txt | bin | 0 -> 22781 bytes | |||
| -rw-r--r-- | doc/Examples/Utf8Test.c | 66 | ||||
| -rw-r--r-- | doc/Features.md | 6 | ||||
| -rw-r--r-- | doc/HowTo.md | 2 | ||||
| -rw-r--r-- | doc/HowTo/PakReader.md | 8 | ||||
| -rw-r--r-- | doc/HowTo/Utf8Test.md | 153 | ||||
| -rw-r--r-- | src/String/Unicode.c | 13 | ||||
| -rw-r--r-- | test/src/TestUnicode.c | 225 | ||||
| -rwxr-xr-x | util/whitespace.sh | 2 |
11 files changed, 472 insertions, 11 deletions
@@ -9,6 +9,9 @@ Here is a short list of implemented features: - Abstraction over input/output - Basic data structures and algorithms (hashmap, queue, heaps, partitions) - Geomtric primitives (vectors, matrices, quaternions, rays, boxes) +- Thread support functions and structures (thread, mutex, cv, atomics, etc.) +- Unicode support functions (BMP, UTF-8, UTF-16, UTF-32) +- String conversion from/to numbers For more information about currently implemented and planned features checkout [this page](doc/Features.md) diff --git a/doc/Examples/CMakeLists.txt b/doc/Examples/CMakeLists.txt index 7adcf05..5a31dc3 100644 --- a/doc/Examples/CMakeLists.txt +++ b/doc/Examples/CMakeLists.txt @@ -1,3 +1,6 @@ # PakReader add_executable(PakReader PakReader.c) -target_link_libraries(PakReader BHLib)
\ No newline at end of file +target_link_libraries(PakReader BHLib) + +add_executable(Utf8Test Utf8Test.c) +target_link_libraries(Utf8Test BHLib)
\ No newline at end of file diff --git a/doc/Examples/UTF-8-test.txt b/doc/Examples/UTF-8-test.txt Binary files differnew file mode 100644 index 0000000..a5b5d50 --- /dev/null +++ b/doc/Examples/UTF-8-test.txt diff --git a/doc/Examples/Utf8Test.c b/doc/Examples/Utf8Test.c new file mode 100644 index 0000000..e6d9e56 --- /dev/null +++ b/doc/Examples/Utf8Test.c @@ -0,0 +1,66 @@ +#include <BH/IO.h> +#include <BH/String.h> +#include <stdlib.h> +#include <stdio.h> + + +void printUsage(void) +{ + printf("Utf8Test <input> <output>\n"); + exit(1); +} + + +int main(int argc, char **argv) +{ + BH_IO *inFile, *outFile; + char inBuffer[8], outBuffer[8]; + uint32_t unit; + size_t i, inSize, outSize; + + if (argc < 2) + printUsage(); + + inFile = BH_FileNew(argv[1]); + outFile = BH_FileNew(argv[2]); + + if (!inFile || BH_IOOpen(inFile, BH_IO_READ | BH_IO_EXIST)) + return -1; + + if (!outFile || BH_IOOpen(outFile, BH_IO_WRITE | BH_IO_TRUNCATE)) + return -1; + + inSize = 0; + while (!(BH_IOFlags(inFile) & BH_IO_FLAG_EOF)) + { + /* Read one byte and try to decode */ + if (!inSize || !(outSize = BH_UnicodeDecodeUtf8(inBuffer, inSize, &unit))) + { + BH_IORead(inFile, inBuffer + inSize, 1, &outSize); + inSize += outSize; + continue; + } + + /* Remove readed amount */ + for (i = 0; i < inSize - outSize; i++) + inBuffer[i] = inBuffer[i + outSize]; + inSize -= outSize; + + /* Change unit if incorrect and write to output */ + if (unit == -1) + unit = 0xFFFD; + outSize = BH_UnicodeEncodeUtf8(unit, outBuffer); + BH_IOWrite(outFile, outBuffer, outSize, NULL); + } + + /* Incomplete UTF-8 sequence */ + if (inSize) + { + outSize = BH_UnicodeEncodeUtf8(0xFFFD, outBuffer); + BH_IOWrite(outFile, outBuffer, outSize, NULL); + } + + BH_IOFree(inFile); + BH_IOFree(outFile); + return 0; +} diff --git a/doc/Features.md b/doc/Features.md index 8dd7926..31da745 100644 --- a/doc/Features.md +++ b/doc/Features.md @@ -11,15 +11,15 @@ Currently implemented features: - Intersection calculation (ray, boxes, 
segments, lines, planes, triangles) - Unit testing library (for internal usage) - Command-line interface utilities +- Thread support (thread, mutex, cv, atomics, etc.) +- Unicode support functions (BMP, UTF-8, UTF-16, UTF-32) +- String conversion functions from/to numbers ## Planned features Currently planned features: - -- Thread support (thread, mutex, cv, atomics, etc.) - Image loading/processing/saving support - Polygon rasterization (possibly canvas) -- UTF8 string support (BMP handling) - Font rendering - GUI (Windows GUI, X11) - Deflate/Inflate implementation diff --git a/doc/HowTo.md b/doc/HowTo.md index 37dd9a6..8fcafd5 100644 --- a/doc/HowTo.md +++ b/doc/HowTo.md @@ -4,3 +4,5 @@ For the time being there is only one HowTo guide: - [Writing PACK reader utility](HowTo/PakReader.md) which covers the basics of using IO, Args and Utils modules. +- [Basic UTF-8 to UTF-8 transcoder](HowTo/Utf8Test.md) which covers the basics + of using IO and String modules. diff --git a/doc/HowTo/PakReader.md b/doc/HowTo/PakReader.md index e7071c3..4f75399 100644 --- a/doc/HowTo/PakReader.md +++ b/doc/HowTo/PakReader.md @@ -323,7 +323,7 @@ static int CopyData(BH_IO *from, if (BH_IORead(from, tmp, length, &actual) || length != actual) return BH_ERROR; - + if (BH_IOWrite(to, tmp, length, &actual) || length != actual) return BH_ERROR; } @@ -344,7 +344,7 @@ static int ProcessPack(Config *config, /* Read header and seek to begging of the file table */ if (ParseHeader(io, &header)) return BH_ERROR; - + if (BH_IOSeek(io, header.offset, BH_IO_SEEK_SET)) return BH_ERROR; @@ -362,7 +362,7 @@ static int ProcessPack(Config *config, continue; output = BH_FileNew(config->output); - if (BH_IOOpen(output, BH_IO_WRITE) || + if (BH_IOOpen(output, BH_IO_WRITE) || BH_IOSeek(io, entry.offset, BH_IO_SEEK_SET) || CopyData(io, output, entry.size)) { @@ -374,7 +374,7 @@ static int ProcessPack(Config *config, return BH_OK; } } - + if (config->list) return BH_OK; return BH_ERROR; diff --git 
a/doc/HowTo/Utf8Test.md b/doc/HowTo/Utf8Test.md new file mode 100644 index 0000000..7e82ef0 --- /dev/null +++ b/doc/HowTo/Utf8Test.md @@ -0,0 +1,153 @@ +# HowTo: Transcoding UTF-8 to UTF-8 + +## Prerequisites + +We want to implement a simple command-line utility that can transcode a UTF-8 +file into a UTF-8 file (or, in other words, replace any incorrect UTF-8 sequences). + +To do this we would run the following command: + +```sh +./Utf8Test UTF-8-test.txt UTF-8-out.txt +``` + +## Includes + +To implement this utility, we are going to need to include the following headers: + +- `BH/IO.h` to work with files (or input/output devices) +- `BH/String.h` to work with UTF-8 sequences + +## Working with Files + +Working with files in BHLib is based around the IO device (called `BH_IO`). +Firstly, you need to create an IO device with the `BH_FileNew` function. +Secondly, you need to open the IO device with the `BH_IOOpen` function. While +opening the IO device, you can specify in which mode it will work: reading +(`BH_IO_READ`) or writing (`BH_IO_WRITE`). Additionally, we can specify whether +the IO device (or in our case, the file) should exist before opening +(`BH_IO_EXIST`), be truncated before opening (`BH_IO_TRUNCATE`), be +created (`BH_IO_CREATE`), or be opened in append mode (`BH_IO_APPEND`). + +Here is an example for opening an existing file in read-only mode: + +```c +BH_IO *io = BH_FileNew("coolfile.dat"); +if (BH_IOOpen(io, BH_IO_READ | BH_IO_EXIST)) +{ + printf("Can't open file 'coolfile.dat'\n", config.file); + BH_IOFree(io); + return -1; +} +``` + +## Working with UTF-8 + +Reading UTF-8/UTF-16/UTF-32 is based around a simple loop: + +1. Read bytes from input (IO or memory) to some buffer. +2. Call `BH_UnicodeDecodeUtf*`. If the return value is 0, we don't have enough data, so go to step 1. Otherwise, remove the consumed bytes from the front of the buffer. +3. If the decoded codepoint equals -1, we encountered an error, so replace it with the code 0xFFFD. 
+ +Writing UTF-8/UTF-16/UTF-32 is straightforward: + +1. Call `BH_UnicodeEncodeUtf*`. If the return value is 0, the codepoint can't be encoded (it is either a surrogate or outside the valid range). +2. Write data (to IO or memory). + +The loop below applies these steps using `BH_UnicodeDecodeUtf8(inBuffer, inSize, &unit)`: + +```c + +while (...) +{ + /* Read one byte and try to decode */ + if (!inSize || !(outSize = BH_UnicodeDecodeUtf8(inBuffer, inSize, &unit))) + { + BH_IORead(inFile, inBuffer + inSize, 1, &outSize); + inSize += outSize; + continue; + } + + /* Remove readed amount */ + for (i = 0; i < inSize - outSize; i++) + inBuffer[i] = inBuffer[i + outSize]; + inSize -= outSize; + + /* Change unit if incorrect and write to output */ + if (unit == -1) + unit = 0xFFFD; + outSize = BH_UnicodeEncodeUtf8(unit, outBuffer); + BH_IOWrite(outFile, outBuffer, outSize, NULL); +} +``` + +## Putting Everything Together + +```c +#include <BH/IO.h> +#include <BH/String.h> +#include <stdlib.h> +#include <stdio.h> + + +void printUsage(void) +{ + printf("Utf8Test <input> <output>\n"); + exit(1); +} + + +int main(int argc, char **argv) +{ + BH_IO *inFile, *outFile; + char inBuffer[8], outBuffer[8]; + uint32_t unit; + size_t i, inSize, outSize; + + if (argc < 2) + printUsage(); + + inFile = BH_FileNew(argv[1]); + outFile = BH_FileNew(argv[2]); + + if (!inFile || BH_IOOpen(inFile, BH_IO_READ | BH_IO_EXIST)) + return -1; + + if (!outFile || BH_IOOpen(outFile, BH_IO_WRITE | BH_IO_TRUNCATE)) + return -1; + + inSize = 0; + while (!(BH_IOFlags(inFile) & BH_IO_FLAG_EOF)) + { + /* Read one byte and try to decode */ + if (!inSize || !(outSize = BH_UnicodeDecodeUtf8(inBuffer, inSize, &unit))) + { + BH_IORead(inFile, inBuffer + inSize, 1, &outSize); + inSize += outSize; + continue; + } + + /* Remove readed amount */ + for (i = 0; i < inSize - outSize; i++) + inBuffer[i] = inBuffer[i + outSize]; + inSize -= outSize; + + /* Change unit if incorrect and write to output */ + if (unit == -1) + unit = 0xFFFD; + outSize = 
BH_UnicodeEncodeUtf8(unit, outBuffer); + BH_IOWrite(outFile, outBuffer, outSize, NULL); + } + + /* Incomplete UTF-8 sequence */ + if (inSize) + { + outSize = BH_UnicodeEncodeUtf8(0xFFFD, outBuffer); + BH_IOWrite(outFile, outBuffer, outSize, NULL); + } + + BH_IOFree(inFile); + BH_IOFree(outFile); + return 0; +} +``` diff --git a/src/String/Unicode.c b/src/String/Unicode.c index 1f0eaf0..26b9670 100644 --- a/src/String/Unicode.c +++ b/src/String/Unicode.c @@ -92,6 +92,9 @@ size_t BH_UnicodeEncodeUtf8(uint32_t unit, { size_t result; + if (unit > 0xD7FF && unit < 0xE000) + return 0; + result = 0; if (unit < 0x80ul) { @@ -199,6 +202,9 @@ size_t BH_UnicodeDecodeUtf16BE(const char *string, size_t BH_UnicodeEncodeUtf16LE(uint32_t unit, char *string) { + if (unit > 0xD7FF && unit < 0xE000) + return 0; + if (unit < 0x10000) { BH_Write16LEu(string, unit); @@ -218,6 +224,9 @@ size_t BH_UnicodeEncodeUtf16LE(uint32_t unit, size_t BH_UnicodeEncodeUtf16BE(uint32_t unit, char *string) { + if (unit > 0xD7FF && unit < 0xE000) + return 0; + if (unit < 0x10000) { BH_Write16BEu(string, unit); @@ -261,7 +270,7 @@ size_t BH_UnicodeDecodeUtf32BE(const char *string, size_t BH_UnicodeEncodeUtf32LE(uint32_t unit, char *string) { - if (unit > 0x1FFFFF) + if (unit > 0x1FFFFF || (unit > 0xD7FF && unit < 0xE000)) return 0; BH_Write32LEu(string, unit); @@ -272,7 +281,7 @@ size_t BH_UnicodeEncodeUtf32LE(uint32_t unit, size_t BH_UnicodeEncodeUtf32BE(uint32_t unit, char *string) { - if (unit > 0x1FFFFF) + if (unit > 0x1FFFFF || (unit > 0xD7FF && unit < 0xE000)) return 0; BH_Write32BEu(string, unit); diff --git a/test/src/TestUnicode.c b/test/src/TestUnicode.c new file mode 100644 index 0000000..8ee23b7 --- /dev/null +++ b/test/src/TestUnicode.c @@ -0,0 +1,225 @@ +#include <BH/Unit.h> +#include <BH/String.h> +#include <BH/IO.h> +#include <stdlib.h> + + +struct TestCase +{ + char *input; + size_t size; + size_t read; + uint32_t result; +}; + + +BH_UNIT_TEST(Case) +{ + size_t i, j; + + for (i = 0; i < 
0x110000; i++) + { + j = BH_UnicodeLower(i); + if (j == i) + { + j = BH_UnicodeUpper(i); + j = BH_UnicodeLower(j); + } + else + j = BH_UnicodeUpper(j); + + /* Some exceptions */ + if (i == 0x130 && j == 0x49) + continue; + else if (i == 0x131 && j == 0x69) + continue; + else if (i == 0x1C5 && j == 0x1C4) + continue; + else if (i == 0x1C8 && j == 0x1C7) + continue; + else if (i == 0x1CB && j == 0x1CA) + continue; + + BH_VERIFY(i == j); + } + + return 0; +} + + +BH_UNIT_TEST(Utf8) +{ + const struct TestCase *current; + const struct TestCase cases[] = + { + /* Normal cases */ + {"\x00", 1, 1, 0}, + {"\xC2\x80", 2, 2, 0x80}, + {"\xE0\xA0\x80", 3, 3, 0x800}, + {"\xF0\x90\x80\x80", 4, 4, 0x10000}, + {"\x7F", 1, 1, 0x7F}, + {"\xDF\xBF", 2, 2, 0x7FF}, + {"\xEF\xBF\xBF", 3, 3, 0xFFFF}, + {"\xED\x9F\xBF", 3, 3, 0xD7FF}, + {"\xEE\x80\x80", 3, 3, 0xE000}, + {"\xEF\xBF\xBD", 3, 3, 0xFFFD}, + {"H", 1, 1, 'H'}, + {"\xCE\xBA", 2, 2, 0x3BA}, + + /* Lonely start characters */ + {"\xC0 ", 2, 1, -1}, + {"\xC1 ", 2, 1, -1}, + {"\xC2 ", 2, 1, -1}, + {"\xC3 ", 2, 1, -1}, + {"\xC4 ", 2, 1, -1}, + + /* Malformed sequences */ + {"\x80", 1, -1, -1}, + {"\xBF", 1, -1, -1}, + {"\xFE", 1, -1, -1}, + {"\xFF", 1, -1, -1}, + + /* Overlong sequences */ + {"\xC0\xAF", 2, 1, -1}, + + /* UTF-16 surrogate pairs */ + {"\xED\xA0\x80", 3, -1, -1}, + {"\xED\xAE\x80", 3, -1, -1}, + {"\xED\xB0\x80", 3, -1, -1}, + {"\xED\xBF\xBF", 3, -1, -1}, + + {NULL, 0, 0, 0} + }; + char buffer[8]; + uint32_t unit; + size_t i, outSize, inSize; + + /* Encode and decode all character in a valid UTF-8 range */ + for (i = 0; i < 0x110000; i++) + { + inSize = BH_UnicodeEncodeUtf8(i, buffer); + + /* Check for surrogate pairs */ + if (i > 0xD7FF && i < 0xE000) + { + BH_VERIFY(inSize == 0); + continue; + } + + BH_VERIFY(inSize > 0); + outSize = BH_UnicodeDecodeUtf8(buffer, inSize, &unit); + BH_VERIFY(inSize == outSize); + BH_VERIFY(unit == i); + } + + /* Test special cases */ + for (current = cases; current->input; current++) + { 
+ i = BH_UnicodeDecodeUtf8(current->input, current->size, &unit); + if (current->read == (size_t)-1 && i) + i = -1; + + if (i != current->read || unit != current->result) + { + printf("\tcase %d\n", (int)(current - cases)); + BH_VERIFY(i == current->read); + BH_VERIFY(unit == current->result); + } + } + + return 0; +} + + +BH_UNIT_TEST(Utf16) +{ + char buffer[8]; + uint32_t unit; + size_t i, outSize, inSize; + + /* Encode and decode all character in a valid UTF-8 range */ + for (i = 0; i < 0x110000; i++) + { + /* Check for little endian */ + inSize = BH_UnicodeEncodeUtf16LE(i, buffer); + if (i > 0xD7FF && i < 0xE000) + { + BH_VERIFY(inSize == 0); + continue; + } + BH_VERIFY(inSize > 0); + + outSize = BH_UnicodeDecodeUtf16LE(buffer, inSize, &unit); + BH_VERIFY(inSize == outSize); + BH_VERIFY(unit == i); + + /* Check for big endian */ + inSize = BH_UnicodeEncodeUtf16BE(i, buffer); + if (i > 0xD7FF && i < 0xE000) + { + BH_VERIFY(inSize == 0); + continue; + } + BH_VERIFY(inSize > 0); + + outSize = BH_UnicodeDecodeUtf16BE(buffer, inSize, &unit); + BH_VERIFY(inSize == outSize); + BH_VERIFY(unit == i); + } + + return 0; +} + + +BH_UNIT_TEST(Utf32) +{ + char buffer[8]; + uint32_t unit; + size_t i, outSize, inSize; + + /* Encode and decode all character in a valid UTF-8 range */ + for (i = 0; i < 0x110000; i++) + { + /* Check for little endian */ + inSize = BH_UnicodeEncodeUtf32LE(i, buffer); + if (i > 0xD7FF && i < 0xE000) + { + BH_VERIFY(inSize == 0); + continue; + } + BH_VERIFY(inSize > 0); + + outSize = BH_UnicodeDecodeUtf32LE(buffer, inSize, &unit); + BH_VERIFY(inSize == outSize); + BH_VERIFY(unit == i); + + /* Check for big endian */ + inSize = BH_UnicodeEncodeUtf32BE(i, buffer); + if (i > 0xD7FF && i < 0xE000) + { + BH_VERIFY(inSize == 0); + continue; + } + BH_VERIFY(inSize > 0); + + outSize = BH_UnicodeDecodeUtf32BE(buffer, inSize, &unit); + BH_VERIFY(inSize == outSize); + BH_VERIFY(unit == i); + } + + return 0; +} + + +int main(int argc, char **argv) +{ + 
BH_UNUSED(argc); + BH_UNUSED(argv); + + BH_UNIT_ADD(Case); + BH_UNIT_ADD(Utf8); + BH_UNIT_ADD(Utf16); + BH_UNIT_ADD(Utf32); + + return BH_UnitRun(); +} diff --git a/util/whitespace.sh b/util/whitespace.sh index f661aa3..325683e 100755 --- a/util/whitespace.sh +++ b/util/whitespace.sh @@ -1,3 +1,3 @@ #!/bin/sh -find . \( -iname "*.h" -o -iname "*.c" \) -exec sed -i .bak "s/[ ]*$//" {} \; +find . \( -iname "*.h" -o -iname "*.c" -o -iname "*.md" \) -exec sed -i .bak "s/[ ]*$//" {} \; find . -iname "*.bak" -exec rm {} \; |
