Add Unicode tests, fix whitespace, fix docs

Added Unicode tests (and fixed a few bugs), changed the whitespace script to
process markdown files, and added a new guide to showcase how to work with
Unicode.
This commit is contained in:
2025-04-06 14:11:38 +03:00
parent 6aee5a83aa
commit dd15b42b44
11 changed files with 472 additions and 11 deletions

View File

@@ -9,6 +9,9 @@ Here is a short list of implemented features:
- Abstraction over input/output - Abstraction over input/output
- Basic data structures and algorithms (hashmap, queue, heaps, partitions) - Basic data structures and algorithms (hashmap, queue, heaps, partitions)
- Geometric primitives (vectors, matrices, quaternions, rays, boxes) - Geometric primitives (vectors, matrices, quaternions, rays, boxes)
- Thread support functions and structures (thread, mutex, cv, atomics, etc.)
- Unicode support functions (BMP, UTF-8, UTF-16, UTF-32)
- String conversion from/to numbers
For more information about currently implemented and planned features check out For more information about currently implemented and planned features check out
[this page](doc/Features.md) [this page](doc/Features.md)

View File

@@ -1,3 +1,6 @@
# PakReader # PakReader
add_executable(PakReader PakReader.c) add_executable(PakReader PakReader.c)
target_link_libraries(PakReader BHLib) target_link_libraries(PakReader BHLib)
add_executable(Utf8Test Utf8Test.c)
target_link_libraries(Utf8Test BHLib)

BIN
doc/Examples/UTF-8-test.txt Normal file

Binary file not shown.

66
doc/Examples/Utf8Test.c Normal file
View File

@@ -0,0 +1,66 @@
#include <BH/IO.h>
#include <BH/String.h>
#include <stdlib.h>
#include <stdio.h>
/*
 * Write the command-line synopsis to standard output and terminate the
 * process with exit status 1. This function does not return.
 */
void printUsage(void)
{
    fputs("Utf8Test <input> <output>\n", stdout);
    exit(1);
}
/*
 * Entry point: decodes the file named by argv[1] as UTF-8 and re-encodes it
 * into the file named by argv[2]. Every sequence the decoder reports as
 * invalid (unit == -1) is written as U+FFFD, and one more U+FFFD is written
 * if the input ends in the middle of a sequence.
 *
 * Returns 0 on success and -1 if either file cannot be created or opened.
 * When fewer than two file arguments are given, the usage text is printed
 * and the process exits with status 1.
 */
int main(int argc, char **argv)
{
    BH_IO *inFile, *outFile;
    char inBuffer[8], outBuffer[8];
    uint32_t unit;
    size_t i, inSize, outSize;

    /* Both argv[1] and argv[2] are used below, so argc must be at least 3 */
    if (argc < 3)
        printUsage();

    inFile = BH_FileNew(argv[1]);
    outFile = BH_FileNew(argv[2]);

    /* Evaluation order matters: the output is only opened (and truncated)
       once the input has been opened successfully */
    if (!inFile || !outFile ||
        BH_IOOpen(inFile, BH_IO_READ | BH_IO_EXIST) ||
        BH_IOOpen(outFile, BH_IO_WRITE | BH_IO_TRUNCATE))
    {
        /* Release whatever was created so the failure path does not leak */
        if (inFile)
            BH_IOFree(inFile);
        if (outFile)
            BH_IOFree(outFile);
        return -1;
    }

    inSize = 0;
    while (!(BH_IOFlags(inFile) & BH_IO_FLAG_EOF))
    {
        /* Read one byte and try to decode */
        if (!inSize || !(outSize = BH_UnicodeDecodeUtf8(inBuffer, inSize, &unit)))
        {
            /* Start from a known count in case the read stores nothing */
            outSize = 0;

            /* NOTE(review): assumes a non-zero return means failure, the
               same convention BH_IOOpen is used with above - confirm */
            if (BH_IORead(inFile, inBuffer + inSize, 1, &outSize))
                break;

            inSize += outSize;
            continue;
        }

        /* Drop the bytes the decoder consumed from the front of the buffer */
        for (i = 0; i < inSize - outSize; i++)
            inBuffer[i] = inBuffer[i + outSize];
        inSize -= outSize;

        /* Replace an invalid unit with U+FFFD and write it to the output */
        if (unit == (uint32_t)-1)
            unit = 0xFFFD;

        outSize = BH_UnicodeEncodeUtf8(unit, outBuffer);
        BH_IOWrite(outFile, outBuffer, outSize, NULL);
    }

    /* Incomplete UTF-8 sequence left in the buffer at end of input */
    if (inSize)
    {
        outSize = BH_UnicodeEncodeUtf8(0xFFFD, outBuffer);
        BH_IOWrite(outFile, outBuffer, outSize, NULL);
    }

    BH_IOFree(inFile);
    BH_IOFree(outFile);
    return 0;
}

View File

@@ -11,15 +11,15 @@ Currently implemented features:
- Intersection calculation (ray, boxes, segments, lines, planes, triangles) - Intersection calculation (ray, boxes, segments, lines, planes, triangles)
- Unit testing library (for internal usage) - Unit testing library (for internal usage)
- Command-line interface utilities - Command-line interface utilities
- Thread support (thread, mutex, cv, atomics, etc.)
- Unicode support functions (BMP, UTF-8, UTF-16, UTF-32)
- String conversion functions from/to numbers
## Planned features ## Planned features
Currently planned features: Currently planned features:
- Thread support (thread, mutex, cv, atomics, etc.)
- Image loading/processing/saving support - Image loading/processing/saving support
- Polygon rasterization (possibly canvas) - Polygon rasterization (possibly canvas)
- UTF8 string support (BMP handling)
- Font rendering - Font rendering
- GUI (Windows GUI, X11) - GUI (Windows GUI, X11)
- Deflate/Inflate implementation - Deflate/Inflate implementation

View File

@@ -4,3 +4,5 @@ For the time being there is only one HowTo guide:
- [Writing PACK reader utility](HowTo/PakReader.md) which covers the basics of - [Writing PACK reader utility](HowTo/PakReader.md) which covers the basics of
using IO, Args and Utils modules. using IO, Args and Utils modules.
- [Basic UTF-8 to UTF-8 transcoder](HowTo/Utf8Test.md) which covers the basics
of using IO and String modules.

153
doc/HowTo/Utf8Test.md Normal file
View File

@@ -0,0 +1,153 @@
# HowTo: Transcoding UTF-8 to UTF-8
## Prerequisites
We want to implement a simple command-line utility that can transcode a UTF-8
file into a UTF-8 file (in other words, replace any invalid UTF-8 sequences).
To do this we would run the following command:
```sh
./Utf8Test UTF-8-test.txt UTF-8-out.txt
```
## Includes
To implement this utility, we are going to need to include the following headers:
- `BH/IO.h` to work with files (or input/output devices)
- `BH/String.h` to work with UTF-8 sequences
## Working with Files
Working with files in BHLib is based around the IO device (called `BH_IO`).
Firstly, you need to create an IO device with the `BH_FileNew` function.
Secondly, you need to open the IO device with the `BH_IOOpen` function. While
opening the IO device, you can specify in which mode it will work: reading
(`BH_IO_READ`) or writing (`BH_IO_WRITE`). Additionally, you can specify whether
the IO device (in our case, the file) must already exist before opening
(`BH_IO_EXIST`), should be truncated before opening (`BH_IO_TRUNCATE`), should
be created (`BH_IO_CREATE`), or should be opened in append mode (`BH_IO_APPEND`).
Here is an example for opening an existing file in read-only mode:
```c
BH_IO *io = BH_FileNew("coolfile.dat");
/* A non-zero result from BH_IOOpen is treated as "could not open" */
if (BH_IOOpen(io, BH_IO_READ | BH_IO_EXIST))
{
    printf("Can't open file 'coolfile.dat'\n");
    /* Release the device before bailing out */
    BH_IOFree(io);
    return -1;
}
```
## Working with UTF-8
Reading UTF-8/UTF-16/UTF-32 is based around a simple loop:
1. Read bytes from input (IO or memory) to some buffer.
2. Call `BH_UnicodeDecodeUtf*`. If the return value is 0, we don't have enough data, so go to step 1. Otherwise, remove the consumed bytes from the front of the buffer.
3. If the decoded code point equals -1, we encountered an error, so replace it with the code point 0xFFFD.
Writing UTF-8/UTF-16/UTF-32 is straightforward:
1. Call `BH_UnicodeEncodeUtf*`. If the return value is 0, the code point can't be encoded (it is either a surrogate or outside the valid range).
2. Write the data (to IO or memory).
Putting both together for UTF-8, the decode call is `BH_UnicodeDecodeUtf8(inBuffer, inSize, &unit)` and the whole loop looks like this:
```c
while (...)
{
/* Try to decode what is buffered; if the buffer is empty or the decoder
   returns 0 (not enough data), read one more byte and try again */
if (!inSize || !(outSize = BH_UnicodeDecodeUtf8(inBuffer, inSize, &unit)))
{
BH_IORead(inFile, inBuffer + inSize, 1, &outSize);
inSize += outSize;
continue;
}
/* Drop the outSize bytes the decoder consumed from the front of the buffer */
for (i = 0; i < inSize - outSize; i++)
inBuffer[i] = inBuffer[i + outSize];
inSize -= outSize;
/* A unit of -1 marks a decoding error: substitute 0xFFFD, then encode the
   unit and write it to the output */
if (unit == -1)
unit = 0xFFFD;
outSize = BH_UnicodeEncodeUtf8(unit, outBuffer);
BH_IOWrite(outFile, outBuffer, outSize, NULL);
}
```
## Putting Everything Together
```c
#include <BH/IO.h>
#include <BH/String.h>
#include <stdlib.h>
#include <stdio.h>
/*
 * Write the command-line synopsis to standard output and terminate the
 * process with exit status 1. This function does not return.
 */
void printUsage(void)
{
    fputs("Utf8Test <input> <output>\n", stdout);
    exit(1);
}
/*
 * Entry point: decodes the file named by argv[1] as UTF-8 and re-encodes it
 * into the file named by argv[2]. Every sequence the decoder reports as
 * invalid (unit == -1) is written as U+FFFD, and one more U+FFFD is written
 * if the input ends in the middle of a sequence.
 *
 * Returns 0 on success and -1 if either file cannot be created or opened.
 * When fewer than two file arguments are given, the usage text is printed
 * and the process exits with status 1.
 */
int main(int argc, char **argv)
{
    BH_IO *inFile, *outFile;
    char inBuffer[8], outBuffer[8];
    uint32_t unit;
    size_t i, inSize, outSize;

    /* Both argv[1] and argv[2] are used below, so argc must be at least 3 */
    if (argc < 3)
        printUsage();

    inFile = BH_FileNew(argv[1]);
    outFile = BH_FileNew(argv[2]);

    /* Evaluation order matters: the output is only opened (and truncated)
       once the input has been opened successfully */
    if (!inFile || !outFile ||
        BH_IOOpen(inFile, BH_IO_READ | BH_IO_EXIST) ||
        BH_IOOpen(outFile, BH_IO_WRITE | BH_IO_TRUNCATE))
    {
        /* Release whatever was created so the failure path does not leak */
        if (inFile)
            BH_IOFree(inFile);
        if (outFile)
            BH_IOFree(outFile);
        return -1;
    }

    inSize = 0;
    while (!(BH_IOFlags(inFile) & BH_IO_FLAG_EOF))
    {
        /* Read one byte and try to decode */
        if (!inSize || !(outSize = BH_UnicodeDecodeUtf8(inBuffer, inSize, &unit)))
        {
            /* Start from a known count in case the read stores nothing */
            outSize = 0;

            /* NOTE(review): assumes a non-zero return means failure, the
               same convention BH_IOOpen is used with above - confirm */
            if (BH_IORead(inFile, inBuffer + inSize, 1, &outSize))
                break;

            inSize += outSize;
            continue;
        }

        /* Drop the bytes the decoder consumed from the front of the buffer */
        for (i = 0; i < inSize - outSize; i++)
            inBuffer[i] = inBuffer[i + outSize];
        inSize -= outSize;

        /* Replace an invalid unit with U+FFFD and write it to the output */
        if (unit == (uint32_t)-1)
            unit = 0xFFFD;

        outSize = BH_UnicodeEncodeUtf8(unit, outBuffer);
        BH_IOWrite(outFile, outBuffer, outSize, NULL);
    }

    /* Incomplete UTF-8 sequence left in the buffer at end of input */
    if (inSize)
    {
        outSize = BH_UnicodeEncodeUtf8(0xFFFD, outBuffer);
        BH_IOWrite(outFile, outBuffer, outSize, NULL);
    }

    BH_IOFree(inFile);
    BH_IOFree(outFile);
    return 0;
}
```

View File

@@ -92,6 +92,9 @@ size_t BH_UnicodeEncodeUtf8(uint32_t unit,
{ {
size_t result; size_t result;
if (unit > 0xD7FF && unit < 0xE000)
return 0;
result = 0; result = 0;
if (unit < 0x80ul) if (unit < 0x80ul)
{ {
@@ -199,6 +202,9 @@ size_t BH_UnicodeDecodeUtf16BE(const char *string,
size_t BH_UnicodeEncodeUtf16LE(uint32_t unit, size_t BH_UnicodeEncodeUtf16LE(uint32_t unit,
char *string) char *string)
{ {
if (unit > 0xD7FF && unit < 0xE000)
return 0;
if (unit < 0x10000) if (unit < 0x10000)
{ {
BH_Write16LEu(string, unit); BH_Write16LEu(string, unit);
@@ -218,6 +224,9 @@ size_t BH_UnicodeEncodeUtf16LE(uint32_t unit,
size_t BH_UnicodeEncodeUtf16BE(uint32_t unit, size_t BH_UnicodeEncodeUtf16BE(uint32_t unit,
char *string) char *string)
{ {
if (unit > 0xD7FF && unit < 0xE000)
return 0;
if (unit < 0x10000) if (unit < 0x10000)
{ {
BH_Write16BEu(string, unit); BH_Write16BEu(string, unit);
@@ -261,7 +270,7 @@ size_t BH_UnicodeDecodeUtf32BE(const char *string,
size_t BH_UnicodeEncodeUtf32LE(uint32_t unit, size_t BH_UnicodeEncodeUtf32LE(uint32_t unit,
char *string) char *string)
{ {
if (unit > 0x1FFFFF) if (unit > 0x1FFFFF || (unit > 0xD7FF && unit < 0xE000))
return 0; return 0;
BH_Write32LEu(string, unit); BH_Write32LEu(string, unit);
@@ -272,7 +281,7 @@ size_t BH_UnicodeEncodeUtf32LE(uint32_t unit,
size_t BH_UnicodeEncodeUtf32BE(uint32_t unit, size_t BH_UnicodeEncodeUtf32BE(uint32_t unit,
char *string) char *string)
{ {
if (unit > 0x1FFFFF) if (unit > 0x1FFFFF || (unit > 0xD7FF && unit < 0xE000))
return 0; return 0;
BH_Write32BEu(string, unit); BH_Write32BEu(string, unit);

225
test/src/TestUnicode.c Normal file
View File

@@ -0,0 +1,225 @@
#include <BH/Unit.h>
#include <BH/String.h>
#include <BH/IO.h>
#include <stdlib.h>
/*
 * One decoder test vector.
 *
 * input  - bytes handed to the decoder; the tables initialise it from string
 *          literals and never write through it, hence const. A NULL input
 *          marks the terminating entry of a table.
 * size   - number of bytes of input made available to the decoder
 * read   - byte count the decoder is expected to report; (size_t)-1 is used
 *          by the tables to accept any non-zero count
 * result - expected decoded unit; (uint32_t)-1 stands for a decoding error
 */
struct TestCase
{
    const char *input;
    size_t size;
    size_t read;
    uint32_t result;
};
/*
 * Case-mapping round trip over every value below 0x110000: a value changed by
 * BH_UnicodeLower must come back through BH_UnicodeUpper, and any other value
 * must survive upper-then-lower. A short list of known pairs is exempt.
 */
BH_UNIT_TEST(Case)
{
    size_t i, j;

    for (i = 0; i < 0x110000; i++)
    {
        j = BH_UnicodeLower(i);
        if (j != i)
        {
            /* Lowering changed the value: raise it back */
            j = BH_UnicodeUpper(j);
        }
        else
        {
            /* Lowering was a no-op: go up first, then back down */
            j = BH_UnicodeUpper(i);
            j = BH_UnicodeLower(j);
        }

        /* Pairs that are accepted even though they do not round-trip */
        if ((i == 0x130 && j == 0x49) ||
            (i == 0x131 && j == 0x69) ||
            (i == 0x1C5 && j == 0x1C4) ||
            (i == 0x1C8 && j == 0x1C7) ||
            (i == 0x1CB && j == 0x1CA))
        {
            continue;
        }

        BH_VERIFY(i == j);
    }

    return 0;
}
/*
 * UTF-8 codec checks: an encode/decode round trip over every value below
 * 0x110000, followed by a table of hand-written decoder inputs.
 */
BH_UNIT_TEST(Utf8)
{
    /* Each entry: {bytes, bytes available, expected read, expected unit}.
       A read of -1 accepts any non-zero count; the NULL entry ends the table. */
    const struct TestCase cases[] =
    {
        /* Normal cases */
        {"\x00", 1, 1, 0},
        {"\xC2\x80", 2, 2, 0x80},
        {"\xE0\xA0\x80", 3, 3, 0x800},
        {"\xF0\x90\x80\x80", 4, 4, 0x10000},
        {"\x7F", 1, 1, 0x7F},
        {"\xDF\xBF", 2, 2, 0x7FF},
        {"\xEF\xBF\xBF", 3, 3, 0xFFFF},
        {"\xED\x9F\xBF", 3, 3, 0xD7FF},
        {"\xEE\x80\x80", 3, 3, 0xE000},
        {"\xEF\xBF\xBD", 3, 3, 0xFFFD},
        {"H", 1, 1, 'H'},
        {"\xCE\xBA", 2, 2, 0x3BA},

        /* Lonely start characters */
        {"\xC0 ", 2, 1, -1},
        {"\xC1 ", 2, 1, -1},
        {"\xC2 ", 2, 1, -1},
        {"\xC3 ", 2, 1, -1},
        {"\xC4 ", 2, 1, -1},

        /* Malformed sequences */
        {"\x80", 1, -1, -1},
        {"\xBF", 1, -1, -1},
        {"\xFE", 1, -1, -1},
        {"\xFF", 1, -1, -1},

        /* Overlong sequences */
        {"\xC0\xAF", 2, 1, -1},

        /* UTF-16 surrogate pairs */
        {"\xED\xA0\x80", 3, -1, -1},
        {"\xED\xAE\x80", 3, -1, -1},
        {"\xED\xB0\x80", 3, -1, -1},
        {"\xED\xBF\xBF", 3, -1, -1},
        {NULL, 0, 0, 0}
    };
    const struct TestCase *current;
    char buffer[8];
    uint32_t unit;
    size_t i, outSize, inSize;

    /* Round trip: surrogates must be refused, everything else must survive */
    for (i = 0; i < 0x110000; i++)
    {
        inSize = BH_UnicodeEncodeUtf8(i, buffer);

        if (i < 0xD800 || i > 0xDFFF)
        {
            BH_VERIFY(inSize > 0);
            outSize = BH_UnicodeDecodeUtf8(buffer, inSize, &unit);
            BH_VERIFY(inSize == outSize);
            BH_VERIFY(unit == i);
        }
        else
        {
            BH_VERIFY(inSize == 0);
        }
    }

    /* Hand-written decoder inputs */
    for (current = cases; current->input != NULL; current++)
    {
        i = BH_UnicodeDecodeUtf8(current->input, current->size, &unit);

        /* Wildcard read count: fold any non-zero result into -1 */
        if (i != 0 && current->read == (size_t)-1)
            i = -1;

        if (i == current->read && unit == current->result)
            continue;

        /* Report the failing table index before the checks trip */
        printf("\tcase %d\n", (int)(current - cases));
        BH_VERIFY(i == current->read);
        BH_VERIFY(unit == current->result);
    }

    return 0;
}
/*
 * UTF-16 codec check: round-trips every value below 0x110000 through the
 * little-endian and big-endian encoder/decoder pairs. Values in the surrogate
 * range (0xD800-0xDFFF) must be refused by both encoders.
 */
BH_UNIT_TEST(Utf16)
{
    char buffer[8];
    uint32_t unit;
    size_t i, outSize, inSize;

    /* Encode and decode every code point in the Unicode range */
    for (i = 0; i < 0x110000; i++)
    {
        /* Surrogates: both byte orders are checked here, before skipping the
           round trip, so that neither encoder's rejection goes untested */
        if (i > 0xD7FF && i < 0xE000)
        {
            inSize = BH_UnicodeEncodeUtf16LE(i, buffer);
            BH_VERIFY(inSize == 0);

            inSize = BH_UnicodeEncodeUtf16BE(i, buffer);
            BH_VERIFY(inSize == 0);
            continue;
        }

        /* Check for little endian */
        inSize = BH_UnicodeEncodeUtf16LE(i, buffer);
        BH_VERIFY(inSize > 0);
        outSize = BH_UnicodeDecodeUtf16LE(buffer, inSize, &unit);
        BH_VERIFY(inSize == outSize);
        BH_VERIFY(unit == i);

        /* Check for big endian */
        inSize = BH_UnicodeEncodeUtf16BE(i, buffer);
        BH_VERIFY(inSize > 0);
        outSize = BH_UnicodeDecodeUtf16BE(buffer, inSize, &unit);
        BH_VERIFY(inSize == outSize);
        BH_VERIFY(unit == i);
    }

    return 0;
}
/*
 * UTF-32 codec check: round-trips every value below 0x110000 through the
 * little-endian and big-endian encoder/decoder pairs. Values in the surrogate
 * range (0xD800-0xDFFF) must be refused by both encoders.
 */
BH_UNIT_TEST(Utf32)
{
    char buffer[8];
    uint32_t unit;
    size_t i, outSize, inSize;

    /* Encode and decode every code point in the Unicode range */
    for (i = 0; i < 0x110000; i++)
    {
        /* Surrogates: both byte orders are checked here, before skipping the
           round trip, so that neither encoder's rejection goes untested */
        if (i > 0xD7FF && i < 0xE000)
        {
            inSize = BH_UnicodeEncodeUtf32LE(i, buffer);
            BH_VERIFY(inSize == 0);

            inSize = BH_UnicodeEncodeUtf32BE(i, buffer);
            BH_VERIFY(inSize == 0);
            continue;
        }

        /* Check for little endian */
        inSize = BH_UnicodeEncodeUtf32LE(i, buffer);
        BH_VERIFY(inSize > 0);
        outSize = BH_UnicodeDecodeUtf32LE(buffer, inSize, &unit);
        BH_VERIFY(inSize == outSize);
        BH_VERIFY(unit == i);

        /* Check for big endian */
        inSize = BH_UnicodeEncodeUtf32BE(i, buffer);
        BH_VERIFY(inSize > 0);
        outSize = BH_UnicodeDecodeUtf32BE(buffer, inSize, &unit);
        BH_VERIFY(inSize == outSize);
        BH_VERIFY(unit == i);
    }

    return 0;
}
/*
 * Test program entry point. Passes the Case, Utf8, Utf16 and Utf32 tests to
 * BH_UNIT_ADD in that order and returns the value of BH_UnitRun() as the
 * process exit status. The command-line arguments are not used.
 */
int main(int argc, char **argv)
{
/* Arguments are intentionally ignored */
BH_UNUSED(argc);
BH_UNUSED(argv);
/* NOTE(review): presumably BH_UNIT_ADD registers a test for BH_UnitRun to
   execute - confirm against BH/Unit.h */
BH_UNIT_ADD(Case);
BH_UNIT_ADD(Utf8);
BH_UNIT_ADD(Utf16);
BH_UNIT_ADD(Utf32);
return BH_UnitRun();
}

View File

@@ -1,3 +1,3 @@
#!/bin/sh #!/bin/sh
find . \( -iname "*.h" -o -iname "*.c" \) -exec sed -i .bak "s/[ ]*$//" {} \; find . \( -iname "*.h" -o -iname "*.c" -o -iname "*.md" \) -exec sed -i .bak "s/[ ]*$//" {} \;
find . -iname "*.bak" -exec rm {} \; find . -iname "*.bak" -exec rm {} \;