197 lines
4.3 KiB
C++
197 lines
4.3 KiB
C++
|
|
#include "text.h"
|
||
|
|
|
||
|
|
/*
 * Counts the codepoints in a zero-terminated UTF-8 string.
 *
 * Every byte that is not a continuation byte (10xxxxxx) is counted as the
 * start of one codepoint; the sequences themselves are not validated.
 *
 * s - zero-terminated string to scan
 *
 * Returns the number of codepoint start bytes found before the terminator.
 */
u64 utf8_codepoint_count(const char *s)
{
    u64 total = 0;
    for(const char *cursor = s; *cursor != 0; cursor++)
    {
        total += utf8_is_codepoint_start(cursor) ? 1 : 0;
    }
    return total;
}
|
||
|
|
|
||
|
|
/*
 * Tells whether the byte pointed to by s can start a codepoint.
 *
 * Only continuation bytes (bit pattern 10xxxxxx) are rejected; every other
 * byte, including the zero terminator, is reported as a start byte.
 *
 * s - pointer to the byte to inspect (only *s is read)
 */
bool utf8_is_codepoint_start(const char *s)
{
    const unsigned char byte = (unsigned char)*s;
    const unsigned char top_two_bits = byte >> 6;

    // 0b10 in the two highest bits marks a continuation byte
    return top_two_bits != 0b10;
}
|
||
|
|
|
||
|
|
/*
 * Returns the length in bytes of the UTF-8 sequence announced by the lead
 * byte *s. Only *s is inspected; the following bytes are not checked.
 *
 * The length is encoded in the number of leading 1 bits of the lead byte:
 *   0xxxxxxx -> 1 byte      110xxxxx -> 2 bytes
 *   1110xxxx -> 3 bytes     11110xxx -> 4 bytes
 *
 * Returns 0 when *s is a continuation byte (10xxxxxx, it belongs to a
 * previous codepoint) or has five or more leading 1 bits (not a lead byte).
 */
u32 utf8_codepoint_bytes(const char *s)
{
    const unsigned char lead = (unsigned char)*s;

    // Count the run of 1 bits starting at the most significant bit
    u32 leading_ones = 0;
    for(unsigned char mask = 0b10000000; mask != 0 && (lead & mask) != 0; mask >>= 1)
    {
        leading_ones++;
    }

    switch(leading_ones)
    {
        case 0:
            return 1; // Plain ASCII byte
        case 2:
            return 2;
        case 3:
            return 3;
        case 4:
            return 4;
        default:
            // 1 leading bit: continuation byte. 5 or more: invalid lead byte.
            return 0;
    }
}
|
||
|
|
|
||
|
|
/* If bytes_read returns 0, we either reached the end of the string or there was a decoding error */
|
||
|
|
utf8_codepoint utf8_extract_codepoint(const char *s, u64 current_index, u32 *bytes_read)
|
||
|
|
{
|
||
|
|
s += current_index;
|
||
|
|
// UTF8:
|
||
|
|
// First byte: (0xxxxxxx = 1 byte, 110xxxxx = 2 byte, 1110xxxx = 3 byte, 11110xxx = 4 byte)
|
||
|
|
// Next bytes: 10xxxxxx
|
||
|
|
// To get a Codepoint: concatenate all the xxxx
|
||
|
|
utf8_codepoint codepoint = 0;
|
||
|
|
*bytes_read = 0;
|
||
|
|
u8 next_bytes = 0;
|
||
|
|
|
||
|
|
if(!utf8_is_codepoint_start(s))
|
||
|
|
{
|
||
|
|
// Error: This byte belongs to a previous codepoint
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
if((*s & 0b10000000) == 0b00000000) //1 byte codepoint
|
||
|
|
{
|
||
|
|
codepoint = *s;
|
||
|
|
next_bytes = 0;
|
||
|
|
}
|
||
|
|
else if((*s & 0b11100000) == 0b11000000) //2 bytes codepoint
|
||
|
|
{
|
||
|
|
codepoint = (*s & 0b00011111);
|
||
|
|
next_bytes = 1;
|
||
|
|
}
|
||
|
|
else if((*s & 0b11110000) == 0b11100000) //3 bytes codepoint
|
||
|
|
{
|
||
|
|
codepoint = (*s & 0b00001111);
|
||
|
|
next_bytes = 2;
|
||
|
|
}
|
||
|
|
else if((*s & 0b11111000) == 0b11110000) //4 bytes codepoint
|
||
|
|
{
|
||
|
|
codepoint = (*s & 0b00000111);
|
||
|
|
next_bytes = 3;
|
||
|
|
}
|
||
|
|
|
||
|
|
for(u8 i = 0; i < next_bytes; i++)
|
||
|
|
{
|
||
|
|
s++;
|
||
|
|
if(*s == 0)
|
||
|
|
{
|
||
|
|
// Error: End of string reached before completing codepoint
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
if((*s & 0b11000000) != 0b10000000)
|
||
|
|
{
|
||
|
|
// Error: Byte prefix does not match with the expected one. Broken codepoint
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
codepoint = codepoint << 6;
|
||
|
|
codepoint |= (*s & 0b00111111);
|
||
|
|
}
|
||
|
|
*bytes_read = next_bytes + 1;
|
||
|
|
|
||
|
|
return codepoint;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
 * Returns how many bytes to advance from s[current_index] to reach the next
 * codepoint start byte, or the zero terminator if no start byte follows.
 *
 * s             - zero-terminated UTF-8 string
 * current_index - byte offset to start from; must not exceed the string length
 *
 * Returns 0 when s[current_index] is already the zero terminator, so the
 * caller never moves past the end of the string. Otherwise the result is at
 * least 1. The sequences themselves are not validated.
 */
u32 utf8_bytes_to_next_valid_codepoint(const char *s, u64 current_index)
{
    s += current_index;

    if(*s == 0)
    {
        // Already at the end: looking at s[1] would read out of bounds
        return 0;
    }

    u64 bytes = 1;
    // Skip continuation bytes until a start byte or the terminator is found
    while(*(s + bytes) && !utf8_is_codepoint_start(s + bytes))
    {
        bytes++;
    }
    return bytes;
}
|
||
|
|
|
||
|
|
/*
 * Returns how many bytes to step back from s[current_index] to reach the
 * closest preceding codepoint start byte.
 *
 * s             - UTF-8 string
 * current_index - byte offset to start from
 *
 * The search never goes before s[0]: if no start byte is found, the result
 * is current_index. Returns 0 when current_index is 0.
 */
u32 utf8_bytes_to_prev_valid_codepoint(const char *s, u64 current_index)
{
    const char *cursor = s + current_index;
    u64 walked = 0;

    // At most current_index bytes exist before the starting position
    for(u64 remaining = current_index; remaining > 0; remaining--)
    {
        cursor--;
        walked++;
        if(utf8_is_codepoint_start(cursor))
        {
            break;
        }
    }
    return walked;
}
|
||
|
|
|
||
|
|
|
||
|
|
/*
 * Decodes a zero-terminated UTF-8 string into an array of codepoints.
 *
 * s           - zero-terminated UTF-8 input
 * bytes_read  - out, may be a null pointer: number of input bytes consumed.
 *               Set to 0 when a decoding error stops the conversion.
 * result      - destination array for the decoded codepoints
 * result_size - capacity of result, in codepoints
 *
 * Decoding stops at the end of the string, when result is full, or at the
 * first sequence that utf8_extract_codepoint reports as undecodable.
 *
 * Returns the number of codepoints stored in result.
 */
u64 utf8_from_string(const char *s, u64 *bytes_read, utf8_codepoint *result, u64 result_size)
{
    u64 decoded = 0;
    // Accumulate locally and write through the pointer once at the end
    u64 consumed = 0;

    while(*s && decoded < result_size)
    {
        u32 read = 0;
        result[decoded] = utf8_extract_codepoint(s, 0, &read);
        if(read == 0)
        {
            // Decoding error: signal it through a zero byte count
            consumed = 0;
            break;
        }

        s += read;
        consumed += read;
        decoded++;
    }

    // A null bytes_read is tolerated: callers that do not need the count
    // may omit it
    if(bytes_read)
    {
        *bytes_read = consumed;
    }

    return decoded;
}
|
||
|
|
|
||
|
|
/*
 * Encodes an array of codepoints as a zero-terminated UTF-8 string.
 *
 * codepoints  - codepoints to encode
 * count       - number of entries in codepoints
 * result      - destination buffer
 * result_size - capacity of result in bytes, including the zero terminator
 *
 * Encoding stops at the first codepoint that does not fit entirely in the
 * remaining space, or at the first codepoint above 0x1FFFFF (not encodable
 * in 4 bytes). A codepoint is never written partially. Surrogates and values
 * between 0x110000 and 0x1FFFFF are encoded as-is, not rejected.
 *
 * Returns the number of codepoints encoded. result is always zero-terminated
 * unless result_size is 0, in which case nothing is written and 0 is returned.
 */
u64 utf8_to_string(utf8_codepoint *codepoints, u64 count, char *result, u64 result_size)
{
    if(result_size == 0)
    {
        // No room even for the terminator; decrementing would wrap around
        return 0;
    }

    result_size--; // Reserve space for zero-terminator
    u64 i = 0;
    u64 result_i = 0;
    for(i = 0; i < count; i++)
    {
        utf8_codepoint cp = codepoints[i];
        // result_i never exceeds result_size, so this cannot wrap
        u64 space_left = result_size - result_i;

        if((cp & 0xFFFFFF80) == 0) // 1 byte
        {
            if(space_left < 1) // Not enough space left
                break;

            result[result_i++] = (char)(cp & 0b01111111);
        }
        else if((cp & 0xFFFFF800) == 0) // 2 bytes
        {
            if(space_left < 2) // Not enough space left
                break;

            result[result_i++] = (char)(0b11000000 | ((cp >> 6) & 0b00011111));
            result[result_i++] = (char)(0b10000000 | ((cp     ) & 0b00111111));
        }
        else if((cp & 0xFFFF0000) == 0) // 3 bytes
        {
            if(space_left < 3) // Not enough space left
                break;

            result[result_i++] = (char)(0b11100000 | ((cp >> 12) & 0b00001111));
            result[result_i++] = (char)(0b10000000 | ((cp >>  6) & 0b00111111));
            result[result_i++] = (char)(0b10000000 | ((cp      ) & 0b00111111));
        }
        else if((cp & 0xFFE00000) == 0) // 4 bytes
        {
            if(space_left < 4) // Not enough space left
                break;

            result[result_i++] = (char)(0b11110000 | ((cp >> 18) & 0b00000111));
            result[result_i++] = (char)(0b10000000 | ((cp >> 12) & 0b00111111));
            result[result_i++] = (char)(0b10000000 | ((cp >>  6) & 0b00111111));
            result[result_i++] = (char)(0b10000000 | ((cp      ) & 0b00111111));
        }
        else
        {
            // Invalid codepoint
            break;
        }
    }

    // result_i <= result_size here, so the reserved byte is still free
    result[result_i] = 0;
    return i;
}
|