Files
Server_Monitor/code/lib/text.cpp

197 lines
4.3 KiB
C++
Raw Normal View History

2023-09-26 19:40:16 +02:00
#include "text.h"
// Returns the number of codepoints in the zero-terminated string s.
// Every byte accepted by utf8_is_codepoint_start() counts as one codepoint;
// all other bytes are skipped. The string is not validated.
u64 utf8_codepoint_count(const char *s)
{
    u64 total = 0;
    for(const char *cursor = s; *cursor != 0; cursor++)
    {
        if(!utf8_is_codepoint_start(cursor))
            continue;
        total++;
    }
    return total;
}
// Tells whether the byte at *s can begin a codepoint.
// Only continuation bytes (bit pattern 10xxxxxx) are rejected; every other
// byte value, including the zero-terminator, is reported as a start.
bool utf8_is_codepoint_start(const char *s)
{
    const unsigned char byte = (unsigned char)*s;
    const bool is_continuation = (byte >> 6) == 0b10;
    return !is_continuation;
}
// Returns how many bytes the codepoint starting at *s occupies (1 to 4),
// judging only by the lead byte. Returns 0 when *s is a continuation byte
// or when the lead byte matches none of the four known prefixes.
u32 utf8_codepoint_bytes(const char *s)
{
    if(!utf8_is_codepoint_start(s))
        return 0; // Error: This byte belongs to a previous codepoint

    const unsigned char lead = (unsigned char)*s;

    if(lead < 0x80)               // 0xxxxxxx
        return 1;
    if((lead >> 5) == 0b110)      // 110xxxxx
        return 2;
    if((lead >> 4) == 0b1110)     // 1110xxxx
        return 3;
    if((lead >> 3) == 0b11110)    // 11110xxx
        return 4;

    return 0;
}
/*
 * Decodes the codepoint that starts at s[current_index].
 *
 * On success returns the codepoint and stores the number of bytes it occupies
 * (1 to 4) in *bytes_read.
 * On failure returns 0 and stores 0 in *bytes_read. Failures are:
 *   - the byte at s[current_index] is a continuation byte (10xxxxxx),
 *   - the lead byte is not a valid prefix (11111xxx),
 *   - the string ends before the codepoint is complete,
 *   - a following byte is not a continuation byte.
 *
 * Note: a zero byte at s[current_index] is decoded like any other 1-byte
 * codepoint (returns 0 with *bytes_read == 1), so callers that walk a string
 * must test for the terminator themselves.
 * Overlong encodings and out-of-range values are not rejected.
 */
utf8_codepoint utf8_extract_codepoint(const char *s, u64 current_index, u32 *bytes_read)
{
    s += current_index;

    // UTF8:
    // First byte: (0xxxxxxx = 1 byte, 110xxxxx = 2 byte, 1110xxxx = 3 byte, 11110xxx = 4 byte)
    // Next bytes: 10xxxxxx
    // To get a Codepoint: concatenate all the xxxx
    utf8_codepoint codepoint = 0;
    *bytes_read = 0;
    u8 next_bytes = 0;

    if(!utf8_is_codepoint_start(s))
    {
        // Error: This byte belongs to a previous codepoint
        return 0;
    }

    if((*s & 0b10000000) == 0b00000000) //1 byte codepoint
    {
        codepoint = *s;
        next_bytes = 0;
    }
    else if((*s & 0b11100000) == 0b11000000) //2 bytes codepoint
    {
        codepoint = (*s & 0b00011111);
        next_bytes = 1;
    }
    else if((*s & 0b11110000) == 0b11100000) //3 bytes codepoint
    {
        codepoint = (*s & 0b00001111);
        next_bytes = 2;
    }
    else if((*s & 0b11111000) == 0b11110000) //4 bytes codepoint
    {
        codepoint = (*s & 0b00000111);
        next_bytes = 3;
    }
    else
    {
        // Error: 11111xxx is not a valid lead byte. Without this branch the
        // byte would be reported as a successfully decoded codepoint 0.
        return 0;
    }

    for(u8 i = 0; i < next_bytes; i++)
    {
        s++;
        if(*s == 0)
        {
            // Error: End of string reached before completing codepoint
            return 0;
        }
        if((*s & 0b11000000) != 0b10000000)
        {
            // Error: Byte prefix does not match with the expected one. Broken codepoint
            return 0;
        }
        // Append the 6 payload bits of this continuation byte
        codepoint = codepoint << 6;
        codepoint |= (*s & 0b00111111);
    }

    *bytes_read = next_bytes + 1;
    return codepoint;
}
// Returns how many bytes to advance from s[current_index] to reach the next
// byte that starts a codepoint, or the zero-terminator if none follows.
// Returns 0 when s[current_index] is already the zero-terminator.
u32 utf8_bytes_to_next_valid_codepoint(const char *s, u64 current_index)
{
    s += current_index;

    // Already at the end: there is nothing to advance to, and inspecting
    // s[1] would read past the terminator.
    if(*s == 0)
        return 0;

    u32 bytes = 1;
    while(*(s + bytes))
    {
        if(utf8_is_codepoint_start(s + bytes))
            break;
        bytes++;
    }
    return bytes;
}
// Returns how many bytes to step back from s[current_index] to reach the
// closest earlier byte that starts a codepoint. The search never goes before
// s[0]; if no start byte is found the result is current_index (0 when
// current_index is 0).
u32 utf8_bytes_to_prev_valid_codepoint(const char *s, u64 current_index)
{
    const char *origin = s + current_index;
    u64 distance = 0;

    while(distance < current_index)
    {
        distance++;
        const bool found = utf8_is_codepoint_start(origin - distance);
        if(found)
            return distance;
    }

    return distance;
}
// Decodes the zero-terminated UTF-8 string s into result, which has room for
// result_size codepoints. Decoding stops at the terminator, when result is
// full, or at the first sequence utf8_extract_codepoint() fails to decode.
//
// Returns the number of codepoints stored in result.
// *bytes_read receives the number of bytes of s that were consumed, or 0 if
// decoding stopped because of an error (codepoints decoded before the error
// stay in result and are still counted in the return value).
// bytes_read may be NULL if the caller does not need it.
u64 utf8_from_string(const char *s, u64 *bytes_read, utf8_codepoint *result, u64 result_size)
{
    u64 decoded = 0;
    u64 consumed = 0;

    while(*s && decoded < result_size)
    {
        u32 read = 0;
        result[decoded] = utf8_extract_codepoint(s, 0, &read);
        if(read == 0)
        {
            // Decoding error: report it through the byte count
            consumed = 0;
            break;
        }
        s += read;
        consumed += read;
        decoded++;
    }

    // Write through the pointer. Assigning to the pointer itself would leave
    // the caller's variable untouched.
    if(bytes_read)
        *bytes_read = consumed;

    return decoded;
}
// Encodes codepoints[0..count) as UTF-8 into result, a buffer of result_size
// bytes, and zero-terminates it.
//
// Encoding stops early at the first codepoint that does not fit in the
// remaining space (codepoints are never written partially) or that needs more
// than 21 bits. Returns the number of codepoints that were encoded.
// If result_size is 0 nothing is written and 0 is returned.
//
// NOTE(review): surrogate values and values above 0x10FFFF that still fit in
// 21 bits are encoded as-is, not rejected.
u64 utf8_to_string(utf8_codepoint *codepoints, u64 count, char *result, u64 result_size)
{
    // Without room for the zero-terminator nothing can be written at all
    if(result_size == 0)
        return 0;

    const u64 capacity = result_size - 1; // Last byte is reserved for the zero-terminator
    u64 i = 0;
    u64 result_i = 0;

    for(i = 0; i < count; i++)
    {
        utf8_codepoint cp = codepoints[i];

        // How many bytes does this codepoint need?
        u64 needed = 0;
        if((cp & 0xFFFFFF80) == 0)
            needed = 1;
        else if((cp & 0xFFFFF800) == 0)
            needed = 2;
        else if((cp & 0xFFFF0000) == 0)
            needed = 3;
        else if((cp & 0xFFE00000) == 0)
            needed = 4;
        else
            break; // Invalid codepoint

        // result_i never exceeds capacity, so the subtraction cannot wrap
        if(capacity - result_i < needed) // Not enough space left
            break;

        switch(needed)
        {
            case 1:
                result[result_i++] = (char)(cp & 0b01111111);
                break;
            case 2:
                result[result_i++] = (char)(0b11000000 | ((cp >> 6) & 0b00011111));
                result[result_i++] = (char)(0b10000000 | ((cp     ) & 0b00111111));
                break;
            case 3:
                result[result_i++] = (char)(0b11100000 | ((cp >> 12) & 0b00001111));
                result[result_i++] = (char)(0b10000000 | ((cp >>  6) & 0b00111111));
                result[result_i++] = (char)(0b10000000 | ((cp      ) & 0b00111111));
                break;
            default: // 4 bytes
                result[result_i++] = (char)(0b11110000 | ((cp >> 18) & 0b00000111));
                result[result_i++] = (char)(0b10000000 | ((cp >> 12) & 0b00111111));
                result[result_i++] = (char)(0b10000000 | ((cp >>  6) & 0b00111111));
                result[result_i++] = (char)(0b10000000 | ((cp      ) & 0b00111111));
                break;
        }
    }

    result[result_i] = 0;
    return i;
}