#include "text.h"

/*
 * UTF-8 helpers operating on NUL-terminated byte strings.
 *
 * Encoding reference:
 *   lead byte:         0xxxxxxx (1 byte), 110xxxxx (2 bytes),
 *                      1110xxxx (3 bytes), 11110xxx (4 bytes)
 *   continuation byte: 10xxxxxx
 * A codepoint is the concatenation of all the x bits, most significant first.
 *
 * NOTE(review): the code assumes utf8_codepoint is an unsigned integer type of
 * at least 32 bits declared in text.h -- confirm.
 */

/*
 * Classifies a lead byte and returns the total length (1..4) of the sequence
 * it introduces. Returns 0 for a continuation byte (10xxxxxx) and for bytes
 * that can never start a sequence (11111xxx).
 */
static u32 utf8_lead_byte_length(unsigned char lead)
{
    if((lead & 0x80) == 0x00) // 0xxxxxxx
        return 1;
    if((lead & 0xE0) == 0xC0) // 110xxxxx
        return 2;
    if((lead & 0xF0) == 0xE0) // 1110xxxx
        return 3;
    if((lead & 0xF8) == 0xF0) // 11110xxx
        return 4;
    return 0;
}

/*
 * Returns the number of codepoints in the NUL-terminated string s, computed as
 * the number of bytes that are not continuation bytes. No validation is done.
 */
u64 utf8_codepoint_count(const char *s)
{
    u64 count = 0;

    for(; *s; s++)
    {
        if(utf8_is_codepoint_start(s))
            count++;
    }
    return count;
}

/*
 * Returns true when the byte at s is not a continuation byte (10xxxxxx).
 * Note that this is also true for the NUL terminator and for invalid lead
 * bytes such as 0xFF.
 */
bool utf8_is_codepoint_start(const char *s)
{
    return (*s & 0xC0) != 0x80;
}

/*
 * Returns the length in bytes (1..4) announced by the lead byte at s, or 0 if
 * that byte is a continuation byte or an invalid lead byte. Only the first
 * byte is inspected; the continuation bytes are not validated.
 */
u32 utf8_codepoint_bytes(const char *s)
{
    return utf8_lead_byte_length((unsigned char)*s);
}

/*
 * Decodes the codepoint that starts at s[current_index].
 *
 * On success the codepoint is returned and *bytes_read is set to the number of
 * bytes consumed (1..4).
 *
 * If *bytes_read is 0 on return, we either reached the end of the string or
 * there was a decoding error (the byte is a continuation byte, is not a valid
 * lead byte, or the sequence is truncated/broken); the return value is 0.
 *
 * Overlong encodings and surrogate values are not rejected.
 */
utf8_codepoint utf8_extract_codepoint(const char *s, u64 current_index, u32 *bytes_read)
{
    // Payload bits carried by the lead byte, indexed by sequence length.
    static const unsigned char lead_payload_mask[5] = { 0x00, 0x7F, 0x1F, 0x0F, 0x07 };

    s += current_index;
    *bytes_read = 0;

    if(*s == 0)
    {
        // End of string: nothing to decode
        return 0;
    }

    u32 length = utf8_lead_byte_length((unsigned char)*s);
    if(length == 0)
    {
        // Error: this byte belongs to a previous codepoint, or it is a byte
        // value that can never start a UTF-8 sequence
        return 0;
    }

    utf8_codepoint codepoint = (unsigned char)*s & lead_payload_mask[length];

    for(u32 i = 1; i < length; i++)
    {
        unsigned char byte = (unsigned char)s[i];

        if((byte & 0xC0) != 0x80)
        {
            // Error: broken codepoint. Either the string ended early (NUL is
            // not a continuation byte) or the byte prefix is not 10xxxxxx.
            return 0;
        }
        codepoint = (codepoint << 6) | (byte & 0x3F);
    }

    *bytes_read = length;
    return codepoint;
}

/*
 * Returns how many bytes to advance from current_index to land on the next
 * byte that is not a continuation byte, or on the NUL terminator, whichever
 * comes first. Returns 0 when s[current_index] is already the terminator, so
 * the scan never reads past the end of the string.
 */
u32 utf8_bytes_to_next_valid_codepoint(const char *s, u64 current_index)
{
    s += current_index;

    if(*s == 0)
        return 0;

    u64 bytes = 1;
    while(s[bytes] && !utf8_is_codepoint_start(s + bytes))
        bytes++;

    return (u32)bytes;
}

/*
 * Returns how many bytes to step back from current_index to land on the
 * closest preceding byte that is not a continuation byte. Never steps before
 * the beginning of s; returns 0 when current_index is 0.
 */
u32 utf8_bytes_to_prev_valid_codepoint(const char *s, u64 current_index)
{
    s += current_index;

    u64 bytes = 0;
    while(bytes < current_index)
    {
        bytes++;
        if(utf8_is_codepoint_start(s - bytes))
            break;
    }
    return (u32)bytes;
}

/*
 * Decodes up to result_size codepoints from the NUL-terminated string s into
 * result and returns the number of codepoints decoded.
 *
 * bytes_read (optional, may be NULL) receives the number of bytes consumed
 * from s. If a decoding error stops the conversion, *bytes_read is set to 0;
 * the codepoints decoded before the error remain in result and are still
 * counted in the return value.
 */
u64 utf8_from_string(const char *s, u64 *bytes_read, utf8_codepoint *result, u64 result_size)
{
    u64 decoded = 0;
    u64 consumed = 0;

    while(*s && decoded < result_size)
    {
        u32 read = 0;

        result[decoded] = utf8_extract_codepoint(s, 0, &read);
        if(read == 0)
        {
            // Decoding error: report it through a zero byte count
            consumed = 0;
            break;
        }
        s += read;
        consumed += read;
        decoded++;
    }

    // Write through the pointer; assigning to the pointer itself would leave
    // the caller's variable untouched.
    if(bytes_read)
        *bytes_read = consumed;

    return decoded;
}

/*
 * Encodes count codepoints as UTF-8 into result, which has room for
 * result_size bytes including the NUL terminator. The output is always
 * NUL-terminated when result_size > 0.
 *
 * Returns the number of codepoints that were fully encoded. Encoding stops
 * early at the first codepoint that does not fit in the remaining space or
 * that needs more than 21 bits. Values are not checked against the surrogate
 * range or the 0x10FFFF limit.
 */
u64 utf8_to_string(utf8_codepoint *codepoints, u64 count, char *result, u64 result_size)
{
    if(result_size == 0)
    {
        // Not even the terminator fits; do not touch the buffer
        return 0;
    }

    u64 capacity = result_size - 1; // Reserve space for zero-terminator
    u64 i = 0;
    u64 result_i = 0;

    for(i = 0; i < count; i++)
    {
        utf8_codepoint cp = codepoints[i];
        u32 needed;

        if((cp & 0xFFFFFF80) == 0)      // fits in 7 bits
            needed = 1;
        else if((cp & 0xFFFFF800) == 0) // fits in 11 bits
            needed = 2;
        else if((cp & 0xFFFF0000) == 0) // fits in 16 bits
            needed = 3;
        else if((cp & 0xFFE00000) == 0) // fits in 21 bits
            needed = 4;
        else
            break; // Invalid codepoint

        if(needed > capacity - result_i)
            break; // Not enough space left

        switch(needed)
        {
        case 1:
            result[result_i++] = (char)(cp & 0x7F);
            break;
        case 2:
            result[result_i++] = (char)(0xC0 | ((cp >> 6) & 0x1F));
            result[result_i++] = (char)(0x80 | (cp & 0x3F));
            break;
        case 3:
            result[result_i++] = (char)(0xE0 | ((cp >> 12) & 0x0F));
            result[result_i++] = (char)(0x80 | ((cp >> 6) & 0x3F));
            result[result_i++] = (char)(0x80 | (cp & 0x3F));
            break;
        default: // 4 bytes
            result[result_i++] = (char)(0xF0 | ((cp >> 18) & 0x07));
            result[result_i++] = (char)(0x80 | ((cp >> 12) & 0x3F));
            result[result_i++] = (char)(0x80 | ((cp >> 6) & 0x3F));
            result[result_i++] = (char)(0x80 | (cp & 0x3F));
            break;
        }
    }

    result[result_i] = 0;
    return i;
}