misc/codepoint_width: add unicode width detection support

Add a 4-stage trie to look up Unicode codepoint widths and grapheme join
rules.

Generated by GraphemeTableGen from Microsoft Terminal (MIT Licence):
a7e47b711a/src/tools/GraphemeTableGen/Program.cs

With minor adjustments to use it in a C codebase:
- Replaced constexpr with static
- Replaced auto with explicit types

Generated from Unicode 16.0.0:
ucd.nounihan.grouped.xml: sha256(b11c2d23673bae660fff8ddcd3c1de4d54bdf6c60188a07696b010282f515fcf)
This commit is contained in:
Kacper Michajłow
2024-09-21 16:32:27 +02:00
parent fa77ad46cb
commit 95f0046309
6 changed files with 823 additions and 34 deletions

View File

@@ -25,14 +25,13 @@
#include "mpv_talloc.h"
#include "misc/bstr.h"
#include "common/common.h"
#include "common/global.h"
#include "misc/bstr.h"
#include "misc/codepoint_width.h"
#include "options/options.h"
#include "options/path.h"
#include "osdep/terminal.h"
#include "osdep/io.h"
#include "osdep/terminal.h"
#include "osdep/threads.h"
#include "osdep/timer.h"
@@ -374,37 +373,6 @@ static bool test_terminal_level(struct mp_log *log, int lev)
!(lev == MSGL_STATUS && terminal_in_background());
}
// Rough estimate of how many terminal columns a string occupies.
//
// - "ESC [" control sequences are skipped up to and including their final
//   byte (any byte in the range '@'..'~') and contribute no width.
// - '\n' contributes no width.
// - '\r' resets the running width to 0, i.e. everything before it is treated
//   as discarded.
// - Every other code point counts as exactly one column; wide and combining
//   characters are not taken into account.
//
// Returns 0 for the whole string as soon as bstr_split_utf8() yields an empty
// code point (presumably on malformed UTF-8 - confirm against bstr.c).
static int term_disp_width(bstr str)
{
    int cols = 0;

    while (str.len) {
        if (bstr_eatstart0(&str, "\033[")) {
            // Skip parameter/intermediate bytes until the final byte.
            for (; str.len; str = bstr_cut(str, 1)) {
                if (*str.start >= '@' && *str.start <= '~')
                    break;
            }
            // Drop the final byte itself as well.
            str = bstr_cut(str, 1);
            continue;
        }

        bstr cp = bstr_split_utf8(str, &str);
        if (!cp.len)
            return 0;

        if (cp.len == 1) {
            switch (*cp.start) {
            case '\n':
                // Line feeds take up no columns.
                continue;
            case '\r':
                // Simplification: a carriage return discards what came before.
                cols = 0;
                continue;
            default:
                break;
            }
        }

        // Only single-width characters are supported.
        cols++;
    }

    return cols;
}
static void append_terminal_line(struct mp_log *log, int lev,
bstr text, bstr *term_msg, int *line_w)
{