[ttssh2-commit] [10325] 同じ Unicode block にある文字だけを、ヴィラーマによって結合するようにした

Back to archive index
scmno****@osdn***** scmno****@osdn*****
2022年 10月 18日 (火) 22:01:25 JST


Revision: 10325
          https://osdn.net/projects/ttssh2/scm/svn/commits/10325
Author:   nmaya
Date:     2022-10-18 22:01:25 +0900 (Tue, 18 Oct 2022)
Log Message:
-----------
同じ Unicode block にある文字だけを、ヴィラーマによって結合するようにした

Unicode block のテーブルを作成するスクリプトを追加した
ticket #44424

Ticket Links:
------------
    https://osdn.net/projects/ttssh2/tracker/detail/44424

Modified Paths:
--------------
    trunk/teraterm/teraterm/buffer.c
    trunk/teraterm/teraterm/unicode/readme.md
    trunk/teraterm/teraterm/unicode.cpp
    trunk/teraterm/teraterm/unicode.h

Added Paths:
-----------
    trunk/teraterm/teraterm/unicode/get_block_table.md
    trunk/teraterm/teraterm/unicode/get_block_table.pl

-------------- next part --------------
Modified: trunk/teraterm/teraterm/buffer.c
===================================================================
--- trunk/teraterm/teraterm/buffer.c	2022-10-17 15:19:42 UTC (rev 10324)
+++ trunk/teraterm/teraterm/buffer.c	2022-10-18 13:01:25 UTC (rev 10325)
@@ -2740,7 +2740,16 @@
 
 	// ヴィラーマ処理
 	if (UnicodeIsVirama(p->u32_last) != 0) {
-		return p;
+		// 1つ前のヴィラーマと同じ block の文字である
+		int block_index_last = UnicodeBlockIndex(p->u32_last);
+		int block_index = UnicodeBlockIndex(u32);
+#if 0
+		OutputDebugPrintf("U+%06x, %d, %s\n", p->u32_last, block_index_last, UnicodeBlockName(block_index_last));
+		OutputDebugPrintf("U+%06x, %d, %s\n", u32, block_index, UnicodeBlockName(block_index));
+#endif
+		if (block_index_last == block_index) {
+			return p;
+		}
 	}
 	return NULL;
 }

Added: trunk/teraterm/teraterm/unicode/get_block_table.md
===================================================================
--- trunk/teraterm/teraterm/unicode/get_block_table.md	                        (rev 0)
+++ trunk/teraterm/teraterm/unicode/get_block_table.md	2022-10-18 13:01:25 UTC (rev 10325)
@@ -0,0 +1,16 @@
+# unicode の block
+
+- 元情報 unicode.org
+  - https://www.unicode.org/reports/tr44/#Blocks.txt
+
+# テーブルの作り方
+
+- Blocks.txt をダウンロード
+- スクリプトを実行
+- unicode_block.tbl が出力される
+
+実行例
+```
+wget https://www.unicode.org/Public/UCD/latest/ucd/Blocks.txt
+perl get_block_table.pl
+```

Added: trunk/teraterm/teraterm/unicode/get_block_table.pl
===================================================================
--- trunk/teraterm/teraterm/unicode/get_block_table.pl	                        (rev 0)
+++ trunk/teraterm/teraterm/unicode/get_block_table.pl	2022-10-18 13:01:25 UTC (rev 10325)
@@ -0,0 +1,34 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+use utf8;
+
+my $src_file = "Blocks.txt";
+my $fname_out = "unicode_block.tbl";
+
+open(FILE, $src_file) || die "Cannot open $src_file.";
+open(OUT, ">:crlf:utf8", $fname_out) || die "Cannot open $fname_out.";
+print OUT "// this file was generated by get_block_table.pl\n";
+
+my $v = <FILE>;
+chop($v);
+print OUT "// $v\n";
+$v = <FILE>;
+chop($v);
+print OUT "// $v\n";
+
+my $ostart = -1;
+my $otype = "";
+my $oend = 0;
+my $type;
+while(my $a = <FILE>) {
+	if ($a =~ /^([0-9A-F]+)\.\.([0-9A-F]+); ([- 0-9A-Za-z]+)/) {
+		my $start = hex $1;
+		my $end = hex $2;
+		my $name = $3;
+		printf(OUT "{ 0x%06x, 0x%06x, \"$name\" },\n", $start, $end);
+	} else {
+		next;
+	}
+}

Modified: trunk/teraterm/teraterm/unicode/readme.md
===================================================================
--- trunk/teraterm/teraterm/unicode/readme.md	2022-10-17 15:19:42 UTC (rev 10324)
+++ trunk/teraterm/teraterm/unicode/readme.md	2022-10-18 13:01:25 UTC (rev 10325)
@@ -44,6 +44,11 @@
 - 絵文字判定のためのテーブル
 - [get_emoji_table.md](get_emoji_table.md)
 
+## [unicode_block.tbl](../unicode_block.tbl)
+
+- Unicode block のテーブル
+- [get_block_table.md](get_block_table.md)
+
 ## iso8859-X.md
 
 - [iso8859.md](iso8859.md)

Modified: trunk/teraterm/teraterm/unicode.cpp
===================================================================
--- trunk/teraterm/teraterm/unicode.cpp	2022-10-17 15:19:42 UTC (rev 10324)
+++ trunk/teraterm/teraterm/unicode.cpp	2022-10-18 13:01:25 UTC (rev 10325)
@@ -100,6 +100,16 @@
 	unsigned char category;
 } UnicodeTableCombine_t;
 
+typedef struct {
+	unsigned long code_from;
+	unsigned long code_to;
+	char *block_name;
+} UnicodeTableBlock_t;
+
+const UnicodeTableBlock_t UnicodeBlockList[] = {
+#include "unicode_block.tbl"
+};
+
 /**
  * u32がテーブルのデータに含まれているか調べる
  *
@@ -165,6 +175,39 @@
 	return -1;
 }
 
+/**
+ *	SearchTableSimple() と同じ
+ *	テーブルの型が異なる
+ *
+ *	@retval		テーブルのindex
+ *	@retval		-1 テーブルに存在しない
+ */
+static int SearchTableBlock(
+	const UnicodeTableBlock_t *table, size_t table_size,
+	unsigned long u32)
+{
+	if (u32 < table[0].code_from) {
+		return -1;
+	}
+	if (u32 > table[table_size-1].code_to) {
+		return -1;
+	}
+	size_t low = 0;
+	size_t high = table_size - 1;
+	while (low <= high) {
+		size_t mid = (low + high) / 2;
+		if (table[mid].code_from <= u32 && u32 <= table[mid].code_to) {
+			return (int)mid;
+		} else if (table[mid].code_to < u32) {
+			low = mid + 1;
+		} else {
+			high = mid - 1;
+		}
+	}
+	// テーブルの範囲外
+	return -1;
+}
+
 /*
  * 結合文字か検査する
  *		次の文字も結合文字として扱う
@@ -244,7 +287,25 @@
 	return index != -1 ? 1 : 0;
 }
 
+/**
+ *	Unicode block の index を得る
+ *
+ *	@retval	-1	block が見つからない
+ *	@retval		block の index
+ */
+int UnicodeBlockIndex(unsigned long u32)
+{
+	return SearchTableBlock(UnicodeBlockList, _countof(UnicodeBlockList), u32);
+}
 
+char *UnicodeBlockName(int index)
+{
+	if (index == -1) {
+		return "";
+	}
+	return UnicodeBlockList[index].block_name;
+}
+
 #if 0
 int main(int, char *[])
 {

Modified: trunk/teraterm/teraterm/unicode.h
===================================================================
--- trunk/teraterm/teraterm/unicode.h	2022-10-17 15:19:42 UTC (rev 10324)
+++ trunk/teraterm/teraterm/unicode.h	2022-10-18 13:01:25 UTC (rev 10325)
@@ -40,6 +40,8 @@
 int UnicodeFromISO8859(int part, unsigned char b, unsigned short *u16);
 int UnicodeToISO8859(int part, unsigned long u32, unsigned char *b);
 int UnicodeIsVirama(unsigned long u32);
+int UnicodeBlockIndex(unsigned long u32);
+char *UnicodeBlockName(int);
 
 #ifdef __cplusplus
 }


ttssh2-commit メーリングリストの案内
Back to archive index