package unicodeconv;
# unicodeconv.pl - Unicode , UTF-8 convert to each other and to ShiftJIS code
# Jcode.pm使えという突っ込みは無しの方向で。(ぉ
# BigEndian -> BE, LittleEndian -> LE と省略しています。
# Byte Order Mark(BOM) は考慮していません。(というか含まれているとたぶん誤作動します。
# 変換テーブルに使われているのはWindows用の物です。適宜置き換えてください。(ですが、他のテーブルでの変換は未確認です。)
# コーディングに際して、以下のサイトに非常にお世話になりました。こんな場所でですが厚く御礼申し上げます。
# http://homepage3.nifty.com/hippo2000/
# http://homepage1.nifty.com/nomenclator/unicode/
# http://www.debian.or.jp/~kubota/
# そして、何度も初歩的な質問に答えて頂いた、WinMX上のチャット部屋の「初心者の部屋@最低人間」のShimizさん、mozさん、最低人間さん、
# 常連の皆様方に感謝いたします。
# このコードは未保証です。ある程度はテストしていますがバグがあるかもしれません。
# このコードの私に著作権が帰属する部分についての利用に関して、私が持つ権利を全て放棄します。
# インターフェース
# ○文字コードを変換します。
# ex. $str = &unicodeconv::utf2sjis ($buf);
# UTF-8、UnicodeBE、UnicodeLE、ShiftJISを相互に変換出来ます。
# sjis2uBE sjis2uLE sjis2utf ShiftSJISからの変換。
# uBE2sjis uBE2uLE uBE2utf UnicodeBEからの変換。
# uLE2sjis uLE2uBE uLE2utf UnicodeLEからの変換。
# utf2sjis utf2uBE utf2uLE UTF-8からの変換。
# ○文字列を16進数表示の数字列に、或いはその逆を行います。
# ex. $ascii = &unicodeconv::toascii ($str);
# toascii 'ABCDEFG' -> '41424344454647'
# tobinary '41424344454647' -> 'ABCDEFG'
my $ver = '0.1';
my %table2sjis; # UnicodeBE -> ShiftJIS convert table
my %table2uBE; # ShiftJIS -> UnicodeBE convert table
unless ($unicodeconvver) { # UnicodeBE <-> SJIS Table
my $tabletxt = './lib/CP932.TXT'; # ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT
my $sjis, $uBE;
my $err = 0;
open (TABLE, "<$tabletxt") or die "error while opening $tabletxt";
while (
) {
if (/^#/) { next; }
($sjis, $uBE) = split;
$sjis = substr ($sjis, 2);
$uBE = substr ($uBE, 2);
if ($table2sjis{$uBE}) { next; } # 外字で再定義されているのを無視
$table2sjis{$uBE} = $sjis;
$table2uBE{$sjis} = $uBE;
}
unless ($err) {
$unicodeconvver = "unicodeconv.pl, ver $ver";
}
}
sub sjis2uBE
{
my ($sjis, undef) = @_;
if ($err) { return $sjis; }
my $uBE = "";
$sjis = &toascii ($sjis);
$uBE = &_sjis2uBE ($sjis);
$uBE = &tobinary ($uBE);
return $uBE;
}
sub _sjis2uBE
{
my ($sjis, undef) = @_;
if ($err) { return $sjis; }
my $uBE = "";
my $code;
my $buf;
while ($sjis) {
$code = substr ($sjis, 0, 2);
$buf = hex ($code);
if ((0x80 < $buf && $buf < 0xA0) || 0xDF < $buf) {
$sjis = substr ($sjis, 2);
$code .= substr ($sjis, 0, 2);
}
$uBE .= $table2uBE{$code};
$sjis = substr ($sjis, 2);
}
return $uBE;
}
sub sjis2uLE
{
my ($sjis, undef) = @_;
if ($err) { return $sjis; }
my $uLE = "";
$uLE = &uBE2uLE (&sjis2uBE ($sjis));
return $uLE;
}
sub sjis2utf
{
my ($sjis, undef) = @_;
if ($err) { return $sjis; }
my $utf = "";
$sjis = &toascii ($sjis);
$utf = &_uBE2utf (&_sjis2uBE($sjis));
$utf = &tobinary ($utf);
return $utf;
}
sub uBE2sjis
{
my ($uBE, undef) = @_;
if ($err) { return $uBE; }
my $sjis = "";
$uBE = &toascii ($uBE);
$sjis = &_uBE2sjis ($uBE);
$sjis = &tobinary ($sjis);
return $sjis;
}
sub _uBE2sjis
{
my ($uBE, undef) = @_;
if ($err) { return $uBE; }
my $sjis = "";
my $code;
while ($uBE) {
$code = substr ($uBE, 0, 4);
$sjis .= $table2sjis{$code};
$uBE = substr ($uBE, 4);
}
return $sjis;
}
sub uBE2uLE
{
my ($uBE, undef) = @_;
my $uLE = "";
while ($uBE) {
$uLE .= substr ($uBE, 1, 1) . substr ($uBE, 0, 1);
$uBE = substr ($uBE, 2);
}
return $uLE;
}
sub uBE2utf
{
my ($uBE, undef) = @_;
my $utf = "";
$uBE = &toascii ($uBE);
$utf = &_uBE2utf ($uBE);
$utf = &tobinary ($utf);
return $utf;
}
sub _uBE2utf
{
my ($uBE, undef) = @_;
my $utf = "";
my $code;
while ($uBE) {
$code = substr ($uBE, 0, 4);
$buf = hex ($code);
if (0xD800 <= $buf && $buf <= 0xDFFF) { # illegal
$uBE = substr ($uBE, 4);
next;
}
if ($buf < 0x80) {
$utf .= substr ($code , 2);
}
elsif ($buf < 0x800) {
$utf .= sprintf ("%02X", (($buf & 0x7C0) >> 6) | 0xC0) . sprintf ("%02X", ($buf & 0x3F) | 0x80);
}
else {
$utf .= sprintf ("%02X", (($buf & 0xF000) >> 12) | 0xE0) . sprintf ("%02X", (($buf & 0xFC0) >> 6) | 0x80) . sprintf ("%02X", ($buf & 0x3F) | 0x80);
}
$uBE = substr ($uBE, 4);
}
return $utf;
}
sub uLE2sjis
{
my ($uLE, undef) = @_;
if ($err) { return $uLE; }
my $sjis = "";
$sjis = &uBE2sjis (&uLE2uBE ($uLE));
return $sjis;
}
sub uLE2uBE
{
my ($uLE, undef) = @_;
my $uBE = "";
$uBE = &uBE2uLE ($uLE);
return $uBE;
}
sub uLE2utf
{
my ($uLE, undef) = @_;
my $utf = "";
$utf = &uBE2utf (&uLE2uBE ($uLE));
return $utf;
}
sub utf2sjis
{
my ($utf, undef) = @_;
if ($err) { return $utf; }
my $sjis = "";
$utf = &toascii ($utf);
$sjis = &_uBE2sjis (&_utf2uBE ($utf));
$sjis = &tobinary ($sjis);
return $sjis;
}
sub utf2uBE
{
my ($utf, undef) = @_;
my $uBE = "";
$utf = &toascii ($utf);
$uBE = &_utf2uBE ($utf);
$uBE = &tobinary ($uBE);
return $uBE;
}
sub _utf2uBE
{
my ($utf, undef) = @_;
my $uBE = "";
my $code;
my $code1, $code2, $code3;
my $len;
$len = 0;
while ($utf) {
if ($len == 0) {
$code = substr ($utf, 0, 2);
$code1 = hex ($code);
if ($code1 < 0x80) {
$len = 1;
}
elsif (0xC2 <= $code1 && $code1 <= 0xDF) {
$len = 2;
}
elsif (0xE0 <= $code1) {
$len = 3;
}
else { # illegal
$len = 0;
$utf = substr ($utf, 2);
next;
}
}
if ($len == 1) { # 00 - 7F & A1 - DF
$uBE .= '00'.$code;
}
elsif ($len == 2) { # C2 80 - DF BF (0x0080 - 0x07FF)
$code2 = hex (substr ($utf, 2, 2));
if ($code2 < 0x80 || 0xBF < $code2) { # illegal
$len = 0;
$utf = substr ($utf, 4);
next;
}
$uBE .= sprintf ("%04X", (($code1 & 0x1F) << 6) | ($code2 & 0x3F));
}
elsif ($len == 3) { # E0 A0 80 - EF BF BF (0x0800 - 0xFFFF)
$code2 = hex (substr ($utf, 2, 2));
if ($code2 < 0x80 || 0xBF < $code2) { # illegal
$len = 0;
$utf = substr ($utf, 4);
next;
}
$code3 = hex (substr ($utf, 4, 2));
if ($code3 < 0x80 || 0xBF < $code3) { # illegal
$len = 0;
$utf = substr ($utf, 6);
next;
}
if (($code1 == 0xE0 && $code2 < 0xA0) || ($code1 == 0xED && 0x9F < code2)) { # illegal
$len = 0;
$utf = substr ($utf, 6);
next;
}
$uBE .= sprintf ("%04X", (($code1 & 0x0F) << 12) | (($code2 & 0x3F) << 6) | ($code3 & 0x3F));
}
else {
$len = 0;
next;
}
$utf = substr ($utf, $len * 2);
$len = 0;
}
return $uBE;
}
sub utf2uLE
{
my ($utf, undef) = @_;
my $uLE = "";
$uLE = &uBE2uLE (&utf2uBE ($utf));
return $uLE;
}
sub toascii # '\xXX' -> 'XX'
{
my ($bin, undef) = @_;
my $asc = "";
my $len = 0;
$len = length ($bin); # while ($bin) : $bin が0x30で終わっていると最後が'0'で取り出せない。
while ($len) {
$asc .= sprintf ("%02X", unpack ("C", substr ($bin, 0, 1)));
$bin = substr ($bin, 1);
$len--;
}
return $asc;
}
sub tobinary # <-> toascii ()
{
my ($asc, undef) = @_;
my $bin = "";
while ($asc) {
$bin .= pack ("C", hex (substr ($asc, 0, 2)));
$asc = substr ($asc, 2);
}
return $bin;
}
1;