package unicodeconv; # unicodeconv.pl - Unicode , UTF-8 convert to each other and to ShiftJIS code # Jcode.pm使えという突っ込みは無しの方向で。(ぉ # BigEndian -> BE, LittleEndian -> LE と省略しています。 # Byte Order Mark(BOM) は考慮していません。(というか含まれているとたぶん誤作動します。 # 変換テーブルに使われているのはWindows用の物です。適宜置き換えてください。(ですが、他のテーブルでの変換は未確認です。) # コーディングに際して、以下のサイトに非常にお世話になりました。こんな場所でですが厚く御礼申し上げます。 # http://homepage3.nifty.com/hippo2000/ # http://homepage1.nifty.com/nomenclator/unicode/ # http://www.debian.or.jp/~kubota/ # そして、何度も初歩的な質問に答えて頂いた、WinMX上のチャット部屋の「初心者の部屋@最低人間」のShimizさん、mozさん、最低人間さん、 # 常連の皆様方に感謝いたします。 # このコードは未保証です。ある程度はテストしていますがバグがあるかもしれません。 # このコードの私に著作権が帰属する部分についての利用に関して、私が持つ権利を全て放棄します。 # インターフェース # ○文字コードを変換します。 # ex. $str = &unicodeconv::utf2sjis ($buf); # UTF-8、UnicodeBE、UnicodeLE、ShiftJISを相互に変換出来ます。 # sjis2uBE sjis2uLE sjis2utf ShiftSJISからの変換。 # uBE2sjis uBE2uLE uBE2utf UnicodeBEからの変換。 # uLE2sjis uLE2uBE uLE2utf UnicodeLEからの変換。 # utf2sjis utf2uBE utf2uLE UTF-8からの変換。 # ○文字列を16進数表示の数字列に、或いはその逆を行います。 # ex. $ascii = &unicodeconv::toascii ($str); # toascii 'ABCDEFG' -> '41424344454647' # tobinary '41424344454647' -> 'ABCDEFG' my $ver = '0.1'; my %table2sjis; # UnicodeBE -> ShiftJIS convert table my %table2uBE; # ShiftJIS -> UnicodeBE convert table unless ($unicodeconvver) { # UnicodeBE <-> SJIS Table my $tabletxt = './lib/CP932.TXT'; # ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT my $sjis, $uBE; my $err = 0; open (TABLE, "<$tabletxt") or die "error while opening $tabletxt"; while () { if (/^#/) { next; } ($sjis, $uBE) = split; $sjis = substr ($sjis, 2); $uBE = substr ($uBE, 2); if ($table2sjis{$uBE}) { next; } # 外字で再定義されているのを無視 $table2sjis{$uBE} = $sjis; $table2uBE{$sjis} = $uBE; } unless ($err) { $unicodeconvver = "unicodeconv.pl, ver $ver"; } } sub sjis2uBE { my ($sjis, undef) = @_; if ($err) { return $sjis; } my $uBE = ""; $sjis = &toascii ($sjis); $uBE = &_sjis2uBE ($sjis); $uBE = &tobinary ($uBE); return $uBE; } sub _sjis2uBE { my ($sjis, undef) = @_; if ($err) { return $sjis; } my $uBE = ""; my $code; my $buf; while ($sjis) { $code = substr ($sjis, 0, 2); $buf = hex ($code); if ((0x80 < $buf && $buf < 0xA0) || 0xDF < $buf) { $sjis = substr ($sjis, 2); $code .= substr ($sjis, 0, 2); } $uBE .= $table2uBE{$code}; $sjis = substr ($sjis, 2); } return $uBE; } sub sjis2uLE { my ($sjis, undef) = @_; if ($err) { return $sjis; } my $uLE = ""; $uLE = &uBE2uLE (&sjis2uBE ($sjis)); return $uLE; } sub sjis2utf { my ($sjis, undef) = @_; if ($err) { return $sjis; } my $utf = ""; $sjis = &toascii ($sjis); $utf = &_uBE2utf (&_sjis2uBE($sjis)); $utf = &tobinary ($utf); return $utf; } sub uBE2sjis { my ($uBE, undef) = @_; if ($err) { return $uBE; } my $sjis = ""; $uBE = &toascii ($uBE); $sjis = &_uBE2sjis ($uBE); $sjis = &tobinary ($sjis); return $sjis; } sub _uBE2sjis { my ($uBE, undef) = @_; if ($err) { return $uBE; } my $sjis = ""; my $code; while ($uBE) { $code = substr ($uBE, 0, 4); $sjis .= $table2sjis{$code}; $uBE = substr ($uBE, 4); } return $sjis; } sub uBE2uLE { my ($uBE, undef) = @_; my $uLE = ""; while ($uBE) { $uLE .= substr ($uBE, 1, 1) . substr ($uBE, 0, 1); $uBE = substr ($uBE, 2); } return $uLE; } sub uBE2utf { my ($uBE, undef) = @_; my $utf = ""; $uBE = &toascii ($uBE); $utf = &_uBE2utf ($uBE); $utf = &tobinary ($utf); return $utf; } sub _uBE2utf { my ($uBE, undef) = @_; my $utf = ""; my $code; while ($uBE) { $code = substr ($uBE, 0, 4); $buf = hex ($code); if (0xD800 <= $buf && $buf <= 0xDFFF) { # illegal $uBE = substr ($uBE, 4); next; } if ($buf < 0x80) { $utf .= substr ($code , 2); } elsif ($buf < 0x800) { $utf .= sprintf ("%02X", (($buf & 0x7C0) >> 6) | 0xC0) . sprintf ("%02X", ($buf & 0x3F) | 0x80); } else { $utf .= sprintf ("%02X", (($buf & 0xF000) >> 12) | 0xE0) . sprintf ("%02X", (($buf & 0xFC0) >> 6) | 0x80) . sprintf ("%02X", ($buf & 0x3F) | 0x80); } $uBE = substr ($uBE, 4); } return $utf; } sub uLE2sjis { my ($uLE, undef) = @_; if ($err) { return $uLE; } my $sjis = ""; $sjis = &uBE2sjis (&uLE2uBE ($uLE)); return $sjis; } sub uLE2uBE { my ($uLE, undef) = @_; my $uBE = ""; $uBE = &uBE2uLE ($uLE); return $uBE; } sub uLE2utf { my ($uLE, undef) = @_; my $utf = ""; $utf = &uBE2utf (&uLE2uBE ($uLE)); return $utf; } sub utf2sjis { my ($utf, undef) = @_; if ($err) { return $utf; } my $sjis = ""; $utf = &toascii ($utf); $sjis = &_uBE2sjis (&_utf2uBE ($utf)); $sjis = &tobinary ($sjis); return $sjis; } sub utf2uBE { my ($utf, undef) = @_; my $uBE = ""; $utf = &toascii ($utf); $uBE = &_utf2uBE ($utf); $uBE = &tobinary ($uBE); return $uBE; } sub _utf2uBE { my ($utf, undef) = @_; my $uBE = ""; my $code; my $code1, $code2, $code3; my $len; $len = 0; while ($utf) { if ($len == 0) { $code = substr ($utf, 0, 2); $code1 = hex ($code); if ($code1 < 0x80) { $len = 1; } elsif (0xC2 <= $code1 && $code1 <= 0xDF) { $len = 2; } elsif (0xE0 <= $code1) { $len = 3; } else { # illegal $len = 0; $utf = substr ($utf, 2); next; } } if ($len == 1) { # 00 - 7F & A1 - DF $uBE .= '00'.$code; } elsif ($len == 2) { # C2 80 - DF BF (0x0080 - 0x07FF) $code2 = hex (substr ($utf, 2, 2)); if ($code2 < 0x80 || 0xBF < $code2) { # illegal $len = 0; $utf = substr ($utf, 4); next; } $uBE .= sprintf ("%04X", (($code1 & 0x1F) << 6) | ($code2 & 0x3F)); } elsif ($len == 3) { # E0 A0 80 - EF BF BF (0x0800 - 0xFFFF) $code2 = hex (substr ($utf, 2, 2)); if ($code2 < 0x80 || 0xBF < $code2) { # illegal $len = 0; $utf = substr ($utf, 4); next; } $code3 = hex (substr ($utf, 4, 2)); if ($code3 < 0x80 || 0xBF < $code3) { # illegal $len = 0; $utf = substr ($utf, 6); next; } if (($code1 == 0xE0 && $code2 < 0xA0) || ($code1 == 0xED && 0x9F < code2)) { # illegal $len = 0; $utf = substr ($utf, 6); next; } $uBE .= sprintf ("%04X", (($code1 & 0x0F) << 12) | (($code2 & 0x3F) << 6) | ($code3 & 0x3F)); } else { $len = 0; next; } $utf = substr ($utf, $len * 2); $len = 0; } return $uBE; } sub utf2uLE { my ($utf, undef) = @_; my $uLE = ""; $uLE = &uBE2uLE (&utf2uBE ($utf)); return $uLE; } sub toascii # '\xXX' -> 'XX' { my ($bin, undef) = @_; my $asc = ""; my $len = 0; $len = length ($bin); # while ($bin) : $bin が0x30で終わっていると最後が'0'で取り出せない。 while ($len) { $asc .= sprintf ("%02X", unpack ("C", substr ($bin, 0, 1))); $bin = substr ($bin, 1); $len--; } return $asc; } sub tobinary # <-> toascii () { my ($asc, undef) = @_; my $bin = ""; while ($asc) { $bin .= pack ("C", hex (substr ($asc, 0, 2))); $asc = substr ($asc, 2); } return $bin; } 1;