use utf8 より $KCODE='UTF8' のほうが速い
#! /usr/bin/env ruby
# Line filter for Ruby 1.8 (relies on $KCODE): reads lines via Kernel#gets,
# converts half-width katakana and Latin characters to their full-width
# forms, and prints each converted line.
$KCODE = 'UTF8'
require 'moji'

# Base kana => voiced (dakuon) kana, built from a string of alternating pairs.
DAKUON = Hash[*'ウヴカガキギクグケゲコゴサザシジスズセゼソゾタダチヂツヅテデトドハバヒビフブヘベホボ'.split(//)]
# Base kana => semi-voiced (handakuon) kana, same pair layout.
HANDAK = Hash[*'ハパヒピフプヘペホポ'.split(//)]
DAKUON_ORIGIN = DAKUON.keys.join
HANDAK_ORIGIN = HANDAK.keys.join

# Every byte 0x01..0xFF as a one-byte string, joined.
# NOTE(review): with $KCODE='UTF8' the bytes 0x80..0xFF are not complete
# UTF-8 characters; how Moji and String#tr! treat them was not verified.
LATIN_ORIGIN0 = (0x01..0xFF).map { |x| x.chr }.join
# Same list spelled for String#tr!: the backslash is doubled and '-' is
# escaped so tr! reads both as literal characters, not as syntax.
LATIN_ORIGIN = LATIN_ORIGIN0.sub(/\\/, '\\\\\\\\').sub(/-/, '\\-')
LATIN_ZENKAKU = Moji.han_to_zen(LATIN_ORIGIN0)

KATAKANA_ZENKAKU = 'ァィゥェォャュョッアイウエオカキクケコサシスセソタチツテトナニヌネノハヒフヘホマミムメモヤユヨラリルレロワヲン゛゜'
# Fix: the original passed KATAKANA_HANKAKU to itself, which raises NameError
# because the constant does not exist yet. The half-width list is derived
# from the full-width one.
KATAKANA_HANKAKU = Moji.zen_to_han(KATAKANA_ZENKAKU)

# U+3000 IDEOGRAPHIC SPACE, spelled as UTF-8 bytes so that tools which fold
# character widths cannot silently turn it back into an ASCII space.
ZENKAKU_SPACE = "\343\200\200"

# Fix: the original wrote /[(...)]゛/, a character class containing literal
# parentheses and no capture group. The group must wrap the class so the base
# kana is captured. Built once here instead of on every loop iteration.
DAKUON_RE = /([#{DAKUON_ORIGIN}])゛/
HANDAK_RE = /([#{HANDAK_ORIGIN}])゜/

while line = gets
  # Half-width katakana (and the two voicing marks) -> full-width.
  line.tr!(KATAKANA_HANKAKU, KATAKANA_ZENKAKU)
  # Merge "base kana + voicing mark" into the single precomposed kana.
  # Block form is required: a string argument would be interpolated before
  # gsub! runs, i.e. before $1 refers to the current match.
  line.gsub!(DAKUON_RE) { DAKUON[$1] }
  line.gsub!(HANDAK_RE) { HANDAK[$1] }
  # ASCII space -> full-width space (the original mapped a space to itself).
  line.tr!(' ', ZENKAKU_SPACE)
  line.tr!(LATIN_ORIGIN, LATIN_ZENKAKU)
  print line
end
#! /usr/bin/env perl
# Line filter: reads UTF-8 text from the files named in @ARGV (or STDIN) and
# prints it to STDOUT with half-width katakana and printable ASCII converted
# to their full-width forms. Every output line ends with "\n".
use strict;
use warnings;
use utf8;
use Readonly;
use Encode;
use Unicode::Japanese qw[unijp];

# Base kana => voiced (dakuon) kana, built from a string of alternating pairs.
Readonly my %dakuon => split //,
    'ウヴカガキギクグケゲコゴサザシジスズセゼソゾタダチヂツヅテデトドハバヒビフブヘベホボ';

# Base kana => semi-voiced (handakuon) kana, same pair layout.
Readonly my %handakuon => split //, 'ハパヒピフプヘペホポ';

Readonly my $dakuon_origin        => join q{}, keys %dakuon;
Readonly my $handakuon_origin     => join q{}, keys %handakuon;
Readonly my $keywordlist_encoding => q{utf-8};

# Compiled once: "base kana followed by a separate voicing mark".
my $dakuon_re    = qr/([$dakuon_origin])゛/;
my $handakuon_re = qr/([$handakuon_origin])゜/;

while ( my $line = <> ) {
    $line = decode_utf8 $line;
    chomp $line;

    # Half-width katakana -> full-width katakana.
    # Fix: the original read "trr" (a typo for tr) and its search list was
    # identical to its replacement list, so nothing was ever converted.
    # The search list is written with code points so width-folding tools
    # cannot flatten it again:
    #   U+FF67-FF6F small kana, U+FF71-FF9C A..WA, U+FF66 WO,
    #   U+FF9D N, U+FF9E/FF9F voicing marks.
    $line =~ tr/\x{FF67}-\x{FF6F}\x{FF71}-\x{FF9C}\x{FF66}\x{FF9D}-\x{FF9F}/ァィゥェォャュョッアイウエオカキクケコサシスセソタチツテトナニヌネノハヒフヘホマミムメモヤユヨラリルレロワヲン゛゜/;

    # Merge "base kana + voicing mark" into the single precomposed kana.
    $line =~ s/$dakuon_re/$dakuon{$1}/g;
    $line =~ s/$handakuon_re/$handakuon{$1}/g;

    # These three get hand-picked replacements instead of the arithmetic
    # mapping below. Fix: space now maps to U+3000 IDEOGRAPHIC SPACE; the
    # original mapped it to itself.
    $line =~ tr/ \-~/\x{3000}−〜/;

    # Printable ASCII -> Fullwidth Forms (code point + 0xFEE0).
    # Fix: the original range \x{01}-\x{FF} also shifted control characters
    # and Latin-1 letters into unrelated or unassigned code points
    # (e.g. TAB -> U+FEE9); only U+0021..U+007E have full-width forms.
    $line =~ tr[\x{21}-\x{7E}][\x{FF01}-\x{FF5E}];

    print encode_utf8($line), "\n";
}
ruby 1.8.5 (2006-08-25)
perl, v5.8.8
ruby t.rb < <(head -n1000 x.txt) > /dev/null 0.20s user 0.00s system 82% cpu 0.251 total
perl t.pl < <(head -n1000 x.txt) > /dev/null 0.88s user 0.02s system 95% cpu 0.947 total
まとめ
- utf8フラグを立てたら負け
- オブジェクトを作ったら負け