use utf8 より $KCODE='UTF8' のほうが速い

#! /usr/bin/env ruby
# Filter: reads lines from ARGF and prints each one converted to full-width
# (zenkaku) text.  Per line, in order:
#   1. half-width katakana        -> full-width katakana
#   2. base kana + standalone ゛/゜ -> the single precomposed voiced kana
#   3. ASCII space                -> ideographic space (U+3000)
#   4. bytes 0x01..0xFF           -> whatever Moji.han_to_zen maps them to
# Written for Ruby 1.8, where $KCODE makes String#tr / Regexp multibyte-aware.
$KCODE='UTF8'
require 'moji'

# Lookup tables built from flat "key value key value ..." strings:
# base kana => kana with dakuten / handakuten.
DAKUON = Hash[*('ウヴカガキギクグケゲコゴサザシジスズセゼソゾタダチヂツヅテデトドハバヒビフブヘベホボ'.split(//))]
HANDAK = Hash[*('ハパヒピフプヘペホポ'.split(//))]
DAKUON_ORIGIN = DAKUON.keys.join
HANDAK_ORIGIN = HANDAK.keys.join

# Every single-byte value except NUL, used as the "from" set of the final tr.
LATIN_ORIGIN0 = (0x01..0xFF).map{|x| x.chr}.join
# Same set with the backslash and the hyphen escaped, because String#tr
# treats both as metacharacters in its "from" argument.
LATIN_ORIGIN  = LATIN_ORIGIN0.sub(/\\/, '\\\\\\\\').sub(/-/, '\\-')
LATIN_ZENKAKU = Moji.han_to_zen(LATIN_ORIGIN0)

KATAKANA_ZENKAKU = 'ァィゥェォャュョッアイウエオカキクケコサシスセソタチツテトナニヌネノハヒフヘホマミムメモヤユヨラリルレロワヲン゛゜'
# Derived from the full-width list (the original referenced itself here, which
# raises NameError).  Presumably Moji.zen_to_han maps these 1:1 so both tr
# arguments stay aligned -- verify against the moji gem.
KATAKANA_HANKAKU = Moji.zen_to_han(KATAKANA_ZENKAKU)

# U+3000 built from its code point so the literal cannot be silently
# flattened into an ASCII space by an editor or text normalizer.
ZENKAKU_SPACE = [0x3000].pack('U')

# Compiled once instead of once per line.  The capture group wraps the
# character class; parentheses *inside* the brackets would be literal members.
DAKUON_PATTERN = /([#{DAKUON_ORIGIN}])゛/
HANDAK_PATTERN = /([#{HANDAK_ORIGIN}])゜/

while gets
  $_.tr!(KATAKANA_HANKAKU, KATAKANA_ZENKAKU)
  # Block form: the replacement must be computed per match, after $1 is set.
  # A string argument would be evaluated once, before gsub! even runs.
  $_.gsub!(DAKUON_PATTERN) { DAKUON[$1] }
  $_.gsub!(HANDAK_PATTERN) { HANDAK[$1] }
  $_.tr!(' ', ZENKAKU_SPACE)
  $_.tr!(LATIN_ORIGIN, LATIN_ZENKAKU)
  print $_
end
#! /usr/bin/env perl
# Filter: reads UTF-8 lines from <> and prints each one converted to
# full-width (zenkaku) text.  Per line, in order:
#   1. half-width katakana        -> full-width katakana
#   2. base kana + standalone ゛/゜ -> the single precomposed voiced kana
#   3. space, '-', '~'            -> U+3000, U+2212, U+301C
#   4. U+0001..U+00FF             -> same code point shifted by 0xFEE0
# Input is decoded to character strings and re-encoded on output, so tr///
# and the regexes operate on characters rather than bytes.
use strict;
use warnings;
use utf8;
use Readonly;
use Encode;
use Unicode::Japanese qw[unijp];    # imported by the original; not used below

# Lookup tables built from flat "key value key value ..." strings:
# base kana => kana with dakuten / handakuten.
Readonly my %dakuon => split(//, 'ウヴカガキギクグケゲコゴサザシジスズセゼソゾタダチヂツヅテデトドハバヒビフブヘベホボ');
Readonly my %handakuon => split(//, 'ハパヒピフプヘペホポ');
# The keys joined into one string, for interpolation into a character class.
Readonly my $dakuon_origin => join q{}, keys %dakuon;
Readonly my $handakuon_origin => join q{}, keys %handakuon;

while (my $line = <>) {
  $line = decode_utf8 $line;
  chomp $line;

  # Half-width katakana -> full-width.  The source side is written as code
  # point escapes (U+FF67-FF6F small kana, U+FF71-FF9C ア..ワ, U+FF66 ヲ,
  # U+FF9D ン, U+FF9E/FF9F voicing marks) so that it survives width
  # normalization of this file; as literals it had collapsed into a copy of
  # the target list, turning the statement into a no-op.
  $line =~ tr/\x{FF67}-\x{FF6F}\x{FF71}-\x{FF9C}\x{FF66}\x{FF9D}-\x{FF9F}/ァィゥェォャュョッアイウエオカキクケコサシスセソタチツテトナニヌネノハヒフヘホマミムメモヤユヨラリルレロワヲン゛゜/;

  # Merge a base kana followed by a standalone voicing mark.
  $line =~ s/([$dakuon_origin])゛/$dakuon{$1}/g;
  $line =~ s/([$handakuon_origin])゜/$handakuon{$1}/g;

  # These three are handled before the blanket shift below, which would
  # otherwise send them to U+FF00 (unassigned), U+FF0D and U+FF5E.
  $line =~ tr/ \-~/\x{3000}−〜/;

  # Shift every remaining U+0001..U+00FF character by 0xFEE0.
  $line =~ tr[\x{01}-\x{FF}][\x{FEE1}-\x{FFDF}];

  print encode_utf8($line), "\n";
}

ruby 1.8.5 (2006-08-25)
perl, v5.8.8

ruby t.rb < <(head -n1000 x.txt) > /dev/null 0.20s user 0.00s system 82% cpu 0.251 total
perl t.pl < <(head -n1000 x.txt) > /dev/null 0.88s user 0.02s system 95% cpu 0.947 total

まとめ

  • utf8フラグを立てたら負け
  • オブジェクトを作ったら負け