XHTML を XML に逆変換

#! /usr/bin/env perl
use warnings;
use strict;
use WWW::Mechanize;
use WWW::Mechanize::Link;
use URI::URL;
use URI::file;
use URI::Escape;
use Getopt::Long;
use Pod::Usage;
use XML::Simple;
use Data::Dumper;
use Encode;
use encoding qw/utf8/;
use open OUT => ':utf8';
use open qw/:std/;
$XML::Simple::PREFERRED_PARSER = 'XML::Parser';

sub xml_in_decoding($$) {
  my($xml_simple, $http_response) = @_;
  my $content = $http_response->content;
  
  if ( join(' ',$http_response->headers->content_type) =~ m/charset=(\w+)/ ) {
    $content = decode($1, $content);
  }
  $content =~ s{"Shift_JIS"}{"UTF-8"};

  #tidy
  {
    my $tmp = '/tmp/tidyresult';
    {
      open my $f, '>:encoding(utf-8)', $tmp or die "$!: $tmp";
      print $f $content;
      close $f;
    }
    system qq{tidy -modify -utf8 $tmp 2>/dev/null};
    {
      open my $f, '<:encoding(utf-8)', $tmp or die "$!: $tmp";
      $content = join '', <$f>;
      close $f;
    }
  }
  return $xml_simple->xml_in($content);
}

# get arguments
my $wait_seconds = 3;
my $verbose = 0;
my $input_encoding = '';
GetOptions(
           'wait=i'     => \$wait_seconds,
           'verbose'    => \$verbose,
           'encoding=s' => \$input_encoding,
           'help'       => sub{pod2usage(0)}
          );
my $url = shift @ARGV;

my $m = WWW::Mechanize->new();
my $xs = XML::Simple->new(ForceArray => 1,
                          KeyAttr => []);

# get a village
my %village;
$m->get($url); # assuming this as prologue
exit 1  unless ( $m->success );
{
  my @a;
  foreach my $link ( $m->find_all_links( url_regex => qr{_progress_\d+} ) ) {
    $link->url =~ m/_progress_(\d+)/;
    my $n = $1;
    next if defined $a[$n];
    print STDERR "waiting $wait_seconds seconds before retrieving ".$link->url."...\n" if $verbose;
    sleep $wait_seconds;
    $m->follow_link(url => $link->url);
    $a[$n] = xml_in_decoding($xs, $m->response);
  }
  $village{progresses} = \@a;
}

print STDERR "waiting $wait_seconds seconds before retrieving 'party'...\n" if $verbose;
if ( $m->follow_link( url => q{_party_} ) ) {
  sleep $wait_seconds;
  $village{epilogue} = $m->ct;
}
print $xs->xml_out(\%village);

#my $log = Parse::RecDescent->new($grammar)->parse($text);
#print Dumper($log);

__END__

=head1 NAME

  ninjin_crawl.pl - Jinro BBS crawler

=head1 SYNOPSIS

  crawler.pl [options] URL

=head1 OPTIONS

  --encoding      URL
                  auto-detect if not specified
  --help          shows this help
  --wait          waiting time between requests [3]

=head1 DESCRIPTION

  ninjin_crawl.pl retrieves and parse a whole log
  of the villlage specified in the argument.
  And put the result to the standard output as a XML.

=head1 SEE ALSO

L<WWW::Mechanize>

=cut