XHTML を XML に逆変換
#! /usr/bin/env perl use warnings; use strict; use WWW::Mechanize; use WWW::Mechanize::Link; use URI::URL; use URI::file; use URI::Escape; use Getopt::Long; use Pod::Usage; use XML::Simple; use Data::Dumper; use Encode; use encoding qw/utf8/; use open OUT => ':utf8'; use open qw/:std/; $XML::Simple::PREFERRED_PARSER = 'XML::Parser'; sub xml_in_decoding($$) { my($xml_simple, $http_response) = @_; my $content = $http_response->content; if ( join(' ',$http_response->headers->content_type) =~ m/charset=(\w+)/ ) { $content = decode($1, $content); } $content =~ s{"Shift_JIS"}{"UTF-8"}; #tidy { my $tmp = '/tmp/tidyresult'; { open my $f, '>:encoding(utf-8)', $tmp or die "$!: $tmp"; print $f $content; close $f; } system qq{tidy -modify -utf8 $tmp 2>/dev/null}; { open my $f, '<:encoding(utf-8)', $tmp or die "$!: $tmp"; $content = join '', <$f>; close $f; } } return $xml_simple->xml_in($content); } # get arguments my $wait_seconds = 3; my $verbose = 0; my $input_encoding = ''; GetOptions( 'wait=i' => \$wait_seconds, 'verbose' => \$verbose, 'encoding=s' => \$input_encoding, 'help' => sub{pod2usage(0)} ); my $url = shift @ARGV; my $m = WWW::Mechanize->new(); my $xs = XML::Simple->new(ForceArray => 1, KeyAttr => []); # get a village my %village; $m->get($url); # assuming this as prologue exit 1 unless ( $m->success ); { my @a; foreach my $link ( $m->find_all_links( url_regex => qr{_progress_\d+} ) ) { $link->url =~ m/_progress_(\d+)/; my $n = $1; next if defined $a[$n]; print STDERR "waiting $wait_seconds seconds before retrieving ".$link->url."...\n" if $verbose; sleep $wait_seconds; $m->follow_link(url => $link->url); $a[$n] = xml_in_decoding($xs, $m->response); } $village{progresses} = \@a; } print STDERR "waiting $wait_seconds seconds before retrieving 'party'...\n" if $verbose; if ( $m->follow_link( url => q{_party_} ) ) { sleep $wait_seconds; $village{epilogue} = $m->ct; } print $xs->xml_out(\%village); #my $log = Parse::RecDescent->new($grammar)->parse($text); #print Dumper($log); __END__ =head1 NAME ninjin_crawl.pl - Jinro BBS crawler =head1 SYNOPSIS crawler.pl [options] URL =head1 OPTIONS --encoding URL auto-detect if not specified --help shows this help --wait waiting time between requests [3] =head1 DESCRIPTION ninjin_crawl.pl retrieves and parse a whole log of the villlage specified in the argument. And put the result to the standard output as a XML. =head1 SEE ALSO L<WWW::Mechanize> =cut