英語の単語を原形に戻す WordNet-based lemmatizer

nltk の実装を移植する。
http://nltk.googlecode.com/svn/trunk/doc/api/nltk.corpus.reader.wordnet-pysrc.html#WordNetCorpusReader.morphy

使う情報:

  • WordNet の ${WNHOME}/dict/*.exc — 品詞ごとの不規則変化の一覧 (変化形と原形の対応)
  • WordNet の ${WNHOME}/dict/index.* — 品詞ごとの語基 (見出し語) の一覧
  • 品詞ごとの接尾辞ルール (上記 nltk のソースに直接書き込まれている)
#! /usr/bin/env ruby
# -*- coding: utf-8; mode: ruby -*-

# Ported from nltk.corpus.reader.wordnet.morphy
# http://nltk.googlecode.com/svn/trunk/doc/api/nltk.corpus.reader.wordnet-pysrc.html#WordNetCorpusReader.morphy

require 'stringio'

# Core-class extension providing a literal suffix test.
class String
  # Tests whether this string ends with the literal suffix +s+.
  #
  # The suffix is compared character by character: regular-expression
  # metacharacters in +s+ have no special meaning, and a trailing newline in
  # the receiver is not ignored.
  #
  # @param s [String] the suffix to look for
  # @return [Integer, nil] the index at which the suffix starts (truthy), or
  #   nil when the string does not end with +s+
  def endwith(s)
    # Return the start index rather than true/false so that callers which
    # relied on the former `=~`-style result keep working.
    end_with?(s) ? length - s.length : nil
  end
end
# WordNet-based lemmatizer: maps an inflected English word to its base form(s).
#
# Port of nltk.corpus.reader.wordnet.morphy. Candidate lemmas come from
# irregular forms loaded from exception files and from per-part-of-speech
# suffix substitution rules; substitution candidates are only accepted when
# they appear in the loaded word list.
class Lemmatizer
  # Suffix rewrite rules per part of speech, as [old_suffix, new_suffix]
  # pairs. They are tried in the listed order.
  MORPHOLOGICAL_SUBSTITUTIONS = {
    :noun => [['s', ''], ['ses', 's'], ['ves', 'f'], ['xes', 'x'],
              ['zes', 'z'], ['ches', 'ch'], ['shes', 'sh'],
              ['men', 'man'], ['ies', 'y']],
    :verb => [['s', ''], ['ies', 'y'], ['es', 'e'], ['es', ''],
              ['ed', 'e'], ['ed', ''], ['ing', 'e'], ['ing', '']],
    :adj  => [['er', ''], ['est', ''], ['er', 'e'], ['est', 'e']],
    :adv  => []
  }

  # Creates a lemmatizer with empty tables for every part of speech and
  # optionally loads data into them.
  #
  # @param files [Hash, nil] maps a part of speech (a key of
  #   MORPHOLOGICAL_SUBSTITUTIONS) to a pair [word_list, exceptions]; each
  #   element is handed to #load_wordnet_files
  def initialize(files=nil)
    @wordlists = {}
    @exceptions = {}
    MORPHOLOGICAL_SUBSTITUTIONS.each_key do |pos|
      @wordlists[pos] = {}
      @exceptions[pos] = {}
    end
    return unless files

    files.each_pair do |pos, pair|
      load_wordnet_files(pos, pair[0], pair[1])
    end
  end

  # Yields an IO for the given source.
  #
  # An IO or StringIO passed as the first argument is yielded as-is (and is
  # not closed here); anything else is forwarded to File.open, which closes
  # the file when the block returns.
  def open_file(*args)
    source = args[0]
    if source.is_a?(IO) || source.is_a?(StringIO)
      yield source
    else
      File.open(*args) { |io| yield io }
    end
  end

  # Loads the word list and the irregular-form table for one part of speech.
  #
  # Blank lines and lines beginning with whitespace are skipped in both
  # files, so they never register an empty word or a nil entry.
  #
  # @param pos [Symbol] part of speech; must be a key of
  #   MORPHOLOGICAL_SUBSTITUTIONS
  # @param list [String, IO, StringIO] word list; the first
  #   whitespace-separated column of each line is the word
  # @param exc [String, IO, StringIO] exception list; each line is an
  #   inflected form followed by one or more base forms
  def load_wordnet_files(pos, list, exc)
    open_file(list) do |io|
      io.each_line do |line|
        next unless data_line?(line)
        word = line.split.first
        @wordlists[pos][word] = word # TODO: @wordlists and @exceptions can be merged
      end
    end
    open_file(exc) do |io|
      io.each_line do |line|
        next unless data_line?(line)
        inflected, *bases = line.split
        # A line without any base form carries no information.
        next if bases.empty?
        # TODO: the ordering [w][pos] might give better performance
        (@exceptions[pos][inflected] ||= []).concat(bases)
      end
    end
  end

  # Returns the first lemma found for +form+, or +form+ itself when no lemma
  # is found.
  #
  # @param form [String] the (possibly inflected) word
  # @param pos [Symbol] part of speech
  # @return [String]
  def lemma(form, pos)
    each_lemma(form, pos) { |found| return found }
    form
  end

  # Yields every word-list entry reachable from +form+ by applying the suffix
  # rules for +pos+ zero or more times. The same entry may be yielded more
  # than once when several rule chains lead to it.
  def _each_substitutions(form, pos, &block)
    known = @wordlists[pos][form] # check whether form is in wordlist
    yield known if known
    MORPHOLOGICAL_SUBSTITUTIONS[pos].each do |suffix, replacement|
      next unless form.end_with?(suffix)
      stem = form[0, form.length - suffix.length]
      _each_substitutions(stem + replacement, pos, &block)
    end
  end

  # Yields each candidate lemma of +form+: irregular forms first, then the
  # results of the suffix rules. Returns an Enumerator when called without a
  # block.
  #
  # @param form [String] the (possibly inflected) word
  # @param pos [Symbol] part of speech
  def each_lemma(form, pos)
    return to_enum(:each_lemma, form, pos) unless block_given?

    irregular = @exceptions[pos][form] # check irregular inflections
    irregular.each { |base| yield base } if irregular
    if pos == :noun && form.end_with?('ful') # special fix for -ful nouns
      # Lemmatize the part before "-ful" and put the suffix back.
      each_lemma(form[0, form.length - 3], pos) { |base| yield base + 'ful' }
    else
      _each_substitutions(form, pos) { |base| yield base }
    end
  end

  private

  # True when +line+ starts with a non-whitespace character, i.e. it is
  # neither blank nor indented.
  def data_line?(line)
    !(line =~ /\A\S/).nil?
  end
end

(2009-03-08T19:33:14+0900)
上記 nltk の実装は pywordnet に由来しており、さらに元をたどると WordNet 本体が使っているアルゴリズムを移植したものである。
WordNet 側の実装は ${WNHOME}/lib/morph.c にある。
タイムスタンプによると、十年以上変わっていないらしい。

Revision 1.1 91/09/25 15:39:47