#!/usr/bin/perl # Copyright (c) 1998, 1999 Bruce Martin # Usage: # 1. Set $old_path and $new_path appropriately. # 2. Comment or uncomment &debug. # 3. Run map > map.out require 5.000; # For hard references. use Search::Dict; # For binary search. # Decrease the cache size if you run out of memory. $cache_size = 100000; # Define the paths to the wordnet files. $pos = "verb"; $old_path = "/home/bruce/proj/wn/wordnet/dict"; $old_data = "$old_path/data.$pos"; $new_path = "/home/bruce/proj/wn/wordnet-1.6/dict"; $new_data = "$new_path/data.$pos"; $new_index = "$new_path/index.$pos"; # Open the files for random access. open( OLD_DATA, $old_data ) or die "Can't open $old_data"; open( NEW_DATA, $new_data ) or die "Can't open $new_data"; open( NEW_INDEX, $new_index ) or die "Can't open $new_index"; # Loop through the old synsets sequentially. @ARGV = ( $old_data ); while( <> ) { next unless /^(\d{8})/; $old_synset = &cache( "synset", OLD_DATA, $1 ); $new_synset = &best_match( $old_synset ); print "$old_synset->{id}\t$new_synset->{id}\n"; # &debug; } ############################################################################### ##### Call the given subroutine or return its cached results. sub cache { my( $sub, $file, $key ) = @_; if( defined $cache{ $sub, $file, $key } ) { # Return the cached result. $cache{ $sub, $file, $key }; } else { # Delete an entry if the cache is full. delete $cache{ each %cache } if $cache_size-- < 0; # Call the subroutine and cache its result. $cache{ $sub, $file, $key } = &$sub( $file, $key ); } } ##### Read and parse a synset from a data file. sub synset { my( $data, $id ) = @_; my( $hex, $count, $synonyms, @synonyms, @parents, $gloss ); # Read the synset from the data file. seek( $data, $id, 0 ); $_ = <$data>; # Parse the synonyms. ( $hex ) = /^.{13} (\w\w)/; $count = hex $hex; ( $synonyms ) = /^.{16} ((\S+ \S+ ){$count})/; @synonyms = split / \S+ /, $synonyms; # Parse the parents. @parents = /@ (\d{8}) . \d{4} /g; # Parse the gloss. ($gloss) = /\|\s*(.*)/; # Return a reference to a hash of the synset. +{ id, $id, synonyms, \@synonyms, parents, \@parents, gloss, $gloss }; } ##### Find the synsets a word belongs to. sub synset_ids { my( $index, $word ) = @_; my( @ids ); # Look up the word in the index. look *$index, $word, 0, 1; $_ = <$index>; # If it's found, return a reference to an array of its synsets. if( /^$word /i ) { @ids = / (\d{8})/g; \@ids; } } ##### Find the best match for an old synset. sub best_match { my( $old_synset ) = @_; my( $similarity, $best_similarity, $best_match ); # Loop through the potential matches. foreach $new_synset_id ( &new_synset_ids( $old_synset ) ) { $new_synset = &cache( "synset", NEW_DATA, $new_synset_id ); # Calculate the similarity of the old and new synsets. $similarity = &synonym_similarity( $old_synset, $new_synset ) + &gloss_similarity( $old_synset, $new_synset ) + &parent_similarity( $old_synset, $new_synset ) + &id_similarity( $old_synset, $new_synset ); # Store the best match. if( $similarity > $best_similarity ) { $best_similarity = $similarity; $best_match = $new_synset; } } $best_match; } ##### List the new synsets that might match the old synset. sub new_synset_ids { my( $old_synset ) = @_; my( %new_synset_ids ); # Loop through the old synonyms. foreach $word ( @{$old_synset->{synonyms}} ) { # Look up the new synsets for each old synonym. foreach $new_synset_id ( @{&cache( "synset_ids", NEW_INDEX, $word )} ) { $new_synset_ids{ $new_synset_id }++; } } ( keys %new_synset_ids ); } ##### Calculate the similarity of the synonyms of two synsets. sub synonym_similarity { my( $old_synset, $new_synset ) = @_; my( @old_words, @new_words, %words, $similarity ); # Make a hash of the old synonyms. @old_words = @{$old_synset->{synonyms}}; foreach $word ( @old_words ) { $words{ $word }++; } # Count the number of new synonyms in the hash. @new_words = @{$new_synset->{synonyms}}; foreach $word ( @new_words ) { $similarity++ if $words{ $word }--; } # Return the number of matches / the highest number possible. $similarity / &max( scalar @old_words, scalar @new_words ); } ##### Calculate the similarity of the gloss in two synsets. sub gloss_similarity { my( $old_synset, $new_synset ) = @_; my( @old_words, @new_words, %words, $similarity ); # Make a hash of the old gloss words (and synonyms because some old # synonyms were moved to new gloss). @old_words = split /[\W_]+/, "@{$old_synset->{synonyms}} $old_synset->{gloss}"; foreach $word ( @old_words ) { $words{ $word }++; } # Count the number of new gloss words in the hash. @new_words = split /[\W_]+/, "@{$new_synset->{synonyms}} $new_synset->{gloss}"; foreach $word ( @new_words ) { $similarity++ if $words{ $word }--; } # Return the number of matches / the highest number possible. # 1.5 * $similarity / &max( scalar @old_words, scalar @new_words ); $similarity / &max( scalar @old_words, scalar @new_words ); } ##### Return the largest argument. sub max { (sort { $b <=> $a } @_)[0]; } ##### Calculate the similarity of the parents of two synsets. sub parent_similarity { my( $old_synset, $new_synset ) = @_; my( $old_parent, $new_parent, $similarity ); # Look up each old parent and new parent. foreach $old_parent_id ( @{$old_synset->{parents}} ) { $old_parent = &cache( "synset", OLD_DATA, $old_parent_id ); foreach $new_parent_id ( @{$new_synset->{parents}} ) { $new_parent = &cache( "synset", NEW_DATA, $new_parent_id ); # Sum their similarities. $similarity += &synonym_similarity( $old_parent, $new_parent ) + &gloss_similarity( $old_parent, $new_parent ) + &id_similarity( $old_parent, $new_parent ); } } # Return their similarity, or 1 if neither has parents, or 0 if only one # has parents. if( @{$old_synset->{parents}} ) { $similarity / 2 / @{$old_synset->{parents}}; } elsif( !@{$new_synset->{parents}} ) { 1; } } ##### Calculate the similarity of the ids of two synsets. sub id_similarity { my( $old_synset, $new_synset ) = @_; # Store the ratio of the file sizes. unless( defined $id_scale ) { $id_scale = (-s $old_data) / (-s $new_data); $old_start = 19262; $new_start = 20681; } # Return the similarity of the relative file positions of the synsets. 500 / ( abs( ($old_synset->{id} - $old_start) - ($new_synset->{id} - $new_start) * $id_scale ) + 1000 ); } ##### Print debugging output. sub debug { $isim = sprintf "%5.3f", &id_similarity( $old_synset, $new_synset ); $ssim = sprintf "%5.3f",&synonym_similarity( $old_synset, $new_synset ); $gsim = sprintf "%5.3f", &gloss_similarity( $old_synset, $new_synset ); $psim = sprintf "%5.3f", &parent_similarity( $old_synset, $new_synset ); print <{id} syns @{$old_synset->{synonyms}} gloss $old_synset->{gloss} parents @{$old_synset->{parents}} $isim $new_synset->{id} $ssim @{$new_synset->{synonyms}} $gsim $new_synset->{gloss} $psim @{$new_synset->{parents}} ############################################################################### END }