zhangbo-NLP-plugin_eng/lib/zhangbo/NLP/sim2.pl
#!/usr/bin/perl
# Word-to-word distance (similarity) computation.
# Example input words:
#   老虎 (tiger)
#   鲜花 (flowers)
#   月亮 (moon)
#   狮子 (lion)
use POSIX;
#use JSON::XS;
use Clone qw(clone);
use MongoDB;
use Smart::Comments;
use lib "/home/wyb/shell/";
#use Conn_mongo_jc;
use Data::Dumper;
use IO::File;
#use Add_info;
# http://poe.perl.org/?POE_Cookbook/TCP_Servers
# Include POE and POE::Component::Server::TCP.
# ---------------------------------------------------------------------------
# Load the word -> sememe table from word.txt (current working directory).
#
# Two kinds of lines are recognised, everything else is ignored:
#   W_C=<word>        remembers <word> as the owner of the next definition
#   DEF=<definition>  e.g.
#       DEF={Unit|单位:host={information|信息:belong={computer|电脑}}}
#       DEF={tool|用具:modifier={able|能:scope={bring|携带:content={$}}},{listen|听:content={music|音乐},instrument={~}}}
#
# A definition is split on '|'.  In every piece the text in front of the
# first ':', '|' or '}' is taken as a sememe.  Results:
#   $hash->{word}{sememe} = 0.5 for the first sememe of a definition,
#                           0.1 for every later one
#   $hash_mn->{word}      = incremented once per definition whose first
#                           sememe was stored
# The sememe "专" is never stored, but it still occupies its position.
# ---------------------------------------------------------------------------
$hash    = {};
$hash_mn = {};
my $tmp;    # word announced by the most recent W_C= line

# Fail loudly: without the dictionary every similarity would silently be 0.
open(my $word_fh, '<', 'word.txt')
    or die "cannot open word.txt: $!";

while (my $line = <$word_fh>) {
    chomp($line);

    if ($line =~ /^W_C=(.*)/) {
        $tmp = $1;
    }
    elsif ($line =~ /^DEF=(.*)/) {
        my $definition = $1;

        # A definition that precedes any W_C= line has no word to attach to.
        next unless defined $tmp;

        my $position = 0;    # 1-based index among the sememes found
        foreach my $piece (split(/\|/, $definition)) {
            next unless $piece =~ /(.*?)[\:|\}]/;
            my $sememe = $1;
            $position++;
            next if $sememe eq "专";

            if ($position == 1) {
                $hash->{$tmp}->{$sememe} = 0.5;
                $hash_mn->{$tmp} += 1;
            }
            else {
                $hash->{$tmp}->{$sememe} = 0.1;
            }
        }
    }
}
close($word_fh);
#- {event|事件}
# ├ {static|静态} {event|事件}
# │ ├ {relation|关系} {static|静态}
# │ │ ├ {isa|是非关系} {relation|关系}
# │ │ │ ├ {be|是} {isa|是非关系:isa={*},relevant={*}};{isa|是非关系:descriptive={*},relevant={*}}
# │ │ │ │ ├ {become|成为} {be|是:isa={*},relevant={*}};{be|是:descriptive={*},relevant={*}}
# │ │ │ │ └ {mean|指代} {be|是:isa={*},relevant={*}}
# │ │ │ └ {BeNot|非} {isa|是非关系:isa={*},relevant={*}};{isa|是非关系:descriptive={*},relevant={*}}
# │ │ ├ {possession|领属关系} {relation|关系}
# ---------------------------------------------------------------------------
# Load the event taxonomy from evt.txt (current working directory).
#
# Every usable line is a tree-drawing prefix followed by one or more
# "{english|中文...}" groups, for example
#     - {event|事件}
#      ├ {static|静态} {event|事件}
# The depth of a line is ceil((length(prefix) - 2) / 4), with a minimum of 1.
#
# $level_stat keeps one counter per depth; counters deeper than the current
# line are dropped whenever the tree moves back up.  A snapshot of these
# counters is stored for each word found on the line:
#   $word_hash->{word}      snapshot for the first word of the line
#   $word_exp_hash->{word}  snapshot for every further word of the line
# A word seen again later overwrites its earlier snapshot.
# ---------------------------------------------------------------------------
$level_stat    = {};
$word_hash     = {};
$now_sub_level = 0;
$olddeep       = 1;

# Fail loudly: without the taxonomy no depth-based score can be computed.
open(my $evt_fh, '<', 'evt.txt')
    or die "cannot open evt.txt: $!";

while (my $entry = <$evt_fh>) {
    # Lines without a '{' carry no node; skipping them keeps stale or
    # undefined captures from bumping the depth-1 counter.
    next unless $entry =~ /^(.*?)\{(.*)/;
    my ($prefix, $exp) = ($1, $2);

    my $deep = ceil((length($prefix) - 2) / 4);
    $deep = 1 if $deep < 1;
    $level_stat->{$deep}++;

    # Moving back up the tree: forget the counters of the deeper levels.
    if ($olddeep > $deep) {
        delete $level_stat->{ $_ + 1 } for $deep .. $olddeep;
    }
    $olddeep = $deep;

    my $begin = 0;    # number of words found on this line so far
    foreach my $piece (split(/\|/, $exp)) {
        next unless $piece =~ /(.*?)[\:|\}]/;
        my $word = $1;
        $begin++;

        # $level_stat is a flat hash of counters, so a shallow copy is a
        # complete, independent snapshot.
        my $snapshot = { %{$level_stat} };
        if ($begin == 1) {
            $word_hash->{$word} = $snapshot;
        }
        else {
            $word_exp_hash->{$word} = $snapshot;
        }
    }
}
close($evt_fh);
#print "server start\n";
#use POE qw(Component::Server::TCP);
#open(FD,">/tmp/sim_s.log");
#my $n=0;
#our $f=0;
#POE::Component::Server::TCP->new(
# Alias => "echo_server",
# Port => 11212,
# ClientInput => sub {
# my ($session, $heap, $input) = @_[SESSION, HEAP, ARG0];
# print "Session ", $session->ID(), " got input: ".$input."\n";
#print "Session ", $session->ID(), " got input: ".length($input)."\n";
# $heap->{client}->put($input);
#$input="坦克,我的希望非常诱人电视里有主持人,那里有大熊猫";
# $input="网络和网民的意见,现在正行驶在通州回北京的高速公路上,十评论员单仁平的文章,标题是“做大众政治焦点,茅于轼的选择”。这篇文章的核心意见是,茅于轼应该做中国社会团结的促进者,不应该";
#$input="致力于宣传 市场万能 剥削有理 汉奸人性 保钓无用 保粮错误替富人说话的茅于轼,今天下午两点在北京海淀翠宫饭店演讲顽强继续。未知海淀区委书记隋振江,宣传部长陈名杰是否到场。外媒问我是否到场,告曰:先参加央视《苦难辉煌》座谈会,或会晚到一会儿。今天要长见识了";
# $input2="茅于轼刚刚吃完胡辣汤和烧饼夹猪头肉。现在正行驶在通州回北京的高速公路上,十五分钟以后要开始腾讯微访谈直播,我和方舟子拟就昨天的热点问题,回答网友的提问。敬请各位网友提示一下,昨天有什么热点问题需要谈一谈?";
# Entry point: look up the sememes of the two words given on the command
# line, score their similarity and print the score on its own line.
($w1, $w2) = @ARGV[0, 1];
$f     = 0;
$n     = 0;
$thash = undef;
my @input_ar;

# Sememe => weight tables for each word (empty when the word is unknown).
%my_hash  = txt2arr($w1, $word_exp_hash);
%my_hash2 = txt2arr($w2, $word_exp_hash);

$n = comp_hash(\%my_hash, \%my_hash2);
print "$n\n";
# }
# );
#$poe_kernel->run();
# txt2arr($word [, $ignored])
#
# Returns the sememe => weight pairs recorded for $word in the package-level
# dictionary $hash, as a flat list suitable for assigning to a hash.  An
# unknown or undefined word yields an empty list.  The second argument is
# accepted for compatibility with existing callers and is not used.
#
# $w and $tmp_hash are deliberately kept as package variables: other subs
# in this file (same_syna, anto_sy) read or write them.
sub txt2arr
{
    our ($hash, $w, $tmp_hash);

    $w        = $_[0];
    $tmp_hash = {};

    if (defined($w) && exists($hash->{$w})) {
        # Copy through an explicit %{...} dereference: applying each/keys
        # to a bare hash reference is rejected by Perl 5.24 and later.
        $tmp_hash = { %{ $hash->{$w} } };
    }

    return (%{$tmp_hash});
}
#print "output = $res\n";
# $heap->{client}->stop();
# $session->stop();
# Start the server.
# comp_hash(\%sememes1, \%sememes2)
#
# Scores the similarity of two words from their sememe => weight tables
# (only the keys are used).  The score is the sum of two parts:
#
#   * overlap part: 0.5 when the tables share at least one key, reduced by
#     0.12 * (fraction of entries that are not shared); 0 when nothing is
#     shared;
#   * taxonomy part: the best cdeep() score over all key pairs (k1, k2)
#     found in the package-level taxonomy tables.  A pair looked up in
#     $word_hash on both sides counts fully; a pair mixing $word_hash and
#     $word_exp_hash is scaled by 0.8.
#
# Side effect: prints "<overlap>+<taxonomy>-0\n" on STDOUT before returning
# (kept because existing consumers of the output may rely on it).
# The input hashes are only read, so no copy is made.
sub comp_hash
{
    our ($word_hash, $word_exp_hash);

    my ($h1, $h2) = @_;
    $h1 ||= {};
    $h2 ||= {};

    # ---- overlap part ---------------------------------------------------
    my $size1  = keys %{$h1};
    my $size2  = keys %{$h2};
    my $shared = grep { exists $h2->{$_} } keys %{$h1};

    my $akv = 0;
    if ($shared > 0) {
        # Every shared key accounts for two entries, one per table.
        my $vtotal = 2 * $shared;
        $akv = 0.5;
        $akv = $akv - (0.12 * (($size1 + $size2 - $vtotal) / ($size1 + $size2)));
    }

    # ---- taxonomy part ---------------------------------------------------
    my $my_deep = 0;
    for my $k (keys %{$h1}) {
        my $existk    = exists($word_hash->{$k});
        my $existexpk = exists($word_exp_hash->{$k});
        next unless $existk || $existexpk;

        for my $k2 (keys %{$h2}) {
            my $existk2    = exists($word_hash->{$k2});
            my $existexpk2 = exists($word_exp_hash->{$k2});
            next unless $existk2 || $existexpk2;

            my @scores;
            push @scores, cdeep($word_hash->{$k}, $word_hash->{$k2})
                if $existk && $existk2;
            push @scores, cdeep($word_hash->{$k}, $word_exp_hash->{$k2}) * 0.8
                if $existk && $existexpk2;
            push @scores, cdeep($word_exp_hash->{$k}, $word_hash->{$k2}) * 0.8
                if $existexpk && $existk2;

            for my $score (@scores) {
                $my_deep = $score if $score > $my_deep;
            }
        }
    }

    # Penalty term; the code that used to raise it is disabled, so it is 0.
    my $def = 0;

    print "$akv+$my_deep-$def\n";
    return $akv + $my_deep - $def;
}
# cdeep(\%levels1, \%levels2)
#
# Compares two taxonomy snapshots (depth => counter, depths starting at 1)
# and returns a similarity weight.  The snapshots are compared depth by
# depth from 1 upwards; $sum is the number of leading depths whose counters
# are equal.  $sum is then adjusted and mapped to the result:
#
#   one snapshot entirely matched (its size equals $sum):
#       same size        -> 3.5
#       fewer than 4     -> 3.6
#       otherwise        -> 4
#   $sum >= 6            -> 4.6
#   result: 1 -> 0, 2 -> 0.01, 3 -> 0.1, 4 -> 0.18, anything else 1/(8-$sum)
#
# NOTE(review): $sum == 0 falls into the last case and gives 0.125, which is
# higher than the result for 1..3 matching depths - confirm this is intended.
#
# Side effect: prints one " cdeep input N <depth> <counter>" line per entry
# of each snapshot on STDOUT.  The inputs are only read, so no copy is made.
sub cdeep
{
    my ($h1, $h2) = @_;
    $h1 ||= {};
    $h2 ||= {};

    my $a_h1 = 0;
    for my $level (keys %{$h1}) {
        $a_h1++;
        print " cdeep input 1 $level $h1->{$level}\n";
    }
    my $a_h2 = 0;
    for my $level (keys %{$h2}) {
        $a_h2++;
        print " cdeep input 2 $level $h2->{$level}\n";
    }

    my $min = $a_h1 < $a_h2 ? $a_h1 : $a_h2;

    # Length of the common prefix; a missing depth compares as 0.
    my $sum = 0;
    for my $level (1 .. $min) {
        last unless ($h1->{$level} || 0) == ($h2->{$level} || 0);
        $sum++;
    }

    # One snapshot is completely contained in the other.
    if (($sum == $a_h1) || ($sum == $a_h2)) {
        if ($a_h1 == $a_h2) {
            $sum = 3.5;    # identical
        }
        elsif ($sum < 4) {
            $sum = 3.6;
        }
        else {
            $sum = 4;      # one is a descendant of the other
        }
    }
    if ($sum >= 6) {
        $sum = 4.6;
    }

    # Shallow relations get small fixed weights.
    my $res;
    if    ($sum == 1) { $res = 0; }
    elsif ($sum == 2) { $res = 0.01; }
    elsif ($sum == 3) { $res = 0.1; }
    elsif ($sum == 4) { $res = 0.18; }
    else              { $res = 1 / (8 - $sum); }

    return $res;
}
# ndate()
#
# Returns the current local time as "YYYY-MM-DD HH:MM:SS" with zero-padded
# fields.  localtime() reports the year as an offset from 1900 and the
# month as 0..11, so both are corrected arithmetically.
sub ndate
{
    my ($sec, $min, $hour, $mday, $mon, $year) = localtime(time());
    return sprintf('%04d-%02d-%02d %02d:%02d:%02d',
        $year + 1900, $mon + 1, $mday, $hour, $min, $sec);
}
# same_syna(@words)
#
# Merges the sememe => weight pairs of every listed word that is present in
# the package-level dictionary $hash into the package-level hash reference
# $tmp_hash; later words overwrite the weights set by earlier ones.
#
# Returns the stringified package variables $syna and $n as a two-element
# list.  Nothing in this sub assigns either of them.
sub same_syna
{
    our ($hash, $tmp_hash, $syna, $n);

    foreach my $w (@_)
    {
        next unless exists($hash->{$w});

        # each() needs a real hash: passing the bare reference is rejected
        # by Perl 5.24 and later.
        while (my ($k, $v) = each(%{ $hash->{$w} }))
        {
            $tmp_hash->{$k} = $v;
        }
    }
    return "$syna","$n";
}
# same_father - placeholder with no implementation yet.
# Returns nothing (empty list, or undef in scalar context).
sub same_father
{
    return;
}
# one_of_all - placeholder with no implementation yet.
# Returns nothing (empty list, or undef in scalar context).
sub one_of_all
{
    return;
}
# lennovo - placeholder with no real logic yet.
# Hands its argument list straight back to the caller (the argument count
# in scalar context).
sub lennovo
{
    return @_;
}
# anto_sy()
#
# Takes no arguments.  Reads the package variable $w and, when that word is
# present in the package-level dictionary $hash, copies its
# sememe => weight pairs into the package-level hash reference $tmp_hash.
# Returns nothing; the original had no defined return value.
sub anto_sy
{
    our ($hash, $w, $tmp_hash);

    if (exists($hash->{$w}))
    {
        # each() needs a real hash: passing the bare reference is rejected
        # by Perl 5.24 and later.
        while (my ($k, $v) = each(%{ $hash->{$w} }))
        {
            $tmp_hash->{$k} = $v;
        }
    }
    return;
}