zhangbo-NLP-plugin_eng/lib/zhangbo/NLP/sim_word_article.pl
#!/usr/bin/perl
#use JSON::XS;
use Clone qw(clone);
use MongoDB;
use Smart::Comments;
use lib "/home/wuyabo/shell/";
use Conn_mongo_jc;
use Data::Dumper;
use IO::File;
#use Add_info;
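# Reference for the TCP-server variant of this tool (currently disabled):
# http://poe.perl.org/?POE_Cookbook/TCP_Servers
# The POE / POE::Component::Server::TCP code further down is commented out.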
# $hash    : dictionary built from word.txt below, shaped as
#            word => { concept label => weight }.
# $hash_mn : word => running total of the weights added for that word; the
#            txt2arr() sub divides by it to normalise a word's contribution.
# Both are package globals (no "my") because the subs at the end of the file
# read them directly.  Assigning the empty list leaves each scalar undefined;
# the hash references are autovivified on first use.
$hash=();
$hash_mn=();
#if ($ARGV[2]=="sy.txt")
#{
#open(FD, "sy.txt");
# our $sytmp=();
#our $tsy;
#while(<FD>)
#{
#my $line=$_;
# if($line=~/\#(.*)/)
# {
# $tsy=$1;
# if(!exists($sytmp->{$tsy}))
# {
# $sytmp->{$tsy}=$tsy;
# $eat=1;
# print "id=$tsy\n";
# }
# else
# {
# $eat=0;
# }
# }
# elsif($eat==1)
# {
# chomp($line);
# $sytmp->{$line}=$tsy;
# print "-$tsy-$line\n";
# }
#}
#}
#print $sytmp->{'自然'},"------------\n";
# load_sememe_dict(PATH)
#
# Reads a dictionary file and fills the package globals $hash and $hash_mn.
#
# Only two kinds of line are looked at:
#   W_C=<word>        selects the word the following definition belongs to
#   DEF=<definition>  e.g. DEF={Unit|单位:host={information|信息:belong={computer|电脑}}}
#
# The definition is split on '|'.  For every piece:
#   * text up to the first ':'  -> concept label with weight 1
#   * otherwise, text up to the first '}' -> concept label with weight 0.5
#   * pieces containing neither are ignored (e.g. the leading "{Unit").
# Each label is stored as $hash->{word}{label} = weight and the weight is
# added to $hash_mn->{word}.
#
# NOTE(review): the previous inline version also carried a "weight 1" branch
# for '}'-terminated labels, guarded by a counter test that could never be
# true because the counter was incremented just before the test.  The dead
# branch is removed here, so the resulting weights are unchanged.  If the
# intent was to give the first/only label of a definition full weight, that
# is a scoring change that should be made deliberately.
#
# Returns 1 on success, 0 (after a warning) when PATH cannot be opened.
sub load_sememe_dict
{
    my ($path) = @_;
    our ($hash, $hash_mn);

    my $fh;
    unless (open($fh, '<', $path))
    {
        # Non-fatal on purpose: the script used to carry on silently with an
        # empty dictionary; keep running but say why results will be empty.
        warn "cannot open dictionary $path: $!\n";
        return 0;
    }

    my $word;    # word named by the most recent W_C= line
    while (my $line = <$fh>)
    {
        chomp($line);
        if ($line =~ /^W_C=(.*)/)
        {
            $word = $1;
        }
        elsif ($line =~ /^DEF=(.*)/)
        {
            my $def = $1;
            foreach my $piece (split(/\|/, $def))
            {
                my ($label, $weight);
                if ($piece =~ /(.*?)\:/)
                {
                    ($label, $weight) = ($1, 1);
                }
                elsif ($piece =~ /(.*?)\}/)
                {
                    ($label, $weight) = ($1, 0.5);
                }
                else
                {
                    next;
                }
                $hash->{$word}->{$label} = $weight;
                $hash_mn->{$word} += $weight;
            }
        }
    }
    close($fh);
    return 1;
}

load_sememe_dict("word.txt");
#print "server start\n";
#use POE qw(Component::Server::TCP);
#open(FD,">/tmp/sim_s.log");
#my $n=0;
#our $f=0;
#POE::Component::Server::TCP->new(
# Alias => "echo_server",
# Port => 11212,
# ClientInput => sub {
# my ($session, $heap, $input) = @_[SESSION, HEAP, ARG0];
# print "Session ", $session->ID(), " got input: ".$input."\n";
#print "Session ", $session->ID(), " got input: ".length($input)."\n";
# $heap->{client}->put($input);
#$input="坦克,我的希望非常诱人电视里有主持人,那里有大熊猫";
# $input="网络和网民的意见,现在正行驶在通州回北京的高速公路上,十评论员单仁平的文章,标题是“做大众政治焦点,茅于轼的选择”。这篇文章的核心意见是,茅于轼应该做中国社会团结的促进者,不应该";
#$input="致力于宣传 市场万能 剥削有理 汉奸人性 保钓无用 保粮错误替富人说话的茅于轼,今天下午两点在北京海淀翠宫饭店演讲顽强继续。未知海淀区委书记隋振江,宣传部长陈名杰是否到场。外媒问我是否到场,告曰:先参加央视《苦难辉煌》座谈会,或会晚到一会儿。今天要长见识了";
# $input2="茅于轼刚刚吃完胡辣汤和烧饼夹猪头肉。现在正行驶在通州回北京的高速公路上,十五分钟以后要开始腾讯微访谈直播,我和方舟子拟就昨天的热点问题,回答网友的提问。敬请各位网友提示一下,昨天有什么热点问题需要谈一谈?";
# ---------------------------------------------------------------------------
# Command-line driver.
#
# Usage: <script> FILE [WORD,WORD,...]
#
#   FILE   line 0 is the query, written as "word weight,word weight,...";
#          it is turned into a vector by txt2arr_one().  Every following line
#          is a candidate text that is vectorised by txt2arr() and scored
#          against the query with comp_hash().
#   WORDS  optional comma-separated list handed to txt2arr() as the set of
#          words to skip.
#
# Output: for the (at most) 50 best-scoring candidate lines whose score is
# greater than zero, "<score>-<original line>" in descending score order.
# Candidates with equal scores are listed in file order.
#
# The bare block keeps the driver's lexicals out of the subs defined below.
# ---------------------------------------------------------------------------
{
    my ($file, $expword) = @ARGV;
    defined $file
        or die "usage: $0 FILE [WORD,WORD,...]\n";

    # Excluded words, keyed for direct lookup.
    my $exp_hash = {};
    if (defined $expword)
    {
        foreach my $exp (split(",", $expword))
        {
            $exp_hash->{$exp} = 1;
        }
    }

    open(my $in, '<', $file)
        or die "cannot open $file: $!\n";

    my @input_ar;     # raw input lines, indexed by line number
    my %score_of;     # candidate line number => similarity to the query
    my $query_vec;    # vector built from line 0
    my $line_no = 0;

    while (my $input = <$in>)
    {
        push @input_ar, $input;
        if ($line_no == 0)
        {
            $query_vec = { txt2arr_one($input) };
        }
        else
        {
            my %candidate = txt2arr($input, $exp_hash);
            $score_of{$line_no} = comp_hash($query_vec, \%candidate);
        }
        $line_no++;
    }
    close($in);

    # Best score first; the line-number tie-break makes the output stable
    # instead of depending on hash iteration order.
    my @ranked = sort { $score_of{$b} <=> $score_of{$a} || $a <=> $b }
                 keys %score_of;
    splice(@ranked, 50) if @ranked > 50;

    foreach my $i (@ranked)
    {
        # Scores are descending, so nothing after the first non-positive
        # score can qualify.
        last unless $score_of{$i} > 0;
        print $score_of{$i}, "-", $input_ar[$i];
    }
}
# }
# );
#$poe_kernel->run();
# txt2arr(TEXT, EXCLUDED [, PARSER_CMD])
#
# Turns a piece of text into a weighted vector (returned as a flat key/value
# list, i.e. assign the result to a hash).
#
#   TEXT        text to analyse; it is written to a temporary file in /tmp.
#   EXCLUDED    hash reference whose keys are words to skip.
#   PARSER_CMD  optional command used to analyse the file; defaults to
#               /home/lzj/shell/parser-10.pl.  It is run through the shell as
#               "PARSER_CMD <tempfile> 0 50", so it must come from a trusted
#               source.
#
# How the parser output is read (this is all the code relies on; the parser
# itself is not part of this file):
#   * every line containing "==" advances a section counter;
#   * in section 1 a line "word,value" is a single word and its weight;
#   * in section 2 a line "a-b-c,value" is a hyphen-joined phrase; each part
#     receives a third of the value;
#   * everything else is ignored.
#
# A word found in the global dictionary $hash is replaced by its concept
# labels, each receiving value / $hash_mn->{word}; an unknown word is kept
# under its own name.  Excluded words are echoed to STDOUT and skipped.
#
# Returns an empty list (after a warning) if the temporary file cannot be
# created.
sub txt2arr
{
    my ($input_t, $exp_h, $parser_cmd) = @_;
    our ($hash, $hash_mn);

    $exp_h ||= {};
    $parser_cmd = "/home/lzj/shell/parser-10.pl" unless defined $parser_cmd;

    # A uniquely named, exclusively created file instead of "/tmp/" . rand(),
    # which was guessable and opened without any error check.
    require File::Temp;
    my ($fh, $path) = eval { File::Temp::tempfile("sim_word_XXXXXX", DIR => "/tmp") };
    unless (defined $fh)
    {
        warn "txt2arr: cannot create temporary file: $@";
        return ();
    }
    print {$fh} "$input_t\n";
    close($fh) or warn "txt2arr: cannot write $path: $!\n";

    my @res = `$parser_cmd $path 0 50`;
    unlink($path);

    my %vector;
    my $part = 0;    # number of "==" separator lines seen so far
    foreach my $row (@res)
    {
        chomp($row);
        $part++ if $row =~ /==/;
        next unless $part == 1 || $part == 2;
        next unless $row =~ /(.*?),(.*)/;
        my ($w, $value) = ($1, $2);

        if ($part == 1)
        {
            if (exists $exp_h->{$w})
            {
                print "$w\n";
                next;
            }
            if (exists $hash->{$w})
            {
                # "each EXPR" on a reference no longer compiles on Perl 5.24+;
                # dereference explicitly and walk the keys.
                foreach my $label (keys %{ $hash->{$w} })
                {
                    $vector{$label} += $value / $hash_mn->{$w};
                }
            }
            else
            {
                $vector{$w} = $value;
            }
        }
        else
        {
            foreach my $ar_i (split("-", $w))
            {
                # NOTE(review): the exclusion test looks at the whole phrase
                # ($w), not at the individual part ($ar_i).  Kept as it was;
                # confirm whether per-part exclusion was intended.
                if (exists $exp_h->{$w})
                {
                    print "$w\n";
                    next;
                }
                if (exists $hash->{$ar_i})
                {
                    foreach my $label (keys %{ $hash->{$ar_i} })
                    {
                        $vector{$label} += $value / $hash_mn->{$ar_i} / 3;
                    }
                }
                else
                {
                    $vector{$ar_i} += $value / 3;
                }
            }
        }
    }
    return %vector;
}
#print "output = $res\n";
# $heap->{client}->stop();
# $session->stop();
# Start the server.
# comp_hash(H1, H2)
#
# Similarity score between two vectors given as hash references: for every
# key present in both hashes, the two values are added together, and the
# score is the total over all shared keys.  Returns 0 when the hashes share
# no key (or either one is empty / undefined).
sub comp_hash
{
    my ($h1, $h2) = @_;
    $h1 ||= {};
    $h2 ||= {};

    my $score = 0;
    foreach my $key (keys %{$h1})
    {
        next unless exists $h2->{$key};
        $score += $h1->{$key} + $h2->{$key};
    }
    return $score;
}
# ndate([EPOCH])
#
# Returns a local-time timestamp formatted as "YYYY-MM-DD hh:mm:ss".
# EPOCH is optional and defaults to the current time.
#
# localtime() reports the year as an offset from 1900 and the month as 0-11,
# so both are adjusted arithmetically; the previous textual rewrite of the
# year only produced the right answer for 2000-2099.
sub ndate
{
    my ($epoch) = @_;
    $epoch = time() unless defined $epoch;

    my ($sec, $min, $hour, $mday, $mon, $year) = localtime($epoch);
    return sprintf("%04d-%02d-%02d %02d:%02d:%02d",
                   $year + 1900, $mon + 1, $mday, $hour, $min, $sec);
}
# txt2arr_one(SPEC)
#
# Builds a vector from an already weighted word list.  SPEC is a
# comma-separated string of "word weight" items (split at the first space).
# Every word found in the global dictionary $hash contributes its concept
# labels, each set to  dictionary weight * item weight.  Words that are not in
# the dictionary contribute nothing.  An item without a space prints
# "input error" to STDOUT and is skipped.
#
# Returns the vector as a flat key/value list (assign it to a hash).  A second
# argument is accepted for call compatibility but is not used.
#
# NOTE(review): a label shared by several words is overwritten by the last
# word rather than accumulated; confirm whether that is intended.
sub txt2arr_one
{
    my ($spec) = @_;
    our $hash;

    my %vector;
    foreach my $item (split(",", $spec))
    {
        if ($item =~ /(.*?) (.*)/)
        {
            my ($word, $weight) = ($1, $2);
            next unless exists $hash->{$word};

            # "each EXPR" on a reference no longer compiles on Perl 5.24+;
            # dereference explicitly and walk the keys.
            my $labels = $hash->{$word};
            foreach my $label (keys %{$labels})
            {
                $vector{$label} = $labels->{$label} * $weight;
            }
        }
        else
        {
            print "input error\n";
        }
    }
    return %vector;
}