zhangbo-NLP-plugin_eng/lib/zhangbo/NLP/engine_dis.pl
#!/usr/bin/perl
use Encode;
#use JSON::XS;
use Clone qw(clone);
use MongoDB;
use Smart::Comments;
use lib "/home/wuyabo/shell/";
use Conn_mongo_jc;
use Data::Dumper;
use IO::File;
#use Add_info;
# http://poe.perl.org/?POE_Cookbook/TCP_Servers
# Include POE and POE::Component::Server::TCP.
# Shared lookup tables. Each starts undefined and is autovivified into a
# hash reference by the first sub that stores into it.
# $hash is filled by all_word_id / all_word_id2 (id -> word) and read by
# word_class_value, txt2arr and txt2arr_one.
our $hash=();
# $hash_mn (package global) and $hash_sc are initialised here; neither is
# referenced again in the part of the file visible to this review.
$hash_mn=();
my $hash_sc=();
use POE qw(Component::Server::TCP);
# word -> class name -> weight. Written by word_class_value and
# word_class_value2, read by search_article. It must stay declared before
# those subs so that they all share this one file-scoped lexical.
my $w_c_v;
# Load the dictionary from allword.txt, then build the weights from ./class.
all_word_id2();
word_class_value();
# NOTE(review): this unconditional exit ends the program here, so the call to
# word_class_value2() below and the POE server set up later in the file never
# run. It looks like a leftover batch/debug switch -- confirm before removing.
exit;
word_class_value2();
# Export the dictionary held in MongoDB to allword.txt and mirror it in the
# global $hash.
#
# Every document returned by find() on the handle created with
# Conn_mongo_jc->new("dic", "sogoudic_c") (presumably database and
# collection -- confirm against Conn_mongo_jc) contributes one entry:
# its 'word' field is encoded to UTF-8, stored as $hash->{id} = word, and
# written to allword.txt as a "word<TAB>id" line, which is the layout that
# all_word_id2 reads back.
#
# Takes no arguments and has no meaningful return value.
# Dies when allword.txt cannot be created or completely written.
sub all_word_id
{
    my $col    = Conn_mongo_jc->new("dic", "sogoudic_c");
    my $cursor = $col->find();

    # Declared as a list on purpose: the former "my $id,$word;" only made
    # $id lexical and silently used a package global for $word.
    my ($id, $word);

    open(my $dic_fh, '>', "allword.txt")
        or die "all_word_id: cannot write allword.txt: $!";

    while (my $one = $cursor->next) {
        $id   = $one->{'id'};
        $word = encode("utf-8", $one->{'word'});
        $hash->{$id} = $word;
        print {$dic_fh} $word, "\t", $id, "\n";
    }

    # Buffered write errors only surface at close time.
    close($dic_fh)
        or die "all_word_id: cannot finish writing allword.txt: $!";

    # Reports the last entry processed (both values are empty when the
    # cursor returned no documents).
    print "loading dic ok $word $id\n";
}
# Load word/class weights from the file "log" in the current directory into
# the shared $w_c_v table.
#
# Each line is expected to hold at least three tab-separated fields:
#   <word> TAB <class> TAB <value>
# The first two fields are matched non-greedily, so everything after the
# second tab (further tabs included, trailing newline excluded) becomes the
# value: $w_c_v->{word}->{class} = value. Lines with fewer than two tabs are
# ignored.
#
# Takes no arguments. Dies if "log" cannot be opened, matching the
# "opendir ... || die" style used by word_class_value.
sub word_class_value2
{
    open(my $log_fh, '<', "log")
        or die "word_class_value2: cannot read log: $!";

    while (my $line = <$log_fh>) {
        if ($line =~ /(.*?)\t(.*?)\t(.*)/) {
            $w_c_v->{$1}->{$2} = $3;
        }
    }

    close($log_fh);
}
# Load the dictionary from allword.txt (in the current directory) into the
# global $hash.
#
# Each usable line has the form "<word> TAB <id>", the layout written by
# all_word_id, and is stored as $hash->{id} = word. The word is everything
# before the first tab; the id is the rest of the line without the newline.
#
# Lines that contain no tab are skipped. Previously the capture variables
# were used without checking the match, so such a line stored whatever $1/$2
# happened to hold (typically an empty key with an undefined value).
#
# Takes no arguments. Dies if allword.txt cannot be opened.
sub all_word_id2
{
    open(my $dic_fh, '<', "allword.txt")
        or die "all_word_id2: cannot read allword.txt: $!";

    while (my $line = <$dic_fh>) {
        next unless $line =~ /(.*?)\t(.*)/;
        $hash->{$2} = $1;
    }

    close($dic_fh);
}
# Build the word -> classifier -> weight table ($w_c_v) from the model files
# stored under ./class/<dir>/.
#
# For every entry of ./class whose name is at least 3 characters long:
#   1. the classifier document whose _id is that directory name is fetched
#      and its classes are recorded in the package global $hash_c_i as
#      $hash_c_i->{classifier name}->{class name}->{class id} = 0;
#   2. __MAXENT_MAPPING.model is read to build $w_v (index -> word id);
#   3. __MAXENT_PARAMS.model is read and, for each "<index> <n> <weight>"
#      line, $w_c_v->{word}->{classifier name} is set to the weight
#      (negated when <n> is 2).
#
# Takes no arguments. Prints progress and error text to STDOUT.
# Dies if ./class cannot be opened, and calls exit (ending the whole program)
# on malformed mapping data or on ids that cannot be resolved.
sub word_class_value
{
my $col = Conn_mongo_jc->new("classifier", "classifiers");
my $some_dir="./class";
opendir(my $dh, $some_dir) || die;
while(readdir $dh) {
my $dir=$_;
# Skips "." and ".." (and any other name shorter than 3 characters).
if(length($dir)<3)
{next;}
# The directory name is used as the document's _id.
my $id2 = MongoDB::OID->new(value=>$dir);
my $site=$col->find_one({"_id"=> $id2});
my $name=$site->{'name'};
$name=encode('utf-8',$name);
print $name,"name $dir \n";
# Register every class of this classifier. $k and $v are package globals.
while(($k,$v)=each(%{$site->{'classes'}}))
{
# while(($k1,$v1)=each($k))
# {
my $cn=encode('utf-8',$v->{'name'});
my $cnid=$v->{'id'};
print "$k ---- ",$cn, $cnid,"\n";
# }
$hash_c_i->{$name}->{$cn}->{$cnid}=0;
}
# $has1 / $has2 record whether the two expected labels were seen in the
# mapping file; $start becomes 1 once the second section of that file begins.
my $has1=0;
my $has2=0;
my $start=0;
# NOTE(review): this outer $ok is never read; the "my $ok" inside the block
# below shadows it and is the one that is tested later.
my $ok=0;
if((-e "./class/$dir/__MAXENT_PARAMS.model")&&(-e "./class/$dir/__MAXENT_MAPPING.model"))
{
my $ok=1;
#open(FD,"./class/$dir/__MAXENT_PARAMS.model");
# The mapping file is read through the shell ("cat"), one array element per line.
my @clm=`cat ./class/$dir/__MAXENT_MAPPING.model`;
# NOTE(review): only $ct1 is made lexical by this statement; neither
# variable is used afterwards.
my $ct1,$ct2;
# index (third number of a mapping line) -> word id (first number).
my $w_v;
foreach my $tpl(@clm)
{
chomp($tpl);
my $ltpl=length($tpl);
# print "tpl $tpl ",$ltpl,"\n";
# Label line "<digits> <digit>": the first number is looked up as a class id.
if($tpl=~/^(\d+) (\d)$/)
{
my $y_id=$1;
my $tid=$2;
# The literal keys below are class names in Chinese ("yes", "no" and
# "negative" respectively) as stored in $hash_c_i above.
if(exists($hash_c_i->{$name}->{"是"}->{$y_id}))
{
print "exists $name $y_id $tid $dir 是\n";
$has1=1;
}
elsif(exists($hash_c_i->{$name}->{"否"}->{$y_id}))
{
$has2=1;
print "exists $name $y_id $tid $dir 否\n";}
# A classifier that has a "negative" class counts as having both labels.
elsif(exists($hash_c_i->{$name}->{"负面"})){$has1=1;$has2=1;}
# NOTE(review): the lexical $cnid from the class loop above is out of scope
# here, so this $cnid is a package variable -- presumably empty; confirm.
else {print "error $cnid $2 $name $y_id $dir\n";$ok=0;;}
}
#else{print length($tpl),"length\n";}
# A line that is exactly one character long separates the label section from
# the feature section; parsing only continues if both labels were seen.
elsif( $ltpl==1 )
{
print "find \n";
if(($has1==1)&&($has2==1))
{$start=1;}
else{last;}
}
# Feature line "<digits>:<digits> <digits>": remember first number by third.
elsif($start==1)
{
# NOTE(review): as above, only $wid is lexical; $twid is a package variable.
my $wid,$twid;
if($tpl=~/^(\d+)\:(\d+) (\d+)$/)
{
$wid=$1;
$twid=$3;
$w_v->{$twid}=$wid;
#print "$wid $twid\n";
}
else{print "error map data $tpl\n";exit;}
}
# Any other line before the feature section has started ends the parse.
else
{last;}
}
# Skip this directory when a label could not be matched to a known class.
if($ok==0){next;}
# NOTE(review): second "my @clm" / "my $ct1" in the same scope; they mask
# the declarations made for the mapping file above.
my @clm=`cat ./class/$dir/__MAXENT_PARAMS.model`;
my $ct1,$ct2;
# These lines are not chomped; the pattern's "$" still matches before the
# trailing newline.
foreach my $tpl(@clm)
{
# Parameter line "<digits> <digit> <rest>".
if($tpl=~/^(\d+?) (\d) (.*)$/)
{
my $v1=$1;
my $v2=$2;
my $v3=$3;
if(exists($w_v->{$v1}))
{
# $tmpid is a package global: the word id mapped to index $v1.
$tmpid=$w_v->{$v1};
my $add=$v2;
my $ccv=$v3;
# The weight is negated when the second field is 2.
if($add==2)
{$ccv=-$ccv;}
# Translate the word id back to the word via the dictionary in $hash.
if(exists($hash->{$tmpid}))
{
my $orgw=$hash->{$tmpid};
print "$orgw $name $ccv\n";
$w_c_v->{$orgw}->{$name}=$ccv;;
}
else
{print "error no id $tmpid\n";exit;}
}
else
{print "error no match mapping $v1 $v2 $v3\n";
print Data::Dumper->Dump([%{$w_v}]);
exit;}
}
}
}
else{print "error no module $dir $name\n";}
#l __MAXENT_PARAMS.model__MAXENT_MAPPING.model
}
closedir $dh;
}
my $n=0;
#our $f=0;
print "server start\n";

# TCP front end listening on port 11212.
#
# Every input record received from a client is handed to search_article as
# the name of a file to score. The scores are written to a new
# /tmp/<random>.log through the bareword handle FD_log, the path of that log
# is sent back as the reply, and the connection is then shut down.
POE::Component::Server::TCP->new(
    Alias => "echo_server",
    Port  => 11212,
    ClientInput => sub {
        my ($heap, $input) = @_[HEAP, ARG0];

        my $randout  = rand();
        my $log_path = "/tmp/$randout.log";

        # FD_log has to remain a bareword handle: search_article prints to
        # it by name.
        unless (open(FD_log, '>', $log_path)) {
            # Do not hand out the path of a log that was never created;
            # just drop the connection.
            warn "cannot write $log_path: $!\n";
            $_[KERNEL]->yield("shutdown");
            return;
        }

        my @res = search_article($input);

        # Close (and thereby flush) the log BEFORE telling the client where
        # it is. Without this the buffered results only reached the disk
        # when the next request reopened FD_log, so a client could read an
        # empty or truncated file.
        close(FD_log) or warn "cannot finish writing $log_path: $!\n";

        $heap->{client}->put("$log_path\n");
        $_[KERNEL]->yield("shutdown");
        return;
    },
    ClientDisconnected => sub {
        #print "Client disconnected\n"; # log it
    }
    #, ClientFlushed => sub {
    # my $data_source = $_[HEAP]{file_handle};
    # my $read_count = sysread($data_source, my $buffer = "", 65536);
    # if ($read_count) {
    # $_[HEAP]{client}->put($buffer);
    # }
    # else {
    # print FD_log "------------------error\n";
    # $_[KERNEL]->yield("shutdown");
    # }
    #}
);
POE::Kernel->run;
#$input="坦克,我的希望非常诱人电视里有主持人,那里有大熊猫";
# $input="网络和网民的意见,现在正行驶在通州回北京的高速公路上,十评论员单仁平的文章,标题是“做大众政治焦点,茅于轼的选择”。这篇文章的核心意见是,茅于轼应该做中国社会团结的促进者,不应该";
#$input="致力于宣传 市场万能 剥削有理 汉奸人性 保钓无用 保粮错误替富人说话的茅于轼,今天下午两点在北京海淀翠宫饭店演讲顽强继续。未知海淀区委书记隋振江,宣传部长陈名杰是否到场。外媒问我是否到场,告曰:先参加央视《苦难辉煌》座谈会,或会晚到一会儿。今天要长见识了";
# $input2="茅于轼刚刚吃完胡辣汤和烧饼夹猪头肉。现在正行驶在通州回北京的高速公路上,十五分钟以后要开始腾讯微访谈直播,我和方舟子拟就昨天的热点问题,回答网友的提问。敬请各位网友提示一下,昨天有什么热点问题需要谈一谈?";
#$file=$ARGV[0];
# Score every line of a text file against the word/class weights in $w_c_v
# and print the result to FD_log.
#
# Argument:
#   $_[0]  path of the input file; a trailing newline and surrounding
#          whitespace are removed.
#
# Steps:
#   1. Copy the input to /tmp/<rand>.sctxt. Each line is prefixed with its
#      1-based line number, cut to at most 500 bytes, and has "@...:" and
#      "@...<whitespace>" spans replaced by a blank.
#   2. Run the external scws program on that copy; its output goes to
#      /tmp/<rand>.sctxt2 (presumably one space-separated token list per
#      input line -- the code below splits it on single spaces).
#   3. For each output line, add up per class the weights of all tokens
#      found in $w_c_v and print
#          <first token>-<total>:<class>TAB<total>:<class>TAB...
#      to FD_log, highest total first.
#
# FD_log must have been opened by the caller; it is not closed here.
# Both temporary files are removed.
#
# Returns the result of the final unlink, or 0 when the input file or the
# temporary copy could not be opened (a warning is printed in that case).
sub search_article
{
    # $file and the $exp* variables are package globals in the original
    # code; they are still assigned so that anything reading them keeps
    # working.
    $file = $_[0];
    chomp($file);
    # The former two-argument open ignored surrounding whitespace in the
    # name; keep accepting such input now that the name is taken literally.
    $file =~ s/^\s+|\s+$//g;

    # Legacy: comma-separated words from the second command-line argument
    # are recorded in $exp_hash. Nothing in this sub reads them.
    $expword  = $ARGV[1];
    $exp_hash = ();
    @exp_w    = split(",", $expword);
    foreach my $exp_word (@exp_w) {
        $exp_hash->{$exp_word} = 1;
    }

    my $rand     = rand();
    my $raw_path = "/tmp/$rand.sctxt";
    my $seg_path = "/tmp/$rand.sctxt2";

    # Three-argument open: the name comes from the network client, and in
    # the two-argument form a name starting with "&" would have been
    # treated as a file-descriptor duplication request rather than a path.
    my $in_fh;
    unless (open($in_fh, '<', $file)) {
        warn "search_article: cannot read $file: $!\n";
        return 0;
    }
    my $raw_fh;
    unless (open($raw_fh, '>', $raw_path)) {
        warn "search_article: cannot write $raw_path: $!\n";
        close($in_fh);
        return 0;
    }

    my $line_no = 1;
    while (my $text = <$in_fh>) {
        chomp($text);
        # NOTE(review): this is a byte-based cut; if the input is UTF-8 it
        # can split a multi-byte character. Kept as in the original.
        $text = substr($text, 0, 500) if length($text) > 500;
        $text =~ s/\@.*?\:/ /g;
        $text =~ s/\@.*?\s/ /g;
        print {$raw_fh} "$line_no ", $text, "\n";
        $line_no++;
    }
    close($in_fh);
    close($raw_fh) or warn "search_article: cannot finish $raw_path: $!\n";

    # List form: same arguments as before, but no shell is involved.
    my $status = system(
        "/home/wuyabo/xs/bin/scws", "-I", "-E", "-N",
        "-d", "/home/wuyabo/xs/etc/dict.utf8.xdb",
        "-r", "/home/wuyabo/xs/etc/rules.utf8.ini",
        "-c", "utf-8", $raw_path, "-o", $seg_path,
    );
    warn "search_article: scws failed (status $status)\n" if $status != 0;
    unlink($raw_path);

    if (open(my $seg_fh, '<', $seg_path)) {
        while (my $seg_line = <$seg_fh>) {
            chomp($seg_line);

            # Split on every single space (not on runs of whitespace), so
            # empty leading fields are kept exactly as before.
            my @tokens = split(/ /, $seg_line);

            my %total_for;
            foreach my $token (@tokens) {
                next unless exists($w_c_v->{$token});
                my $weights = $w_c_v->{$token};
                foreach my $class (keys %{$weights}) {
                    $total_for{$class} += $weights->{$class};
                }
            }

            print FD_log $tokens[0], "-";
            my @ranked = sort { $total_for{$b} <=> $total_for{$a} } keys %total_for;
            foreach my $class (@ranked) {
                print FD_log $total_for{$class} . ":$class\t";
            }
            print FD_log "\n";
        }
        close($seg_fh);
    }
    else {
        warn "search_article: cannot read $seg_path: $!\n";
    }

    return unlink($seg_path);
}
# }
# );
#$poe_kernel->run();
# Turn a space-separated token line into a hash of dictionary entries.
#
# Arguments:
#   $_[0]  the text; it is split on runs of whitespace. The first field is
#          treated as a line identifier and is not looked up.
#   $_[1]  accepted but not used.
#
# Side effects: the raw text is echoed to FD_log and the package global @res
# is emptied.
#
# Returns a hash reference that maps every remaining token found in
# $hash_all to a clone of $hash->{token}.
# NOTE(review): nothing in the visible part of this file ever fills
# $hash_all, so the result appears to be always empty -- confirm whether
# $hash was meant.
sub txt2arr
{
    my ($input_t, $exp_h) = @_;

    print FD_log "$input_t\n";
    @res = ();

    # The leading field is the line identifier; only the rest are words.
    my ($line_id, @words) = split(" ", $input_t);

    my %picked;
    for my $word (@words) {
        $picked{$word} = clone($hash->{$word}) if exists($hash_all->{$word});
    }

    return \%picked;
}
#print "output = $res\n";
# $heap->{client}->stop();
# $session->stop();
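# NOTE: the server itself is created and started earlier in this file (POE::Component::Server::TCP->new ... POE::Kernel->run).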
# Compute an overlap score between two key => numeric-weight hashes.
#
# Arguments:
#   $_[0], $_[1]  hash references (an undefined argument is treated as an
#                 empty hash).
#
# Each hash's weights are summed, starting from 0.01 so that the sums can
# never be zero. For every key present in BOTH hashes:
#   kv  = (w1 + w2) / (sum1 + sum2)
#   kv2 = (w1 / sum1 + w2 / sum2) / 2
# and the mean of kv and kv2 is added to the score.
#
# All entries and every shared key are also printed to FD_log, which is
# expected to be open already.
#
# Returns the accumulated score (0 when the hashes share no key).
#
# All working variables are lexical: the earlier version used package
# globals ($k, $v, $akv, $h1, ...) that other subs in this file also use as
# loop variables, and relied on the state of each() iterators.
sub comp_hash
{
    my ($h1, $h2) = @_;
    $h1 = {} unless defined $h1;
    $h2 = {} unless defined $h2;

    my %seen_in;    # key -> number of input hashes that contain it (1 or 2)
    my $vall  = 0.01;
    my $vall2 = 0.01;

    for my $key (keys %{$h1}) {
        $seen_in{$key}++;
        $vall += $h1->{$key};
        print FD_log " hash 1$key $h1->{$key}\n";
    }
    for my $key (keys %{$h2}) {
        $seen_in{$key}++;
        print FD_log " hash 2$key $h2->{$key}\n";
        $vall2 += $h2->{$key};
    }

    my $akv = 0;
    for my $key (keys %seen_in) {
        next unless $seen_in{$key} >= 2;

        my $kv  = ($h1->{$key} + $h2->{$key}) / ($vall + $vall2);
        my $kv2 = (($h1->{$key} / $vall) + ($h2->{$key} / $vall2)) / 2;
        my $kv3 = ($kv + $kv2) / 2;

        print FD_log "double $key $seen_in{$key} kv $kv kv2 $kv2\n";
        $akv += $kv3;
    }

    return $akv;
}
# Format a point in time as "YYYY-MM-DD HH:MM:SS" in the local time zone.
#
# Argument (optional):
#   $_[0]  epoch seconds; defaults to the current time, which is what the
#          original zero-argument call produced.
#
# Returns the formatted string.
#
# The year is computed as localtime's year field + 1900. The earlier
# s/1(.*)/20$1/ rewrite of that field was only right for 2000-2099, and the
# sub also overwrote the package globals $sec, $min, $hour, ... on each call.
sub ndate
{
    my ($when) = @_;
    $when = time() unless defined $when;

    my ($sec, $min, $hour, $mday, $mon, $year) = localtime($when);

    return sprintf(
        "%04d-%02d-%02d %02d:%02d:%02d",
        $year + 1900, $mon + 1, $mday, $hour, $min, $sec
    );
}
# Expand a "word weight,word weight,..." string into a flat key/value list.
#
# Arguments:
#   $_[0]  comma-separated entries, each "<word> <weight>" (split at the
#          first space).
#   $_[1]  accepted but not used.
#
# For every well-formed entry:
#   * if $hash->{word} is a hash reference, each of its keys is mapped to
#     the entry's weight;
#   * the word itself is mapped to the word (NOTE(review): not to the
#     weight -- kept as in the original; confirm that this is intended).
# An entry without a space prints "input error" to STDOUT and is skipped.
#
# Returns the resulting hash as a flat list (empty when nothing matched).
#
# The inner loop used to be written as each($hash->{$w}); calling each() on
# a scalar expression was an experimental feature that Perl 5.24 removed, so
# the file no longer compiled on current perls. The hash is now dereferenced
# explicitly, and only when the entry really is a hash reference. Working
# variables are lexical instead of the package globals used before.
sub txt2arr_one
{
    my ($w, $exp_h) = @_;

    my %weight_for;
    foreach my $entry (split(",", $w)) {
        if ($entry =~ /(.*?) (.*)/) {
            my ($word, $vh) = ($1, $2);

            my $related = $hash->{$word};
            if (ref($related) eq 'HASH') {
                foreach my $key (keys %{$related}) {
                    $weight_for{$key} = $vh;
                }
            }

            # Assigned last so that it wins if the word is also one of its
            # own related keys.
            $weight_for{$word} = $word;
        }
        else {
            print "input error\n";
        }
    }

    return %weight_for;
}