zhangbo-NLP-plugin_eng/lib/zhangbo/NLP/eng_search_gouwei.pl
#!/usr/bin/perl
use Socket;
use IO::Handle;
use IO::Select;
use IO::Socket::INET;
use lib "./";
use plugin_eng;
use POE qw(Component::Server::TCP);
#use JSON::XS;
use Clone qw(clone);
use Encode;
#use MongoDB;
use Smart::Comments;
#use Conn_mongo_jc;
use Data::Dumper;
#use IO::File;
#use Add_info;
our @rewrite_g;
our @rewrite_l;
our @rewrite_n;
our %rewrite_all;
our $rewrite_all;
our $tfidf;
our @inputb_ar;
our @inputb_ar4;
our $input_ys;
#my $sel = IO::Select->new($socket); ##建立select对象
#system("./test_files2.py 1>/dev/null 2>/dev/null &");
#system("./post_server.pl &");
# Load the stop-word list (one word per line) into the shared lookup table
# $hash_st, which the dictionary loaders and tokenizers consult to drop
# unwanted words.  A missing or unreadable stop.txt is reported on STDERR and
# leaves the table empty instead of being ignored silently.
our $hash_st;
if (open(my $stop_fh, '<', "stop.txt")) {
    while (my $line = <$stop_fh>) {
        chomp($line);
        $hash_st->{$line} = 1;
    }
    close($stop_fh);
}
else {
    warn "cannot open stop.txt: $!\n";
}
# http://poe.perl.org/?POE_Cookbook/TCP_Servers
# Include POE and POE::Component::Server::TCP.
# Shared lookup tables.  Each one is assigned an empty list, which leaves the
# scalar undefined; the nested hashes are autovivified on first write.
# $hash: main index, filled by the dictionary loaders below and by load_gg.
our $hash=();
# $hash_bk: reverse index written by load_dic_back and read by get_w_rela.
our $hash_bk=();
# $hash_mn: per-word accumulator updated while 0word.txt is parsed.
$hash_mn=();
# $hash_sc: per-word statistics.  The loader that used to fill it is commented
# out just below, so it stays empty in this file.
my $hash_sc=();
#open(FD_sc, "dict.utf8.txt");
#open(FD_log, ">/tmp/engine.txt");
#while(<FD_sc>)
#{
#招收学员 13.87 8.50 n
# my $line=$_;
# if($line=~/(.*?)\t(.*?)\t(.*?)\t(.*)/)
# {
# $tt1=$1;
# $tt2=$2;
# $tt3=$3;
# $tt4=$4;
# $hash_sc->{$tt1}->{"i"}=$tt2;
# $hash_sc->{$tt1}->{"h"}=$tt3;
# $hash_sc->{$tt1}->{"a"}=$tt4;
#
# }
#}
#system("pwd");
#system("ls ../");
#print "@INC\n";
#print "$INC[1]\n";
#if(! -e "./word.txt")
#{print "dic file found ./\n";
#};
#if(! -e "$INC[1]/word.txt")
#{print "dic file found /\n";
#};
#if(! -e "../word.txt")
#{print "dic file found ../\n";
#};
#if(! -e "/home/lzj/shell2/word.txt")
#{print "dic file found /home/lzj/shell2/\n";
#};
# Load the synonym file 0sy.txt into $hash.
# A line containing '#' starts a group; the text after the '#' is the group
# head.  Every following line is a member that maps to the head with weight
# 0.2 ($hash->{member}->{head}).  Only the first group for a given head is
# used: when a head is seen again, its member lines are skipped.  Heads and
# members of three bytes or fewer are ignored.
my $sytmp;
if (open(my $sy_fh, '<', "0sy.txt")) {
    my $head;         # head of the group currently being read
    my $take = 0;     # true while the members of a first-seen head are read
    while (my $line = <$sy_fh>) {
        if ($line =~ /\#(.*)/) {
            $head = $1;
            if (!exists($sytmp->{$head})) {
                $sytmp->{$head} = $head;
                $take = 1;
            }
            else {
                $take = 0;
            }
        }
        elsif ($take == 1) {
            chomp($line);
            next if length($head) <= 3;
            next if length($line) <= 3;
            $hash->{$line}->{$head} = 0.2;
        }
    }
    close($sy_fh);
}
else {
    warn "cannot open 0sy.txt: $!\n";
}
# Load the concept dictionary 0word.txt into $hash and $hash_mn.
#
# The file alternates "W_C=<word>" lines with "DEF=<definition>" lines, e.g.
#   DEF={Unit|单位:host={information|信息:belong={computer|电脑}}}
# The definition is split on '|'.  From each piece the text before the first
# ':' (or, failing that, before the first '}') is taken as a concept and
# stored as $hash->{word}->{concept} = weight.
#
# Weights: 0.1 when the concept has no entry in $hash_sc.  Otherwise the
# "h" value of that entry is used, unscaled for the first stored concept of a
# definition and scaled afterwards (x0.2 for ':' pieces, x0.5 for '}' pieces);
# a result below 0.01 is replaced by a floor (0.1 for ':' pieces, 0.2 for '}'
# pieces).  $hash_mn->{word} grows by 0.1 per concept, or by 0.2 for a
# non-first '}' piece.
#
# Words of three bytes or fewer, stop words, and a fixed list of very generic
# concepts are skipped.
my $headword;
my %generic_concept = map { $_ => 1 }
    ("专", "功能词", "人", "事情", "时间", "特定", "部件", "地方");
if (open(my $word_fh, '<', "0word.txt")) {
    while (my $entry = <$word_fh>) {
        chomp($entry);
        if ($entry =~ /^W_C=(.*)/) {
            $headword = $1;
            next;
        }
        next unless $entry =~ /^DEF=(.*)/;
        my $definition = $1;
        next if length($headword) <= 3;
        next if exists($hash_st->{$headword});

        my $stored = 0;    # number of concepts already stored for this DEF
        foreach my $piece (split('\|', $definition)) {
            my ($concept, $scale, $floor, $step);
            if ($piece =~ /(.*?)\:/) {
                $concept = $1;
                $scale   = ($stored == 0) ? 1 : 0.2;
                $floor   = 0.1;
                $step    = 0.1;
            }
            elsif ($piece =~ /(.*?)\}/) {
                $concept = $1;
                $scale   = ($stored == 0) ? 1 : 0.5;
                $floor   = 0.2;
                $step    = ($stored == 0) ? 0.1 : 0.2;
            }
            else {
                next;
            }
            next if exists($hash_st->{$concept});
            next if $generic_concept{$concept};

            my $weight = 0.1;
            if (exists($hash_sc->{$concept})) {
                $weight = $hash_sc->{$concept}->{"h"};
                $weight = $weight * $scale if $scale != 1;
                $weight = $floor if $weight < 0.01;
            }
            $hash->{$headword}->{$concept} = $weight;
            $hash_mn->{$headword} += $step;
            $stored++;
        }
    }
    close($word_fh);
}
else {
    warn "cannot open 0word.txt: $!\n";
}
# Merge the relation file 0babel_all_path_fin20.txt into $hash.
#
# Each line is tab separated: the first field is a word, every further field
# is "related:weight".  Words shorter than three bytes and stop words are
# skipped.  A relation already present in $hash is never overwritten.  When
# the word already had an entry in $hash before this line was read, the new
# weights are scaled by 0.3; otherwise they are stored as read.
if (open(my $babel_fh, '<', "0babel_all_path_fin20.txt")) {
    while (my $record = <$babel_fh>) {
        my ($name, @pairs) = split("\t", $record);
        next if !defined($name) || length($name) < 3;
        my $known = exists($hash->{$name}) ? 1 : 0;
        next if exists($hash_st->{$name});
        foreach my $pair (@pairs) {
            next unless $pair =~ /(.*?)\:(.*)/;
            my ($related, $weight) = ($1, $2);
            next if exists($hash->{$name}->{$related});
            $hash->{$name}->{$related} = $known ? $weight * 0.3 : $weight;
        }
    }
    close($babel_fh);
}
else {
    warn "cannot open 0babel_all_path_fin20.txt: $!\n";
}
# Re-read one user's dictionary file.
#   $_[0]  path of the file to load
#   $_[1]  user key the entries are stored under
# Prints a "reload <path>" line and delegates the work to load_gg, whose
# return value is passed through.
sub reloaddic
{
    my ($path, $user) = @_;
    print "reload $path\n";
    return load_gg($path, $user);
}
# Load a tab-separated statistics file into the global $tfidf table.
#   $_[0]  path of the file; each line is "word<TAB>tf<TAB>idf[<TAB>...]"
# For every word three values are stored: {"t"} (second field), {"i"}
# (third field) and {"ti"} (their product).
# Lines are chomped so the last field never carries a trailing newline.
# An unreadable file is reported on STDERR and leaves $tfidf untouched.
sub load_tfidf
{
    my ($path) = @_;
    my $fh;
    if (!open($fh, '<', $path)) {
        warn "load_tfidf: cannot open $path: $!\n";
        return;
    }
    while (my $line = <$fh>) {
        chomp($line);
        my @ta = split("\t", $line);
        next if !@ta || $ta[0] eq '';    # blank line
        $tfidf->{$ta[0]}->{"t"}  = $ta[1];
        $tfidf->{$ta[0]}->{"i"}  = $ta[2];
        $tfidf->{$ta[0]}->{"ti"} = $ta[2] * $ta[1];
    }
    close($fh);
    return;
}
# Load a relation dictionary into $hash, overwriting existing weights.
#   $_[0]  path; each line is "word<TAB>related:weight<TAB>related:weight..."
#   $_[1]  per-word limit; relations are taken until the count exceeds this
#          value, so at most limit+1 relations are stored per line
#   $_[2]  scale factor applied to every weight (0.5 when missing or <= 0)
# Relations whose target is a stop word are skipped and do not count towards
# the limit.  Prints "load <path> ok" on success; an unreadable file is
# reported on STDERR and nothing is loaded.
sub load_dic
{
    my ($path, $lsize, $factor) = @_;
    my $zoomin = ($factor > 0) ? $factor : 0.5;
    my $fh;
    if (!open($fh, '<', $path)) {
        warn "load_dic: cannot open $path: $!\n";
        return;
    }
    while (my $record = <$fh>) {
        my ($name, @pairs) = split("\t", $record);
        my $kept = 0;
        foreach my $pair (@pairs) {
            next unless $pair =~ /(.*?)\:(.*)/;
            my ($related, $weight) = ($1, $2);
            next if exists($hash_st->{$related});
            last if $kept > $lsize;
            $kept++;
            $hash->{$name}->{$related} = $weight * $zoomin;
        }
    }
    close($fh);
    print "load $path ok\n";
}
# Build the reverse index $hash_bk from a relation dictionary.
#   $_[0]  path; each line is "word<TAB>related:weight<TAB>related:weight..."
#   $_[1]  per-word limit; relations are taken until the count exceeds this
#          value, so at most limit+1 relations are indexed per line
#   $_[2]  accepted for signature compatibility with load_dic; not used
# For every accepted relation $hash_bk->{related}->{word} is set to 1.
# Relations whose target is a stop word are skipped.  Prints
# "load <path> ok" on success; an unreadable file is reported on STDERR.
sub load_dic_back
{
    my ($path, $lsize) = @_;
    my $fh;
    if (!open($fh, '<', $path)) {
        warn "load_dic_back: cannot open $path: $!\n";
        return;
    }
    while (my $record = <$fh>) {
        my ($name, @pairs) = split("\t", $record);
        my $kept = 0;
        foreach my $pair (@pairs) {
            next unless $pair =~ /(.*?)\:(.*)/;
            my $related = $1;
            next if exists($hash_st->{$related});
            last if $kept > $lsize;
            $kept++;
            $hash_bk->{$related}->{$name} = 1;
        }
    }
    close($fh);
    print "load $path ok\n";
}
#close FD_log;
#load_tfidf("dict.utf8.txt");
#load_dic("babel_all_path_fin20.txt",30,1);
#load_dic_back("babel_all_path_fin20.txt",1000,1);
#load_dic("base_total.txt",10,0.5);
#load_dic("babel_add.txt",10,0.6);
#load_dic("名词.txt",10,0.5);
#load_dic("shangpin.txt",20,0.7);
#load_dic("动词.txt",20,0.6);
#load_dic("more.txt",20,1);
#load_dic("zw.txt",10,0.2);
#reloaddic();
#load_rewrite();
our @inputdic;
#=`cat $ARGV[0]`;
our $input_gg;
our $input_gg_exp;
#all user gghash
# Startup indexing: load every /mnt/sdb/shell2/<user>/ME_file, using the
# directory name between "shell2/" and "/ME" as the user key.
foreach my $me_path (glob("/mnt/sdb/shell2/*/ME_file"))
{
    next unless $me_path =~ /shell2\/(.*?)\/ME/;
    my $owner = $1;
    load_gg($me_path, $owner);
}
#load_gg("hsadd.txt");
# Index one user's dictionary file for search_article.
#   $_[0]  path of the file
#   $_[1]  user key
#
# Lines starting with '#' are skipped.  Every other line is stored verbatim
# in $inputdic->{user}[j] (j counts stored lines from 0).  The text before the
# first space is decoded from UTF-8 and tokenized exactly like the query in
# search_article: consecutive ASCII letters / '|' / digit characters form one
# token, every other character is a token of its own.  Each distinct token t
# of line j gets $hash->{user}->{t}->{j} = 1 / (number of tokens on the line).
# Lines without a space are stored but contribute no tokens.
#
# The previous entries of $inputdic->{user} are dropped first; $hash->{user}
# is not cleared here.  Returns the result of close(), or nothing (after a
# warning on STDERR) when the file cannot be opened.
sub load_gg
{
    my ($fg, $user) = @_;
    delete($inputdic->{$user});
    my $dic_fh;
    if (!open($dic_fh, '<', $fg)) {
        warn "load_gg: cannot open $fg: $!\n";
        return;
    }
    my $j = 0;
    while (my $nl = <$dic_fh>) {
        chomp($nl);
        next if $nl =~ /^#/;
        $inputdic->{$user}[$j] = $nl;

        my $fir;
        if ($nl =~ /(.*?)\ .*/) {
            $fir = $1;
        }
        my $lin = decode("utf-8", $fir);

        # '|' as a split pattern is an empty alternation: one field per
        # character.
        my @linea;
        my $run    = '';
        my $in_run = 0;
        foreach my $ch (split('|', $lin)) {
            if (($ch =~ /^[a-z|A-Z]$/) || ($ch =~ /\d/)) {
                $in_run = 1;
                $run .= $ch;
                next;
            }
            if ($in_run) {
                push(@linea, $run);
                $run    = '';
                $in_run = 0;
            }
            # The character that ends a run is a token as well, matching the
            # query tokenizer in search_article.
            push(@linea, $ch);
        }
        push(@linea, $run) if $in_run;

        my $tmpn = scalar(@linea);
        my %seen;
        foreach my $token (@linea) {
            next if $seen{$token}++;
            $hash->{$user}->{$token}->{$j} = 1 / $tmpn;
        }
        $j++;
    }
    return close($dic_fh);
}
#print "-----\n";
print "server start\n";
my $n=0;
# TCP front end (port 11019).  Every client line is one command; the command
# is dispatched by pattern, the result is written back with put() and the
# connection is then shut down.  Inputs shorter than three bytes just close
# the connection.  The order of the branches matters: the first matching
# pattern wins.
POE::Component::Server::TCP->new(
    Alias => "echo_server",
    Port  => 11019,
    ClientInput => sub {
        my ($session, $heap, $input) = @_[SESSION, HEAP, ARG0];
        print "input file $input",length($input),"\n";
        my @res=();
        my $noshudown=0;
        if (length($input)<3) {
            $_[KERNEL]->yield("shutdown");
            return;
        }

        if ($input=~/^reload_dic (.*?) (.*)/) {
            # "reload_dic <file> <user>": reset the user's index, then
            # re-read the file when it exists.
            my $f=$1;
            my $uuser=$2;
            $hash->{$uuser}={1=>"1"};
            if (-e "$f") {
                reloaddic($f,$uuser);
            }
            $res[0]="reload ok";
        }
        elsif ($input=~/^reload_plugin (.*)/) {
            my $kkk=$1;
            # NOTE(review): the module name comes straight from the network
            # and is passed to a string eval.  Only plain module names are
            # accepted; anything else is ignored.
            if ($kkk=~/\A\w+(?:::\w+)*\z/) {
                delete $INC{"$kkk.pm"};
                eval("require $kkk;");
                print "require $kkk\n";
            }
        }
        elsif ($input=~/(.*) pinyin/) {
            my $kkk=$1;
            @res=plugin_eng::pinyin($kkk);
        }
        elsif ($input=~/(.*) rela/) {
            my $kkk=$1;
            if (length($kkk)>2) {
                @res=get_w_rela($kkk);
            }
        }
        elsif ($input=~/(.*) num_n (\d+) (\d+)/) {
            my ($kkk,$arg_a,$arg_b)=($1,$2,$3);
            @res=plugin_eng::num_n($kkk,$arg_a,$arg_b);
        }
        elsif ($input=~/^dist_ys (.*?) (.*)/) {
            # Distance between two sentences/words.  The second capture is
            # greedy so that it actually receives the second argument.
            my $i1=$1;
            my $i2=$2;
            @res=dist_ys($i1,$i2);
        }
        elsif ($input=~/^tfidf (.*?) (.*?) (.*)/) {
            my $i1=$1;
            my $i2=$2;
            my $i3=$3;
            @res=tfidf($i1,$i2,$i3);
        }
        elsif ($input=~/^dist (.*?) (.*)/) {
            my ($i1,$i2)=($1,$2);
            print "要素描述距离\n";
            @res=dist($i1,$i2);
        }
        elsif ($input=~/^get_ys (.*)/) {
            $res[0]=get_ys($1);
        }
        elsif ($input=~/^get_event (.*)/) {
            # Event extraction: reply with the single result and close.
            my $i=$1;
            $i=~s/\n//g;
            my $res=get_event($i);
            $heap->{client}->put($res);
            $_[KERNEL]->yield("shutdown");
            return;
        }
        elsif ($input=~/.* deep/) {
            # Deep analysis of indirect semantic links.
            @res=deep_article($input);
        }
        elsif ($input=~/(.*) path/) {
            # Related-path analysis.  The file name is captured before it is
            # tested, and a missing file is answered like every other
            # command instead of leaving the client without a reply.
            my $kkk=$1;
            if (! -e $kkk) {
                $res[0]="no file ";
            }
            else {
                my @res1=mean_path($input);
                my @res2=deep_article_path($kkk);
                @res=(@res1,@res2);
            }
        }
        elsif ($input=~/(.*) simple_path/) {
            my $kkk=$1;
            if (! -e $kkk) {
                $res[0]="no file ";
            }
            else {
                my @tmpr=deep_article_path($kkk);
                $res[0]=join(" ",@tmpr);
            }
        }
        else {
            # Plain search, optionally with "showtop" / "showtop_more".
            if ($input=~/(.*?) showtop (\d+) (.*) (.*)/) {
                @res=search_article($1,$2,"",$3,$4);
            }
            elsif ($input=~/(.*) showtop (\d+)/) {
                @res=search_article($1,$2);
            }
            elsif ($input=~/(.*) showtop_more (\d+) (\d)/) {
                @res=search_article($1,$2,$3);
            }
            else {
                @res=search_article($input);
            }
        }

        $heap->{client}->put(@res);
        # $noshudown is never set to "yes" above, so the connection is
        # always closed after the reply.
        unless ($noshudown eq "yes") {
            $_[KERNEL]->yield("shutdown");
            return;
        }
    },
    ClientDisconnected => sub {
    }
);
POE::Kernel->run;
#$file=$ARGV[0];
# Look up the dictionary entries of one user that share tokens with a query.
#   $_[0]  query text, or the path of an existing file whose first line is
#          the query
#   $_[1]  maximum number of results (default 100)
#   $_[2]  accepted for compatibility; not used
#   $_[3]  "y" to append the keyword-clue section produced by the external
#          helper ./client_qc_top.pl
#   $_[4]  user key into $hash / $inputdic (filled by load_gg)
#
# The query is decoded from UTF-8 and tokenized: consecutive ASCII letters /
# '|' / digit characters form one token, every other character is a token of
# its own.  Entries hit by at least two distinct tokens are ranked by the sum
# of the weights of the second and later hits plus seq_dif() and returned as
#   "<entry><inc><matched tokens><ac><sort><score>".
# When no entry is hit twice, entries hit once are ranked by the weight of
# their token and returned as "<entry><inc> <token><ac><weight>".
# Returns the list of result lines.
sub search_article
{
    my $file=$_[0];
    chomp($file);
    my $showtop=$_[1]||100;
    my $dic=$_[3];
    my $user=$_[4];

    # Read the query without going through a shell: $file comes from the
    # client.
    my @input_all;
    if (-e "$file") {
        if (open(my $in_fh, '<', $file)) {
            @input_all=<$in_fh>;
            close($in_fh);
        }
    }
    else {
        $input_all[0]=$file;
    }

    my @m_res=();
    my $in=$input_all[0];
    chomp($in);
    my $lin=decode("utf-8",$in);

    # '|' as a split pattern is an empty alternation: one field per character.
    my @keyword=();
    my $pending='';
    my $in_run=0;
    foreach my $ch (split('|',$lin)) {
        if (($ch=~/^[a-z|A-Z]$/) || ($ch=~/\d/)) {
            $in_run=1;
            $pending.=$ch;
            next;
        }
        if ($in_run) {
            push(@keyword,$pending);
            $pending='';
            $in_run=0;
        }
        push(@keyword,$ch);
    }
    push(@keyword,$pending) if $in_run;

    my @utkw=map { encode("utf-8",$_) } @keyword;

    my $khash={};      # entry index => number of distinct tokens that hit it
    my $mkhash={};     # entry index => summed weight of the 2nd+ hits
    my $outhash={};    # entry index => " tok tok ..." (UTF-8 bytes)
    my $bchash={};     # tokens already processed
    for my $kn (0 .. $#keyword) {
        my $kw=$keyword[$kn];
        next if exists($bchash->{$kw});
        $bchash->{$kw}=1;
        next unless exists($hash->{$user}->{$kw});
        while (my ($k,$v)=each(%{$hash->{$user}->{$kw}})) {
            $outhash->{$k}.=" ".$utkw[$kn];
            $khash->{$k}++;
            if ($khash->{$k}>=2) {
                $mkhash->{$k}+=$v;
            }
        }
    }

    my @key=sort {$mkhash->{$b} <=> $mkhash->{$a}} keys %{$mkhash};
    my $all=scalar(@key);
    if ($all>0) {
        $all=$showtop if $all>$showtop;
        my $tmpst;
        for my $nn (0 .. $all-1) {
            my $ln=$key[$nn];
            my $simseq=seq_dif($input_all[0],$inputdic->{$user}[$ln]);
            $tmpst->{$ln}=$mkhash->{$ln}+$simseq;
        }
        my @key2=sort {$tmpst->{$b} <=> $tmpst->{$a}} keys %{$tmpst};
        foreach my $sone (@key2) {
            # The "<ac>" field has no value in this branch (the variable the
            # original interpolated here was never assigned).
            push(@m_res,$inputdic->{$user}[$sone]."<inc>".$outhash->{$sone}."<ac><sort>$tmpst->{$sone}");
        }
    }
    else {
        # Per-call table: a shared one would leak the hits of earlier
        # queries into this result.
        my $khash2={};
        while (my ($k,$v)=each(%{$khash})) {
            my $kkk=$outhash->{$k};
            $kkk=~s/ //g;
            $kkk=decode("utf-8",$kkk);
            $khash2->{$k}=$hash->{$user}->{$kkk}->{$k};
        }
        my @key2=sort {$khash2->{$b} <=> $khash2->{$a}} keys %{$khash2};
        my $all2=scalar(@key2);
        $all2=$showtop if $all2>$showtop;
        for my $nn (0 .. $all2-1) {
            my $ln=$key2[$nn];
            my $m=$khash2->{$ln};
            push(@m_res,$inputdic->{$user}[$ln]."<inc>".$outhash->{$ln}."<ac>$m");
        }
    }

    if ($dic eq "y") {
        push(@m_res,"================\n");
        my @m_res2;
        foreach my $one (@m_res) {
            if ($one=~/(.*?)<inc>/) {
                push(@m_res2,$1);
            }
        }

        # Hand the entry texts to the external keyword extractor through a
        # temporary file.  The file is closed before the helper runs so the
        # helper sees the complete content, and both temporary files are
        # removed afterwards.
        my $rand=rand();
        my $in_path="/tmp/$rand";
        my $out_path="/tmp/$rand.2";
        if (open(my $tmp_fh, '>', $in_path)) {
            print {$tmp_fh} $_ for @m_res2;
            close($tmp_fh);
        }
        my $ignored=`./client_qc_top.pl $in_path $out_path start`;
        my $run='';
        if (open(my $res_fh, '<', $out_path)) {
            local $/;
            my $content=<$res_fh>;
            $run=$content if defined($content);
            close($res_fh);
        }
        unlink($in_path);
        unlink($out_path);

        # $run is "word:score<TAB>word:score...".  Count, per word, the
        # entries containing it (literal substring match).
        my $mchash;
        foreach my $field (split("\t",$run)) {
            my $wo=$field;
            $wo=~s/(.*?)\:.*/$1/g;
            next if $wo eq '';
            for my $idx (0 .. $#m_res2) {
                if (index($m_res2[$idx],$wo)>=0) {
                    $mchash->{$wo}->{"n"}++;
                    push(@{$mchash->{$wo}->{'list'}},$idx);
                }
            }
        }
        push(@m_res,"相关线索 $run\n");
        # Mark every word that occurs in two or more entries as "<word n>".
        while (my ($k,$v)=each(%{$mchash})) {
            next unless $v->{"n"}>=2;
            my $vn=$v->{"n"};
            foreach my $idx (@{$v->{'list'}}) {
                $m_res2[$idx]=~s/\Q$k\E/<$k $vn>/g;
            }
        }
        push(@m_res,@m_res2);
    }
    return @m_res;
}
# }
# );
#$poe_kernel->run();
# Turn a segmented text into a weighted term vector.
#   $_[0]  "<id> <word> <word> ..." separated by whitespace
#   $_[1]  accepted for compatibility; not used
# The first field is returned as the line id.  Every other word that is not a
# stop word and is at least three bytes long counts 1 per occurrence, and
# every relation of the word in $hash adds its weight to the related term
# (relations to stop words are skipped).
# Returns (hash reference term => weight, line id).
sub txt2arr
{
    my ($input_t, $exp_h) = @_;
    my %weight_of;
    my @list = split(" ", $input_t);
    my $nline = shift(@list);
    foreach my $word (@list) {
        next if exists($hash_st->{$word});
        next if length($word) < 3;
        $weight_of{$word}++;
        next unless exists($hash->{$word});
        while (my ($related, $weight) = each(%{$hash->{$word}})) {
            next if exists($hash_st->{$related});
            $weight_of{$related} += $weight;
        }
    }
    return (\%weight_of, $nline);
}
#print "output = $res\n";
# $heap->{client}->stop();
# $session->stop();
# Start the server.
# Similarity score of two weighted term vectors.
#   $_[0], $_[1]  hash references term => weight (the two vectors)
#   $_[2], $_[3]  optional hash references term => position in the first /
#                 second text, used for the order penalty
#   $_[4]         accepted for compatibility; not used
#   $_[5]         optional hash reference term => { source word => 1 }; when
#                 given, the source words of each shared term are printed
#
# A term is shared when it is in the first vector and has a positive weight
# in the second.  Each shared term (with a non-negative weight in the first
# vector) adds the mean of two normalisations of its weights: sum over the
# summed totals, and mean of the per-vector shares.  It adds a bonus of 0.12
# when both weights exceed 2.  Every non-shared term subtracts 0.001.
# With two or more shared terms the score is then reduced by up to 15%
# according to how differently the terms are ranked by $_[2] and $_[3].
# Returns the score.
sub comp_hash
{
    my ($h1, $h2, $h3, $h4, $org1, $org2) = @_;
    my $akv = 0;
    my $all;              # term => number of vectors that contain it
    my $vall  = 0.001;    # total positive weight of the first vector
    my $vall2 = 0.001;    # total positive weight of the second vector

    while (my ($k, $v) = each(%{$h1})) {
        $all->{$k}++;
        $vall += $v if $v > 0;
    }
    while (my ($k, $v) = each(%{$h2})) {
        if ($v > 0) {
            $all->{$k}++;
            $vall2 += $v;
        }
    }

    my $samelist;    # shared term => position in the first text
    my $std_n;       # number of shared terms
    while (my ($k, $v) = each(%{$all})) {
        if ($v >= 2) {
            next if $h1->{$k} < 0;
            my $kv  = ($h1->{$k} + $h2->{$k}) / ($vall + $vall2);
            my $kv2 = ((($h1->{$k}) / $vall) + ($h2->{$k} / $vall2)) / 2;
            my $kv3 = ($kv + $kv2) / 2;
            if (($h1->{$k} > 2) && ($h2->{$k} > 2)) {
                $akv += 0.12;
            }
            $akv += $kv3;
            if (exists($org2->{$k})) {
                while (my ($kc, $vc) = each(%{$org2->{$k}})) {
                    print "$kc,";
                }
                print "\n";
            }
            $samelist->{$k} = $h3->{$k};
            $std_n++;
        }
        else {
            $akv -= "0.001";
        }
    }

    # Order comparison: rank the shared terms by their position in each text
    # (equal positions share a rank) and penalise rank differences.
    if (($std_n > 0) && ($std_n != 1)) {
        my @by_first = sort { $samelist->{$b} <=> $samelist->{$a} } keys %{$samelist};
        my $std_seq;     # term => rank by first-text position
        my $test_seq;    # term => position in the second text
        my $std_sq;      # first-text positions already ranked
        my $t_n = 1;
        foreach my $term (@by_first) {
            if (!exists($std_sq->{$samelist->{$term}})) {
                $t_n++;
                $std_sq->{$samelist->{$term}} = 1;
            }
            $std_seq->{$term}  = ($std_n - $t_n + 1);
            $test_seq->{$term} = $h4->{$term};
        }

        my @by_second = sort { $test_seq->{$b} <=> $test_seq->{$a} } keys %{$test_seq};
        my $obj_seq;     # term => rank by second-text position
        my $test_sq;     # second-text positions already ranked
        $t_n = 1;
        foreach my $term (@by_second) {
            if (!exists($test_sq->{$test_seq->{$term}})) {
                $t_n++;
                $test_sq->{$test_seq->{$term}} = 1;
            }
            $obj_seq->{$term} = ($std_n - $t_n + 1);
        }

        my $diff_sum = 0;
        while (my ($k, $v) = each(%{$std_seq})) {
            $diff_sum = $diff_sum + abs($v - $obj_seq->{$k});
        }
        my $sam_rate = 1 - ($diff_sum / (($std_n - 1) * ($std_n - 1)));
        my $diff_n   = $akv * (1 - $sam_rate) * 0.15;
        $akv = $akv - $diff_n;
    }
    return $akv;
}
# Current local time as "YYYY-MM-DD hh:mm:ss" (all fields zero padded).
# The year is computed from localtime's years-since-1900 value rather than
# by rewriting its digits, so it is correct for any year.
sub ndate
{
    my ($sec, $min, $hour, $mday, $mon, $year) = localtime(time());
    return sprintf("%04d-%02d-%02d %02d:%02d:%02d",
        $year + 1900, $mon + 1, $mday, $hour, $min, $sec);
}
# Expand a comma separated list of "word value" pairs into a flat mapping.
#   $_[0]  "word value,word value,..."
#   $_[1]  accepted for compatibility; not used
# Every relation of a word in $hash maps to the word's value, and the word
# itself maps to itself.  Items without a space print "input error".
# Returns the mapping as a flat key/value list.
sub txt2arr_one
{
    my ($w, $exp_h) = @_;
    my %expanded;
    foreach my $item (split(",", $w)) {
        if ($item =~ /(.*?) (.*)/) {
            my ($word, $value) = ($1, $2);
            if (exists($hash->{$word})) {
                foreach my $related (keys %{$hash->{$word}}) {
                    $expanded{$related} = $value;
                }
            }
            $expanded{$word} = $word;
        }
        else {
            print "input error\n";
        }
    }
    return (%expanded);
}
# Build the weighted term vector of an input text.
#   $_[0]  "<id> <word> <word> ..." separated by whitespace
#   $_[1]  accepted for compatibility; not used
# The first field is the line id.  Skipped words: stop words, words starting
# with a digit or '-', and words shorter than three bytes unless they start
# with an ASCII letter.  Every kept word adds 5 to its own weight; its
# relations in $hash add their weight to the related terms, and a word
# without relations adds a further 0.1 to itself.
# Returns four values:
#   hash reference term => weight
#   line id
#   hash reference term => sequence number (always 0: the counter is never
#     advanced in this function)
#   hash reference term => { source word => 1 }
sub txt2arr_input
{
    my ($input_t, $exp_h) = @_;
    my $tmp_hash = {};
    my $tmp_sequ = {};
    my $og       = {};
    my $sequ     = 0;
    my @list  = split(" ", $input_t);
    my $nline = shift(@list);
    foreach my $w (@list) {
        # Short tokens are only kept when they start with a letter (the test
        # is made on the word, not on its length).
        next if (length($w) < 3) && ($w !~ /^[a-zA-Z]/);
        next if exists($hash_st->{$w});
        next if $w =~ /^\d/;
        next if $w =~ /^-/;
        $tmp_hash->{$w} += 5;
        if (exists($hash->{$w})) {
            while (my ($k, $v) = each(%{$hash->{$w}})) {
                $tmp_hash->{$k} += $v;
                $og->{$k}->{$w}  = 1;
                $tmp_sequ->{$k}  = $sequ;
            }
        }
        else {
            $tmp_hash->{$w} += 0.1;
            $og->{$w}->{$w}  = 1;
            $tmp_sequ->{$w}  = $sequ;
        }
    }
    return ($tmp_hash, $nline, $tmp_sequ, $og);
}
# Build the weighted term vector used by dist and dist_ys.
#   $_[0]  "<id> <word> <word> ..." separated by whitespace
#   $_[1]  accepted for compatibility; not used
# The first field is the line id.  Stop words are skipped, and so are words
# shorter than three bytes unless they start with an ASCII letter.  A word
# not yet in the vector starts at 2.1; its relations in $hash add their
# weight to the related terms, and a word without relations adds a further
# 0.1 to itself on every occurrence.
# Returns four values:
#   hash reference term => weight
#   line id
#   hash reference term => sequence number of the last word that touched it
#   hash reference term => { source word => 1 }
sub txt2arr3
{
    my ($input_t, $exp_h) = @_;
    my $tmp_hash     = {};
    my $tmp_hash_seq = {};
    my $og           = {};
    my @list  = split(" ", $input_t);
    my $nline = shift(@list);
    my $seq   = 0;
    foreach my $w (@list) {
        # Short tokens are only kept when they start with a letter (the test
        # is made on the word, not on its length).
        next if (length($w) < 3) && ($w !~ /^[a-zA-Z]/);
        next if exists($hash_st->{$w});
        if (!exists($tmp_hash->{$w})) {
            $tmp_hash->{$w} += 2.1;
            $og->{$w}->{$w}  = 1;
        }
        $tmp_hash_seq->{$w} = $seq;
        $seq++;
        if (exists($hash->{$w})) {
            while (my ($k, $v) = each(%{$hash->{$w}})) {
                $tmp_hash->{$k}    += $v;
                $tmp_hash_seq->{$k} = $seq;
                $og->{$k}->{$w}     = 1;
            }
        }
        else {
            $tmp_hash->{$w}    += 0.1;
            $og->{$w}->{$w}     = 1;
            $tmp_hash_seq->{$w} = $seq;
        }
    }
    return ($tmp_hash, $nline, $tmp_hash_seq, $og);
}
# Base-2 logarithm of the single argument, computed as ln(n) / ln(2).
sub log2 {
    my ($value) = @_;
    my $natural = log($value);
    return $natural / log(2);
}
# Replace groups of co-occurring words by their combined rewrite term.
#   $_[0]  "<id> <word> <word> ..." separated by whitespace; when called
#          without arguments the text is taken from $_
# Uses the tables built by load_rewrite.  A rule fires when every word of the
# rule occurs in the text.  The result is the id, then the names of the rules
# that fired, then the words not consumed by any fired rule, joined by
# spaces.  With fewer than two rule words in the text the input is returned
# unchanged.  Prints the token count, and a "rewrite skip" line when nothing
# is rewritten.
sub rewrite_gx
{
    my $inp = @_ ? $_[0] : $_;
    my @tl = split(" ", $inp);
    print "scalar= ",scalar(@tl),"\n";
    my @re_res = ($tl[0]);

    my $has_re;       # rule word found in the text => "rule ids"
    my $has_n = 0;    # number of rule-word occurrences in the text
    for my $idx (1 .. $#tl) {
        my $tok = $tl[$idx];
        if (exists($rewrite_all->{$tok})) {
            $has_re->{$tok} = $rewrite_all->{$tok};
            $has_n++;
        }
    }
    if ($has_n < 2) {
        print "rewrite skip $has_n $inp\n";
        return $inp;
    }

    my $exists_tkl;    # rule ids that already fired
    my $no_has;        # words consumed by a fired rule
    foreach my $found (keys %{$has_re}) {
        foreach my $tkl (split(" ", $has_re->{$found})) {
            next if ($rewrite_n[$tkl] > $has_n) || exists($exists_tkl->{$tkl});
            # keys() instead of each(): leaving an each() loop early would
            # keep the hash iterator half way for the next check of the rule.
            my @parts = keys %{$rewrite_g[$tkl]};
            next if grep { !exists($has_re->{$_}) } @parts;
            $exists_tkl->{$tkl} = 1;
            push(@re_res, $rewrite_l[$tkl]);
            $no_has->{$_} = 1 for @parts;
        }
    }

    foreach my $tt (@tl[1 .. $#tl]) {
        next if exists($no_has->{$tt});
        push(@re_res, $tt);
    }
    return join(" ", @re_res);
}
# Load the rewrite rules from rewrite.txt (current directory).
# Each line is tab separated: the first field is the rule name, made of the
# rule's words joined by '_'; further fields are "related:weight" pairs that
# are stored in $hash under the rule name.  Rule ids are line numbers
# (starting at 1).  Names shorter than three bytes are skipped.
# Tables filled (and reset first, so reloading does not accumulate counts):
#   @rewrite_l     id => rule name
#   @rewrite_n     id => number of words in the name
#   @rewrite_g     id => { word => 1 }
#   $rewrite_all   word => "id id ..." of the rules that contain the word
# An unreadable file is reported on STDERR and leaves the tables untouched.
sub load_rewrite
{
    my $fh;
    if (!open($fh, '<', "rewrite.txt")) {
        warn "load_rewrite: cannot open rewrite.txt: $!\n";
        return;
    }
    @rewrite_g   = ();
    @rewrite_l   = ();
    @rewrite_n   = ();
    $rewrite_all = {};
    my $rg = 0;
    while (my $line = <$fh>) {
        chomp($line);
        $rg++;
        my ($name, @pairs) = split("\t", $line);
        next if !defined($name) || length($name) < 3;
        $rewrite_l[$rg] = $name;
        foreach my $part (split("_", $name)) {
            $rewrite_n[$rg]++;
            $rewrite_g[$rg]->{$part} = 1;
            if (exists($rewrite_all->{$part})) {
                $rewrite_all->{$part} = $rewrite_all->{$part}." ".$rg;
            }
            else {
                $rewrite_all->{$part} = $rg;
            }
        }
        foreach my $pair (@pairs) {
            next unless $pair =~ /(.*?)\:(.*)/;
            my ($related, $weight) = ($1, $2);
            $hash->{$name}->{$related} = $weight;
        }
    }
    close($fh);
    return;
}
# List the relations two words have in common.
#   $_[0]  path of a file whose first and third lines hold the two words
#          (the second line is ignored)
# Returns one "same <term>" string for every term that is related to both
# words in $hash; an empty list when the file cannot be read or either word
# is unknown.
sub deep_article_path
{
    my $file = $_[0];
    my @m_res;
    open(my $fh, '<', $file) or return @m_res;
    my $key1    = <$fh>;
    my $skipped = <$fh>;
    my $key2    = <$fh>;
    close($fh);
    return @m_res unless defined($key1) && defined($key2);
    chomp($key1);
    chomp($key2);
    return @m_res unless exists($hash->{$key1}) && exists($hash->{$key2});

    # Direct lookup instead of nested each() loops: nested iteration over
    # the same hash never finishes when both words are equal.
    foreach my $term (keys %{$hash->{$key1}}) {
        push(@m_res, "same $term") if exists($hash->{$key2}->{$term});
    }
    return @m_res;
}
# Return the entry stored under the given key in the global $input_ys table
# (undef when there is none).
sub get_ys
{
    my ($key) = @_;
    return $input_ys->{$key};
}
# Distance between the first line of a file and stored descriptions.
#   $_[0]  path of a file; its first line is segmented with seg_txt
#   $_[1]  list of ids separated by '-'
# For every id present in $input_gg, each stored vector of that id is
# compared with the vector of the file's first line (comp_hash).
# Returns one "<id> <score> <description>\n" string per comparison, the
# description coming from $input_gg_exp.
sub dist
{
    my ($la, $lb) = @_;
    my @m_res;
    print $la,"-------------",$lb,"\n";

    # Read the first line directly: $la comes from the client and must not
    # reach a shell.
    my $in_gg;
    if (open(my $fh, '<', $la)) {
        $in_gg = <$fh>;
        close($fh);
    }
    $in_gg = '' unless defined($in_gg);
    chomp($in_gg);

    my $ingg = seg_txt($in_gg);
    my ($my_hash) = txt2arr3("0 $ingg", $exp_hash);
    foreach my $id (split("-", $lb)) {
        next unless exists($input_gg->{$id});
        my $variants = $input_gg->{$id};
        for my $ssn (0 .. $#{$variants}) {
            my $score = comp_hash($my_hash, $variants->[$ssn]);
            push(@m_res, "$id $score ".$input_gg_exp->{$id}[$ssn]."\n");
        }
    }
    return @m_res;
}
# Tell whether two keys share at least one entry in the global $pinyinhash
# table (key => array reference of readings).
# Prints "a b <first> <second>" and returns "1" on a shared reading, "0"
# otherwise.
sub comp_pin
{
    my ($left, $right) = @_;
    print "a b $left $right\n";
    foreach my $left_reading (@{$pinyinhash->{$left}}) {
        foreach my $right_reading (@{$pinyinhash->{$right}}) {
            return "1" if $right_reading eq $left_reading;
        }
    }
    return "0";
}
# Send a text to the segmentation service on 127.0.0.1:11229 and return its
# answer.
#   $_[0]  text to segment; it is sent followed by a newline
# The reply is read until the service closes the connection; the last line
# received is returned (undef when nothing was received).
# Dies when the connection cannot be established.
sub seg_txt
{
    my $txt = $_[0];
    my $socket = IO::Socket::INET->new(
        PeerAddr => "127.0.0.1",
        PeerPort => "11229",
        Type     => SOCK_STREAM,
        Proto    => "tcp",
    )
        or die "Can not create socket connect.$@";
    $socket->autoflush(1);
    my $selector = IO::Select->new($socket);
    $socket->send("$txt\n", 0);

    my $last_line;
    # can_read returns an empty list once the socket has been removed,
    # which ends the loop.
    while (my @ready = $selector->can_read) {
        foreach my $fh (@ready) {
            next unless $fh == $socket;
            # Lexical loop variable: the caller's $_ is left alone.
            while (defined(my $line = <$fh>)) {
                $last_line = $line;
            }
            $selector->remove($fh);
            close $fh;
        }
    }
    return $last_line;
}
# Return the top-scoring words of a sentence.
#   $_[0]  number of words to return (default 1)
#   $_[1]  score to rank by: a key of the per-word entries in $tfidf
#          ("t", "i" or "ti"; default "ti")
#   $_[2]  the sentence; it is segmented with seg_txt
# Only words present in $tfidf are considered.  Returns a string of
# "word:score " items, highest score first, at most $_[0] of them (empty
# string when no word is known).
sub tfidf
{
    my ($top, $tori, $txt) = @_;
    $tori = "ti" if length($tori) < 1;
    $top  = 1    if length($top) < 1;
    chomp($txt);
    my $st = seg_txt($txt);

    my $tlist = {};
    foreach my $word (split(" ", $st)) {
        if (exists($tfidf->{$word})) {
            $tlist->{$word} = $tfidf->{$word}->{$tori};
        }
    }

    my @key = sort { $tlist->{$b} <=> $tlist->{$a} } keys %{$tlist};
    my $res = '';
    my $shown = 0;
    foreach my $word (@key) {
        # Stop before appending, so exactly $top items are returned.
        last if $shown >= $top;
        $res .= "$word:$tlist->{$word} ";
        $shown++;
    }
    return $res;
}
# Return the words recorded for the given term in the reverse index
# $hash_bk, joined by single spaces (empty string when the term is unknown).
sub get_w_rela
{
    my ($term) = @_;
    my @related = exists($hash_bk->{$term}) ? keys(%{$hash_bk->{$term}}) : ();
    return join(" ", @related);
}
# Event extraction entry point.
#   $_[0]  raw text
# The text is segmented with seg_txt and passed to get_time; the result of
# get_time is returned.  Only the time part is extracted here.
sub get_event
{
    my ($text) = @_;
    my $segmented = seg_txt($text);
    my $time = get_time($segmented);
    return($time);
}
# Collect time-like fragments from a segmented text.
#   $_[0]  text whose tokens are separated by whitespace
# Tokens are scanned character by character (each token is decoded from
# UTF-8, split, and every character re-encoded).  Characters that match a
# digit, one of the listed numeral characters, or the $timein alternation are
# accumulated; the first non-matching token or character ends the fragment.
# Returns the distinct fragments longer than four bytes, each followed by a
# space (undef when there is none).  Side effects: records every fragment in
# the global $pos_nu and prints a debug line when a fragment is still open at
# the last token.
sub get_time
{
my $in=$_[0];
# Alternation of punctuation and characters that this function treats as part
# of a time expression; it is interpolated into the patterns below.
my $timein="\:|\\|\/|\-|夏|冬|上|大|百|第|两|之|来|右|晨|昨|春|年|二|半|纪|过|历|八|度|三|四|日|稍|星|少|今|午|早|刻|差|夜|长|明|月|间|万|秒|马|末|五|候|去|数|一|段|七|现|期|感|多|在|点|号|和|左|好|千|本|六|的|周|零|当|充|节|近|届|苏|成|天|底|未|圣|诞|头|世|几|时|钟|凌|晚|后|新|小|九|岁|久|分|清|下|每|季|前|这|个|那|满|古|十";
my @wr=split(" ",$in);
# @hash_hc: indexes of the tokens that contributed to the open fragment.
my @hash_hc;
# @tmp_str: characters of the open fragment.
my @tmp_str;
# $m_res: finished fragments, each followed by a space.
my $m_res;
# $has: number of entries in @hash_hc.
my $has=0;
# $start: 1 while a fragment is open.
my $start=0;
for(0 .. scalar(@wr)-1)
{
my $zu=$_;
# A token without any digit or $timein match closes the open fragment.
if(($start==1)&&($wr[$zu]!~/\d|$timein/))
{$start=0;
my $onestr=join("",@tmp_str);
$m_res .=$onestr." ";
$pos_nu->{$onestr}=1;
# foreach my $ff( @hash_hc)
# {
# my $s=$wr[$ff];
# if($onestr !~/$s/)
# {
# $m_res .=$wr[$ff]." ";
# }
# }
@hash_hc=();
@tmp_str=();$has=0;
}
# Tokens that contain nothing time-like are not scanned at all.
if($wr[$zu]!~/\d| |一|二|三|四|五|六|七|八|九|十|百|千|万|亿|零|\.|点|$timein/)
{next;}
my $wr_uni=decode("utf-8",$wr[$zu]);
my @zi=split("",$wr_uni);
# NOTE(review): this lexical $hash shadows the global index and is not used
# afterwards.
my $hash=0;
for(0 .. scalar(@zi)-1)
{
my $zone=$zi[$_];
# Back to UTF-8 bytes, so the byte-oriented patterns below apply.
my $zone=encode("utf-8",$zone);
if($zone=~/\d| |一|二|三|四|五|六|七|八|九|十|百|千|万|亿|$timein/)
{
$start=1;
}
if(($start==1)&&($zone=~/\d|一|二|三|四|五|六|七|八|九|十|百|千|万|亿|零|\.|点|$timein/))
{
push (@tmp_str,$zone);
# Remember the token index once per token.
if(($has ==0 )||($hash_hc[$has-1] != $zu))
{$hash_hc[$has]=$zu;$has++;
}
}
elsif(($start==1)&&($zone !~/\d| |一|二|三|四|五|六|七|八|九|十|百|千|万|亿|零|\.|点|$timein/))
{$start=0;
my $onestr=join("",@tmp_str);
$m_res .=$onestr." ";
$pos_nu->{$onestr}=1;
# foreach my $ff( @hash_hc)
# {
#$m_res .=$wr[$ff]." ";
# }
@hash_hc=();
@tmp_str=();$has=0;
}
}
# A fragment still open at the last token is flushed here.
if(($start ==1)&&($zu== (scalar(@wr)-1)))
{
$start=0;
my $onestr=join("",@tmp_str);
$m_res .=$onestr." ";
$pos_nu->{$onestr}=1;
# This loop has no effect: its only statement is commented out.
foreach my $ff( @hash_hc)
{
my $s=$wr[$ff];
if($onestr !~/$s/)
{
# $m_res .=$wr[$ff]." ";
}
}
@hash_hc=();
@tmp_str=();
$has=0;
print "结束捕获 清算 句子开始$zu $wr[$zu] @hash_hc| @tmp_str|$onestr\n";
}
}
# push(@wline,$m_res);
# Keep each fragment once, and only when it is longer than four bytes.
my @as=split(" ",$m_res);
my $my_res;
my $tihash;
foreach my $ones(@as)
{
if(length($ones)>4)
{
if(exists($tihash->{$ones}))
{next;}
$my_res .=$ones." ";
$tihash->{$ones}=1;
}
}
$my_res;
}
# Distance between two sentences or words.
#   $_[0], $_[1]  the two texts
# Both texts are segmented with seg_txt, turned into term vectors with
# txt2arr3 (ids "0" and "1") and compared with comp_hash.
# Returns a one-element list: "0-1-<score>".
sub dist_ys
{
    my ($la, $lb) = @_;
    my $seg_a = seg_txt($la);
    my $seg_b = seg_txt($lb);
    chomp($seg_a);
    chomp($seg_b);
    my ($my_hash)  = txt2arr3("0 $seg_a", $exp_hash);
    my ($my_hash2) = txt2arr3("1 $seg_b", $exp_hash);
    my $score = comp_hash($my_hash, $my_hash2);
    my @m_res = ("0-1-$score");
    return @m_res;
}
#4——9支持 stop.txt 其中加了若干标点
# jieba分词 分割文件 bug修正
# jieba分词 实现多socket的全内存调用 待高并发测试
# 添加了client_qc.pl用来支持 分类引擎与切词引擎的数据交换
#添加了 若干词到 词典文件
#计划加入 每条 目标与待测文本 进行域向量修订(共现放缩) 域词典 数量为N
#4-12 修正词典中的标点,和去掉一些单字
#测试自然对数 作为衡量相似度的
# 4-14修正 reload 命令同时重启切词系统
#4-20 comp_hash加入相似特征的序列对比 当序列相近则认为相似度更近
#加入同义词词典
#4-23 单字支持做调整 修正了停用词表的bug 修正了文本中有----的bug
#4-29 修正输入 文章中有数字影响 权重的bug
#4-30加入post数据接口 加入 共现词组转义rewrite函数
#添加重复词语数量控制 为20个词中 最多计数一个
#添加了 各种品牌
#向量空间减少到60个
#加入目标语句的深度含义分析
#计划加入 实体路径的最大关联分析
#加入dist函数 直接分析2个词的距离
#支持模糊推理
#调整扩展语义权重不大于2
#加入 词的tfidf比较
# Order similarity between a query and a stored entry.
#   $_[0]  query string
#   $_[1]  entry string
# The query is split into single characters.  For every ordered pair of
# query characters (i before j) one point is scored when the first position
# of character j in the entry lies after the first position of character i.
# In the same pass, for i >= 1, another point is scored when characters i-1
# and i occur adjacently in the entry at a position greater than 0.
# The total is divided by t*(t+1), t being the number of query characters
# minus one.  Returns 0 for an empty or one-character query, where that
# divisor would be zero.
sub seq_dif
{
    my ($ssf, $ssn) = @_;
    # '|' as a split pattern is an empty alternation: one field per character.
    my @ssa = split("|", $ssf);
    my $t = scalar(@ssa) - 1;
    my $pairs = $t * ($t + 1);
    return 0 if $pairs == 0;

    my $good    = 0;
    my $seqgood = 0;
    for my $pon (0 .. $t) {
        my $po = index($ssn, $ssa[$pon]);
        for my $later ($pon + 1 .. $t) {
            $good++ if index($ssn, $ssa[$later]) > $po;
            if ($pon >= 1) {
                $seqgood++ if index($ssn, $ssa[$pon - 1] . $ssa[$pon]) > 0;
            }
        }
    }
    return ($good + $seqgood) / $pairs;
}