# Group
# Extension
#
# zhangbo-NLP-plugin_eng/lib/zhangbo/NLP/eng_org_change2.pl

#!/usr/bin/perl
use POE qw(Component::Server::TCP);
#use JSON::XS;
 use Clone qw(clone);

use URI::Escape;

#use MongoDB;
#use Smart::Comments;
#use lib "/home/wuyabo/shell/";
#use Conn_mongo_jc;
use Data::Dumper;
#use IO::File;
#use Add_info;
our @rewrite_g;
our @rewrite_l;
our @rewrite_n;
our %rewrite_all;
our $rewrite_all;

# Load the stop-word list: one entry per line of stop.txt, opened relative to
# the current working directory.  Every entry becomes a key of the package
# global $hash_st, which the dictionary loaders and the tokenisers consult to
# drop words.
if (open(my $stop_fh, '<', 'stop.txt'))
{
    while (my $line = <$stop_fh>)
    {
        chomp($line);
        $hash_st->{$line} = 1;
    }
    close($stop_fh);
}
else
{
    # Historically a missing file was ignored silently; keep running without
    # stop words, but report the problem.
    warn "cannot open stop.txt: $!\n";
}

# The TCP server further down follows http://poe.perl.org/?POE_Cookbook/TCP_Servers
#
# Package-global dictionaries:
#   $hash    : word => { related word or concept => weight }
#   $hash_mn : word => running count of concepts stored for it
our $hash = ();
$hash_mn = ();

# Optional per-concept weights ($hash_sc->{concept}{h}) consulted while the
# concept dictionary is loaded.
my $hash_sc = ();

# Opening for write truncates engine.txt.
open(FD_log, '>', '/home/htc/nord/recommend/engine.txt')
    or warn "cannot open /home/htc/nord/recommend/engine.txt: $!\n";

# Synonym file 0sy.txt.  A line containing '#' starts a group whose headword
# is the text after the '#'; the lines that follow are the members of that
# group.  Only the first group seen for a given headword is loaded; members of
# later groups with the same headword are skipped.
my $sytmp;
if (open(my $syn_fh, '<', '0sy.txt'))
{
    my $headword;
    my $take_members = 0;
    while (my $line = <$syn_fh>)
    {
        if ($line =~ /\#(.*)/)
        {
            $headword = $1;
            if (exists($sytmp->{$headword}))
            {
                $take_members = 0;
            }
            else
            {
                $sytmp->{$headword} = $headword;
                $take_members = 1;
            }
            next;
        }
        next unless $take_members == 1;

        chomp($line);
        # Very short headwords and members (length 3 or less) are not stored.
        next if length($headword) <= 3;
        next if length($line) <= 3;
        $hash->{$line}->{$headword} = 0.2;
    }
    close($syn_fh);
}
else
{
    warn "cannot open 0sy.txt: $!\n";
}

# Concept dictionary 0word.txt.  A "W_C=<word>" line names a word and a later
# "DEF=<definition>" line describes it, for example
#   DEF={Unit|单位:host={information|信息:belong={computer|电脑}}}
# The definition is cut at every '|'.  In each piece the text in front of the
# first ':' (or, when there is no ':', in front of the first '}') is taken as
# a concept and stored in $hash->{word}{concept}; $hash_mn->{word} counts the
# stored concepts.

# Concepts that are never stored.
my %generic_concept = map { $_ => 1 }
    ("专", "功能词", "人", "事情", "时间", "特定", "部件", "地方");

if (open(my $word_fh, '<', '0word.txt'))
{
    my $word;        # value of the most recent W_C= line
    my $rank = 0;    # 0 until the first concept of a DEF= line has been stored
    while (my $line = <$word_fh>)
    {
        chomp($line);

        if ($line =~ /^W_C=(.*)/)
        {
            $word = $1;
            $rank = 1;
            next;
        }
        next unless $line =~ /^DEF=(.*)/;

        my $definition = $1;
        $rank = 0;
        next if length($word) <= 3;
        next if exists($hash_st->{$word});

        foreach my $piece (split('\|', $definition))
        {
            my $concept;
            my $before_colon;
            if ($piece =~ /(.*?)\:/)
            {
                $concept = $1;
                $before_colon = 1;
            }
            elsif ($piece =~ /(.*?)\}/)
            {
                $concept = $1;
                $before_colon = 0;
            }
            else
            {
                next;
            }
            next if exists($hash_st->{$concept});
            next if $generic_concept{$concept};

            # Scale, lower bound and counter step depend on whether the concept
            # came from a ':' or a '}' piece and on whether it is the first
            # concept stored for this definition.
            my $is_first = ($rank == 0);
            my ($scale, $floor, $step);
            if ($before_colon)
            {
                $scale = $is_first ? 1 : 0.2;
                $floor = 0.1;
                $step  = 1;
            }
            else
            {
                $scale = $is_first ? 1 : 0.5;
                $floor = 0.2;
                $step  = $is_first ? 1 : 0.5;
            }

            my $weight = 1;    # default when $hash_sc has no entry
            if (exists($hash_sc->{$concept}))
            {
                $weight = $hash_sc->{$concept}->{"h"};
                $weight = $weight * $scale unless $scale == 1;
                $weight = $floor if $weight < 0.01;
            }
            $hash->{$word}->{$concept} = $weight;
            $hash_mn->{$word} += $step;
            $rank++;
        }
    }
    close($word_fh);
}
else
{
    warn "cannot open 0word.txt: $!\n";
}

# Relation file 0babel_all_path_fin20.txt: tab-separated lines of the form
#   name <TAB> concept:weight <TAB> concept:weight ...
# Lines whose name is shorter than 3 or is a stop word are ignored.  Existing
# $hash->{name}{concept} entries are never overwritten; when the name was
# already present in $hash before this line, new weights are scaled by 0.3.
if (open(my $babel_fh, '<', '0babel_all_path_fin20.txt'))
{
    while (my $line = <$babel_fh>)
    {
        my ($name, @fields) = split("\t", $line);
        next if length($name) < 3;
        my $already_known = exists($hash->{$name}) ? 1 : 0;
        next if exists($hash_st->{$name});

        foreach my $field (@fields)
        {
            next unless $field =~ /(.*?)\:(.*)/;
            my ($concept, $weight) = ($1, $2);
            next if exists($hash->{$name}->{$concept});
            $hash->{$name}->{$concept} = $already_known ? $weight * 0.3 : $weight;
        }
    }
    close($babel_fh);
}
else
{
    warn "cannot open 0babel_all_path_fin20.txt: $!\n";
}

# load_dic($path, $lsize, $zoom)
#
# Merge a relation dictionary into the package global $hash.  Each line of
# the file is tab separated: the first field is a name, every further field
# of the form "concept:weight" sets $hash->{name}{concept} = weight * zoom,
# overwriting any existing entry.  Concepts listed in $hash_st are skipped.
#
#   $path  - file to read
#   $lsize - accepted for compatibility; the per-line limit it once
#            controlled is disabled, so the value is not used
#   $zoom  - weight multiplier; 0.5 is used unless a value greater than 0 is given
#
# Prints "load <path> ok" and returns true on success.  When the file cannot
# be opened a warning is issued, nothing is printed and false is returned.
sub load_dic
{
    my ($path, $lsize, $zoom) = @_;

    my $zoomin = 0.5;
    $zoomin = $zoom if defined($zoom) && $zoom > 0;

    my $dict_fh;
    if (!open($dict_fh, '<', $path))
    {
        warn "load_dic: cannot open $path: $!\n";
        return;
    }

    while (my $line = <$dict_fh>)
    {
        my ($name, @fields) = split("\t", $line);
        foreach my $field (@fields)
        {
            next unless $field =~ /(.*?)\:(.*)/;
            my ($concept, $weight) = ($1, $2);
            next if exists($hash_st->{$concept});
            $hash->{$name}->{$concept} = $weight * $zoomin;
        }
    }
    close($dict_fh);

    print "load $path ok\n";
    return 1;
}




close FD_log;

# Main relation dictionary.  Loading of further dictionaries (an extra babel
# list, noun and verb lists, a "zw" list) is disabled.
load_dic("/home/htc/base_total.txt",60,1);

# Pre-compute one feature vector per line of the segmented goods file.  The
# arrays below are parallel, one slot per line:
#   @arr_in             text after the leading "<digits><space>" ('' when absent)
#   @arr_my_hash2       feature => weight hash returned by txt2arr3
#   @arr_my_hash2_val2  0.001 plus the sum of the positive weights of that hash
#   @arr_n_line         first token of the line
#   @arr_my_hash2_seq   sequence hash returned by txt2arr3
#   @arr_og2            origin hash returned by txt2arr3
$arr_index = 0;
if (open(my $goods_fh, '<', '/home/htc/nord/recommend/25k_sort_unique_half_qc_r'))
{
    while (my $in = <$goods_fh>)
    {
        chomp($in);
        # Push '' instead of a stale capture from an earlier line when the
        # pattern does not match, so the arrays stay aligned.
        push(@arr_in, ($in =~ /\d+\s(.*)/) ? $1 : '');

        my ($my_hash2, $n_line, $my_hash2_seq, $og2) = txt2arr3($in, $exp_hash);
        my $tmp_val2 = 0.001;
        foreach my $weight (values %{$my_hash2})
        {
            $tmp_val2 += $weight if $weight > 0;
        }
        push(@arr_my_hash2_val2, $tmp_val2);
        push(@arr_my_hash2, $my_hash2);
        push(@arr_n_line, $n_line);
        push(@arr_my_hash2_seq, $my_hash2_seq);
        push(@arr_og2, $og2);
        $arr_index++;
    }
    close($goods_fh);
}
else
{
    warn "cannot open /home/htc/nord/recommend/25k_sort_unique_half_qc_r: $!\n";
}

# @arr_last: the raw lines of 25k_sort_unique.
if (open(my $last_fh, '<', '25k_sort_unique'))
{
    while (my $line = <$last_fh>)
    {
        chomp($line);
        push(@arr_last, $line);
    }
    close($last_fh);
}
else
{
    warn "cannot open 25k_sort_unique: $!\n";
}

# @arr_r_index / @arr_r_index_len: one integer per line of the two index files.
if (open(my $index_fh, '<', '25k_sort_unique_index'))
{
    while (my $in = <$index_fh>)
    {
        chomp($in);
        push(@arr_r_index, int($in));
    }
    close($index_fh);
}
else
{
    warn "cannot open 25k_sort_unique_index: $!\n";
}

if (open(my $len_fh, '<', '25k_sort_unique_index_len'))
{
    while (my $in = <$len_fh>)
    {
        chomp($in);
        push(@arr_r_index_len, int($in));
    }
    close($len_fh);
}
else
{
    warn "cannot open 25k_sort_unique_index_len: $!\n";
}

# $hash_simple: set of the lines of 25k_simple with all whitespace removed.
if (open(my $simple_fh, '<', '25k_simple'))
{
    while (my $in = <$simple_fh>)
    {
        chomp($in);
        $in =~ s/\s+//g;
        $hash_simple->{$in} = 1 unless exists($hash_simple->{$in});
    }
    close($simple_fh);
}
else
{
    warn "cannot open 25k_simple: $!\n";
}

print "server start\n";

my $n = 0;

# TCP front end on port 21221.  Every client input is logged to engine.log,
# dispatched on its text, answered with the resulting list, and then the
# connection is shut down:
#   "reload..."   -> reloaddic(), empty answer
#   "... deep"    -> deep_article(input)
#   "<x> path"    -> mean_path(input) followed by deep_article_path(<x>)
#   anything else -> search_article_htc(input)
POE::Component::Server::TCP->new(
    Alias => "echo_server",
    Port  => 21221,

    ClientInput => sub {
        my ($kernel, $session, $heap, $input) = @_[KERNEL, SESSION, HEAP, ARG0];

        # FD_log is a shared bareword handle: the tokenisers print to it too.
        open(FD_log, ">/home/htc/nord/recommend/engine.log");
        print FD_log "$input\n";
        print "input file $input" . length($input) . "\n";

        my @res;
        if ($input =~ /^reload/)
        {
            reloaddic();
        }
        elsif ($input =~ /.* deep/)
        {
            # Deep analysis of indirect semantic links.
            @res = deep_article($input);
        }
        elsif ($input =~ /(.*) path/)
        {
            my $subject    = $1;
            my @by_meaning = mean_path($input);
            my @by_article = deep_article_path($subject);
            @res = (@by_meaning, @by_article);
        }
        else
        {
            @res = search_article_htc($input);
            print "\n I am search_htc++++++++++\n";
        }

        print FD_log @res;
        close(FD_log);

        $heap->{client}->put(@res);
        $kernel->yield("shutdown");
        return;
    },

    # Nothing to do when a client goes away.
    ClientDisconnected => sub {
    },
);
POE::Kernel->run;
# search_direct($file)
#
# Copy the first line of $file to a scratch file, run ./client_qc.pl on it
# and read the first line of the tool's output.  The whitespace-separated
# tokens of that line that are keys of $hash_simple are concatenated into a
# keyword, and a one-element list holding the search.jd.com URL for that
# keyword is returned.  An empty list is returned when a file cannot be opened
# or the tool produced no output.
#
# NOTE(review): the two scratch files under /home/htc/nord/recommend are left
# behind, as before.
sub search_direct
{
    my ($file) = @_;
    chomp($file);
    # The former two-argument open ignored surrounding whitespace in the name.
    $file =~ s/^\s+|\s+$//g;

    my $rand     = rand();
    my $raw_path = "/home/htc/nord/recommend/$rand.sctxt";
    my $seg_path = "/home/htc/nord/recommend/$rand.sctxt2";

    my ($in_fh, $raw_fh, $seg_fh);
    if (!open($in_fh, '<', $file))
    {
        warn "search_direct: cannot open $file: $!\n";
        return ();
    }
    my $first_line = <$in_fh>;
    close($in_fh);

    if (!open($raw_fh, '>', $raw_path))
    {
        warn "search_direct: cannot write $raw_path: $!\n";
        return ();
    }
    print {$raw_fh} $first_line if defined($first_line);
    close($raw_fh);

    # List form: the arguments never pass through a shell.
    system('./client_qc.pl', $raw_path, $seg_path) == 0
        or warn "search_direct: client_qc.pl failed (status $?)\n";

    if (!open($seg_fh, '<', $seg_path))
    {
        warn "search_direct: cannot open $seg_path: $!\n";
        return ();
    }
    my $segmented = <$seg_fh>;
    close($seg_fh);

    my @m_res_direct;
    if (defined($segmented))
    {
        chomp($segmented);
        my $keyword = join('', grep { exists($hash_simple->{$_}) } split(/\s+/, $segmented));
        push(@m_res_direct, "http://search.jd.com/Search?keyword=".uri_escape($keyword)."&enc=utf-8");
    }
    return @m_res_direct;
}
#$file=$ARGV[0];
#$poe_kernel->run();
# search_article_htc($file)
#
# Answer a query stored in $file:
#   1. the first line of $file is written as "0 <line>" to a scratch file and
#      segmented by ./client_qc.pl;
#   2. the first segmented line is turned into a feature vector with
#      txt2arr_input;
#   3. for every entry of @arr_r_index one candidate is drawn at random from
#      the range starting at that entry (minus one) and spanning the matching
#      @arr_r_index_len entry, and scored against the query with comp_hash;
#   4. the candidate with the highest score above 0.09 wins.
#
# Returns a one-element list "<record>\n<search URL>", where <record> is the
# @arr_last line selected by the winner's line number and the URL keyword is
# the text after the record's last '-' with whitespace removed.  Returns an
# empty list when a file cannot be opened, the segmenter produced nothing, or
# no candidate scored above the threshold (previously the last catalogue
# record was returned in that case because of a -1 array index).
#
# NOTE(review): the two scratch files under /home/htc/nord/recommend are left
# behind, as before.
sub search_article_htc
{
    my ($file) = @_;
    chomp($file);
    # The former two-argument open ignored surrounding whitespace in the name.
    $file =~ s/^\s+|\s+$//g;
    $exp_hash = ();

    my $rand     = rand();
    my $raw_path = "/home/htc/nord/recommend/$rand.sctxt";
    my $seg_path = "/home/htc/nord/recommend/$rand.sctxt2";

    my ($in_fh, $raw_fh, $seg_fh);
    if (!open($in_fh, '<', $file))
    {
        warn "search_article_htc: cannot open $file: $!\n";
        return ();
    }
    my $query = <$in_fh>;
    close($in_fh);

    if (!open($raw_fh, '>', $raw_path))
    {
        warn "search_article_htc: cannot write $raw_path: $!\n";
        return ();
    }
    if (defined($query))
    {
        chomp($query);
        # The leading "0" is the line number the tokenisers expect as first token.
        print {$raw_fh} "0 $query";
    }
    close($raw_fh);

    # List form: the arguments never pass through a shell.
    system('./client_qc.pl', $raw_path, $seg_path) == 0
        or warn "search_article_htc: client_qc.pl failed (status $?)\n";

    if (!open($seg_fh, '<', $seg_path))
    {
        warn "search_article_htc: cannot open $seg_path: $!\n";
        return ();
    }
    # Only the first segmented line ever contributed to the answer.
    my $segmented = <$seg_fh>;
    close($seg_fh);
    return () unless defined($segmented);
    chomp($segmented);

    my ($query_vec) = txt2arr_input($segmented, $exp_hash);

    # At most 5190 index entries are sampled, and never more than were loaded.
    my $last_group = $#arr_r_index < 5189 ? $#arr_r_index : 5189;
    my %score_of;
    foreach my $group (0 .. $last_group)
    {
        my $span    = $arr_r_index_len[$group] || 0;
        my $i_index = ($arr_r_index[$group] - 1) + int(rand($span));
        next if $i_index < 0 || $i_index > $#arr_my_hash2;

        my $score = comp_hash(
            $query_vec, $arr_my_hash2[$i_index],
            undef,      $arr_my_hash2_seq[$i_index],
            undef,      $arr_og2[$i_index],
            $arr_my_hash2_val2[$i_index]
        );
        $score_of{ $arr_n_line[$i_index] } = $score if $score > 0.09;
    }

    my @ranked = sort { $score_of{$b} <=> $score_of{$a} } keys %score_of;
    return () unless @ranked;

    my $record  = $arr_last[ $ranked[0] - 1 ];
    my @parts   = split("-", $record);
    my $keyword = $parts[-1];
    $keyword =~ s/\s//g;

    return ("$record\n"."http://search.jd.com/Search?keyword=".uri_escape($keyword)."&enc=utf-8");
}

# comp_hash($h1, $h2, $h3, $h4, $org1, $org2, $h2_val2)
#
# Similarity of two feature => weight hashes.  Only features present in both
# hashes whose weight in $h1 is not negative contribute.  With
#   vall = 0.001 + sum of the positive weights of $h1
# each such feature adds the mean of
#   (w1 + w2) / (vall + h2_val2)               and
#   (w1 / vall + w2 / h2_val2) / 2
# to the returned score.
#
#   $h1, $h2      - hash references (an undefined value counts as empty)
#   $h3, $h4,
#   $org1, $org2  - accepted for compatibility, not used
#   $h2_val2      - 0.001 + sum of the positive weights of $h2; computed here
#                   when missing or 0 (this used to end in a division by zero)
sub  comp_hash
{
    my ($h1, $h2, $h3, $h4, $org1, $org2, $h2_val2) = @_;
    $h1 ||= {};
    $h2 ||= {};

    my $vall = 0.001;
    foreach my $weight (values %{$h1})
    {
        $vall += $weight if $weight > 0;
    }

    if (!defined($h2_val2) || $h2_val2 == 0)
    {
        $h2_val2 = 0.001;
        foreach my $weight (values %{$h2})
        {
            $h2_val2 += $weight if $weight > 0;
        }
    }

    my $akv = 0;
    foreach my $feature (keys %{$h1})
    {
        next unless exists($h2->{$feature});
        next if $h1->{$feature} < 0;

        my $w1 = $h1->{$feature};
        my $w2 = $h2->{$feature};
        # Pooled ratio and mean of the per-side ratios, averaged.
        my $kv  = ($w1 + $w2) / ($vall + $h2_val2);
        my $kv2 = (($w1 / $vall) + ($w2 / $h2_val2)) / 2;
        $akv += ($kv + $kv2) / 2;
    }

    return $akv;
}

# ndate()
#
# Current local time formatted as "YYYY-MM-DD HH:MM:SS" (all fields zero
# padded).  The year is derived as tm_year + 1900 rather than by textual
# substitution on the two/three digit value, and no package globals are
# touched.
sub ndate
{
    my ($sec, $min, $hour, $mday, $mon, $year) = localtime(time());
    return sprintf("%04d-%02d-%02d %02d:%02d:%02d",
        $year + 1900, $mon + 1, $mday, $hour, $min, $sec);
}

# txt2arr_one($spec, $exp_h)
#
# $spec is a comma-separated list of "<word> <value>" entries (split at the
# first space).  For every entry each key of $hash->{word} is mapped to
# <value>, and the word itself is mapped to itself.  Entries without a space
# print "input error".  The result is returned as a flat key/value list.
#
#   $exp_h - accepted for compatibility, not used
#
# All working state is lexical: the function no longer clears the package
# array @res or overwrites other package scalars.
sub txt2arr_one
{
    my ($spec, $exp_h) = @_;
    my %expanded;

    foreach my $entry (split(",", $spec))
    {
        if ($entry =~ /(.*?) (.*)/)
        {
            my ($word, $value) = ($1, $2);
            if (exists($hash->{$word}))
            {
                $expanded{$_} = $value for keys %{ $hash->{$word} };
            }
            # Assigned last, so the word's own entry is the word itself even
            # when the dictionary lists it as a key.
            $expanded{$word} = $word;
        }
        else
        {
            print "input error\n";
        }
    }

    return (%expanded);
}
# txt2arr_input($input_t, $exp_h)
#
# Build the feature vector of a segmented query line.  The line is split on
# whitespace; the first token is returned unchanged as the line id.  Every
# further token is skipped when it is shorter than 3, is a stop word
# ($hash_st), or starts with a digit or '-'.  For a kept token W:
#   * if the next token starts with a digit and the whole input is shorter
#     than 10, that number N is an explicit weight: W gains 2*N and the
#     expansion factor is N;
#   * else if the next token starts with '-', the token after it (M) is a
#     negative weight: W loses 2*M and the factor is -M;
#   * otherwise W gains 2 (unless its weight already exceeds a tenth of the
#     token count, in which case the token is skipped) and gets the next
#     sequence number.
# Then every concept of $hash->{W} gains weight * factor; a word without a
# dictionary entry gains 0.1 instead.
#
#   $exp_h - accepted for compatibility, not used
#
# Returns (\%weights, $line_id, \%sequence, \%origin), where %origin maps a
# feature to the set of input words that produced it.  The input line is also
# printed to the FD_log handle.
sub txt2arr_input
{
    my ($input_t, $exp_h) = @_;
    my @list = split(" ", $input_t);
    print FD_log  "$input_t\n";

    my (%weight_of, %seq_of, %origin_of);
    my $nline;
    my $sequ     = 0;
    my $list_len = scalar(@list);

    foreach my $pos (0 .. $list_len - 1)
    {
        my $token = $list[$pos];
        if ($pos == 0)
        {
            $nline = $token;
            next;
        }

        # NOTE(review): the original condition also tested the *length*
        # against /^[a-z|A-Z]/, which can never match, so short tokens were
        # always dropped.  That behaviour is kept so query vectors stay
        # comparable with the corpus vectors built by txt2arr3.
        next if length($token) < 3;
        next if exists($hash_st->{$token});
        next if $token =~ /^\d/;
        next if $token =~ /^-/;

        my $zs         = 1;
        my $next_token = $list[$pos + 1];
        if (defined($next_token) && $next_token =~ /^\d/ && length($input_t) < 10)
        {
            $weight_of{$token} += 2 * $next_token;
            $zs = $next_token;
        }
        elsif (defined($next_token) && $next_token =~ /^-/)
        {
            $weight_of{$token} -= 2 * $list[$pos + 2];
            $zs = -$list[$pos + 2];
        }
        else
        {
            # Limit how often a repeated word is counted.
            next if $weight_of{$token} > ($list_len / 10);
            $weight_of{$token} += 2;
            $origin_of{$token}->{$token} = 1;
            $seq_of{$token} = $sequ;
            $sequ++;
        }

        if (exists($hash->{$token}))
        {
            foreach my $concept (keys %{ $hash->{$token} })
            {
                $weight_of{$concept} += $hash->{$token}->{$concept} * $zs;
                $origin_of{$concept}->{$token} = 1;
                $seq_of{$concept} = $sequ;
            }
        }
        else
        {
            $weight_of{$token} += 0.1;
            $origin_of{$token}->{$token} = 1;
            # Keyed by the word itself, as txt2arr3 does; this used to be
            # keyed by a stale loop variable.
            $seq_of{$token} = $sequ;
        }
    }

    return (\%weight_of, $nline, \%seq_of, \%origin_of);
}


# txt2arr3($input_t, $exp_h)
#
# Build the feature vector of one segmented corpus line.  The line is split
# on whitespace; the first token is returned unchanged as the line id.  Every
# further token is skipped when it is shorter than 3, is a stop word
# ($hash_st), or already weighs more than a tenth of the token count.  A kept
# token gains 2 and the next sequence number; then every concept of
# $hash->{token} gains its dictionary weight, or, when the token has no
# dictionary entry, the token gains another 0.1.
#
#   $exp_h - accepted for compatibility, not used
#
# Returns (\%weights, $line_id, \%sequence, \%origin), where %origin maps a
# feature to the set of input words that produced it.  The input line is also
# printed to the FD_log handle.
#
# Planned improvement noted by the original author: detect the text domain
# first and rewrite the word-vector values per domain (e.g. dependency rules
# from a jargon dictionary), overlaying the vectors with clone.
#
# All working state is lexical: the function no longer clears the package
# array @res or overwrites other package scalars.
sub txt2arr3
{
    my ($input_t, $exp_h) = @_;
    my @list = split(" ", $input_t);
    print FD_log  "$input_t\n";

    my (%weight_of, %seq_of, %origin_of);
    my $nline;
    my $seq        = 0;
    my $repeat_cap = scalar(@list) / 10;
    my $is_first   = 1;

    foreach my $token (@list)
    {
        if ($is_first)
        {
            $nline    = $token;
            $is_first = 0;
            next;
        }

        # NOTE(review): the original condition also tested the *length*
        # against /^[a-z|A-Z]/, which can never match, so short tokens were
        # always dropped.  That behaviour is kept unchanged.
        next if length($token) < 3;
        next if exists($hash_st->{$token});

        # Keep the original word and limit how often a repeat is counted.
        next if $weight_of{$token} > $repeat_cap;
        $weight_of{$token} += 2;
        $origin_of{$token}->{$token} = 1;
        $seq_of{$token} = $seq;
        $seq++;

        if (exists($hash->{$token}))
        {
            foreach my $concept (keys %{ $hash->{$token} })
            {
                $weight_of{$concept} += $hash->{$token}->{$concept};
                $seq_of{$concept} = $seq;
                $origin_of{$concept}->{$token} = 1;
            }
        }
        else
        {
            $weight_of{$token} += 0.1;
            $origin_of{$token}->{$token} = 1;
            $seq_of{$token} = $seq;
        }
    }

    return (\%weight_of, $nline, \%seq_of, \%origin_of);
}

# log2($n): base-2 logarithm of $n, computed from the natural logarithm.
sub log2 {
    my ($value) = @_;
    my $natural = log($value);
    return $natural / log(2);
}



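# Change log
# 4-9:  added support for stop.txt, which also contains some punctuation marks
#       jieba segmentation: fixed a bug in file splitting
#       jieba segmentation: fully in-memory invocation over multiple sockets; high-concurrency testing pending
#       added client_qc.pl for data exchange between the classification engine and the segmentation engine
#       added a number of words to the dictionary files
#       planned: revise the domain vectors (co-occurrence scaling) for every target / text-under-test pair, with N domain dictionaries
# 4-12: fixed punctuation in the dictionaries and removed some single characters
#       tested the natural logarithm as a similarity measure
# 4-14: fix: the reload command now also restarts the segmentation system
# 4-20: comp_hash compares the sequences of similar features; closer sequences count as more similar
#       added the synonym dictionary
# 4-23: adjusted single-character support; fixed a stop-word list bug; fixed a bug with "----" in the text

# 4-29: fixed a bug where digits in the input article affected the weights
# 4-30: added a POST data interface; added the rewrite function for co-occurring phrases
#       added a limit on repeated words: at most one is counted per 20 words
#       added various brand names
#       vector space reduced to 60
#       added deep-meaning analysis of the target sentence
#       planned: maximal-association analysis of entity paths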


# Powered by Groonga
# Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.