# zhangbo-NLP-plugin_eng/lib/zhangbo/NLP/engine_k5.pl
#!/usr/bin/perl
use Algorithm::Kmeanspp;
#use JSON::XS;
use Clone qw(clone);
use MongoDB;
use Smart::Comments;
use lib "/home/wuyabo/shell/";
use Conn_mongo_jc;
use Data::Dumper;
use IO::File;
#use Add_info;
# Load the stopword list (stop.txt, one word per line) into the
# file-global set $hash_st (word => 1).  The set is consulted by the
# word.txt loader and by txt2arr() to drop noise terms.
# A missing or unreadable file just leaves the set empty (a warning is
# emitted instead of dying, matching the original best-effort behavior).
our $hash_st;
if (open(my $fdst, '<', 'stop.txt')) {    # 3-arg open, checked
    while (my $stopword = <$fdst>) {
        chomp($stopword);
        $hash_st->{$stopword} = 1;
    }
    close($fdst);
}
else {
    warn "cannot open stop.txt: $!\n";
}
# http://poe.perl.org/?POE_Cookbook/TCP_Servers
# Include POE and POE::Component::Server::TCP.
#
# Global lexicon state:
#   $hash    - word => { concept => weight }, built from word.txt below.
#   $hash_mn - word => count of concepts recorded for that word.
#   $hash_sc - per-term dictionary entry from dict.utf8.txt:
#              i = first score column, h = second score column (used as
#              the term weight), a = part-of-speech tags.
our $hash = {};
$hash_mn = {};
my $hash_sc = {};
# dict.utf8.txt lines are tab separated: "term\tscore1\tscore2\tpos",
# e.g. "招收学员\t13.87\t8.50\tn".  Non-matching lines are skipped.
if (open(my $fd_sc, '<', 'dict.utf8.txt')) {    # 3-arg open, checked
    while (my $line = <$fd_sc>) {
        if ($line =~ /(.*?)\t(.*?)\t(.*?)\t(.*)/) {
            my ($term, $score1, $score2, $pos) = ($1, $2, $3, $4);
            $hash_sc->{$term}->{"i"} = $score1;
            $hash_sc->{$term}->{"h"} = $score2;
            $hash_sc->{$term}->{"a"} = $pos;
        }
    }
    close($fd_sc);
}
else {
    warn "cannot open dict.utf8.txt: $!\n";
}
# Load the HowNet-style lexicon (word.txt) into the global map
# $hash: word => { concept => weight }, and $hash_mn: word => tally of
# concepts recorded.  The file alternates lines of the form
#   W_C=<word>
#   DEF={Concept|概念:role={...},...}
# Concepts are pulled out of the DEF line token by token; weights come
# from the scws dictionary ($hash_sc) when the concept is known there,
# with position-dependent damping, otherwise default to 1.
# $r is reset per DEF line and incremented per extracted concept, so
# the first concept keeps full weight and later ones are damped.
open(FD, "word.txt");
my $tmp=();
while(<FD>)
{
#DEF={Unit|单位:host={information|信息:belong={computer|电脑}}}
#DEF={tool|用具:modifier={able|能:scope={bring|携带:content={$}}},{listen|听:content={music|音乐},instrument={~}}}
$line=$_;
chomp($line);
if($line=~/^W_C=(.*)/)
{
# New headword.
$tmp=$1;
#print $tmp,"\n";
$r=1;
}
elsif($line=~/^DEF=(.*)/)
{
$r=0;
$tmp_m=$1;
# print $tmp_m,"\n";
# Tokens are '|'-separated; each carries an English concept name
# terminated by ':' (role-bearing) or '}' (leaf).
my @arry=split('\|',$tmp_m);
foreach $tmps (@arry)
{
# print "-----------------$tmps ------------\n";
if($tmps=~/(.*?)\:/)
{
my $m=$1;
# Skip stopword concepts and the placeholder "专".
if(exists($hash_st->{$m}))
{next;}
if($m eq "专")
{next;}
# print "$tmp - $m-----\n";
if(exists($hash_sc->{$m}))
{
# First concept of the DEF ($r==0) keeps the full dictionary
# weight; later ones are damped to 0.2x.
if($r==0)
{
$hash->{$tmp}->{$m}=$hash_sc->{$m}->{"h"};
}
else{ $hash->{$tmp}->{$m}=$hash_sc->{$m}->{"h"}*0.2;}
#$hash->{$tmp}->{$m}=$hash_sc->{$m}->{"h"};
# Floor near-zero weights.
if($hash->{$tmp}->{$m}<0.01){
$hash->{$tmp}->{$m}=0.1;}
# print FD_log $hash->{$tmp}->{$m}," $m\n";
}
else
{
# Concept unknown to the scws dictionary: default weight 1.
$hash->{$tmp}->{$m}=1;
# print FD_log $hash->{$tmp}->{$m}," $m\n";
}
$hash_mn->{$tmp}+=1;
$r++;
}
elsif($tmps=~/(.*?)\}/)
{
# Leaf concept (token ends with '}').
my $m=$1;
if($m eq "专")
{next;}
if(exists($hash_st->{$m}))
{next;}
if($r==0)
{
# Leading leaf concept: full dictionary weight (floor 0.2).
if(exists($hash_sc->{$m}))
{
$hash->{$tmp}->{$m}=$hash_sc->{$m}->{"h"};
if($hash->{$tmp}->{$m}<0.01){
$hash->{$tmp}->{$m}=0.2;}
# print FD_log $hash->{$tmp}->{$m}," $m\n";
}
else
{
$hash->{$tmp}->{$m}=1;
# print FD_log $hash->{$tmp}->{$m}," $m\n";
}
$hash_mn->{$tmp}+=1;
}
else
{
# Non-leading leaf concepts: half weight, and only 0.5 toward
# the concept tally.
if(exists($hash_sc->{$m}))
{
$hash->{$tmp}->{$m}=$hash_sc->{$m}->{"h"}*(0.5);
if($hash->{$tmp}->{$m}<0.01){
$hash->{$tmp}->{$m}=0.2;}
# print FD_log $hash->{$tmp}->{$m}," $m\n";
}
else
{
$hash->{$tmp}->{$m}=1;
# print FD_log $hash->{$tmp}->{$m}," $m\n";
}
$hash_mn->{$tmp}+=0.5;
}
$r++;
}
}
}
}
#i#close FD_log;
# --- TCP request server ----------------------------------------------
# Listens on port 11214.  Each client sends one request line: a file
# path plus options.  A request containing " gra" is routed to
# search_article2() (pairwise similarity); anything else goes to
# search_article() (bayon clustering).  Result lines are written back
# to the client, then the per-client session is shut down (one request
# per connection).
print "server start\n";
use POE qw(Component::Server::TCP);
my $n=0;
#our $f=0;
POE::Component::Server::TCP->new(
Alias => "echo_server",
Port => 11214,
ClientInput => sub {
my ($session, $heap, $input) = @_[SESSION, HEAP, ARG0];
#print "Session ", $session->ID(), " got input: ".$input."\n";
#print "Session ", $session->ID(), " got input: ".length($input)."\n";
my $d;
#open(FD_log,">/tmp/cluster.log");
#print FD_log "$input\n";
#system("cp $input /tmp/");
print "input file $input\n";
my @res;
# "<request> gra" selects the graph/pairwise mode.
if($input=~/(.*?)\ gra/)
{
@res=search_article2($1);
}
else{
@res=search_article($input);
}
#print FD_log @res;
#close(FD_log);
#print scalar(@res)," num\n";
$heap->{client}->put(@res);
#$session->yield("shutdown");
# One request per connection: shut the session down after replying.
$_[KERNEL]->yield("shutdown");
return;
},
ClientDisconnected => sub {
#print "Client disconnected\n"; # log it
}
#, ClientFlushed => sub {
# my $data_source = $_[HEAP]{file_handle};
# my $read_count = sysread($data_source, my $buffer = "", 65536);
# if ($read_count) {
# $_[HEAP]{client}->put($buffer);
# }
# else {
# print FD_log "------------------error\n";
# $_[KERNEL]->yield("shutdown");
# }
#}
);
POE::Kernel->run;
#$input="坦克,我的希望非常诱人电视里有主持人,那里有大熊猫";
# $input="网络和网民的意见,现在正行驶在通州回北京的高速公路上,十评论员单仁平的文章,标题是“做大众政治焦点,茅于轼的选择”。这篇文章的核心意见是,茅于轼应该做中国社会团结的促进者,不应该";
#$input="致力于宣传 市场万能 剥削有理 汉奸人性 保钓无用 保粮错误替富人说话的茅于轼,今天下午两点在北京海淀翠宫饭店演讲顽强继续。未知海淀区委书记隋振江,宣传部长陈名杰是否到场。外媒问我是否到场,告曰:先参加央视《苦难辉煌》座谈会,或会晚到一会儿。今天要长见识了";
# $input2="茅于轼刚刚吃完胡辣汤和烧饼夹猪头肉。现在正行驶在通州回北京的高速公路上,十五分钟以后要开始腾讯微访谈直播,我和方舟子拟就昨天的热点问题,回答网友的提问。敬请各位网友提示一下,昨天有什么热点问题需要谈一谈?";
#$file=$ARGV[0];
sub search_article
{
# Cluster the articles in a file into $num_cluster groups using the
# external `bayon` clustering tool.
# $_[0] is the request string: "<file>", "<file> <k>", or
# "<file> <k> <zoom words...>" where the trailing words are
# user-supplied emphasis terms.
# Returns a list of result lines: one membership line per cluster, a
# "----------------" separator, then up to 4 "term-term" label pairs
# per cluster.  On error returns a single message string.
$file=$_[0];
chomp($file);
my $num_cluster = 5;
my $zoom;
my $zhash;
# Split the request into file name, cluster count, and optional
# emphasis ("zoom") word list.
if($file=~/(.*?) (\d+?) (.*)/)
{
$file=$1; $num_cluster =$2;$zoom=$3;
}
elsif($file=~/(.*?) (\d+)/)
{$file=$1; $num_cluster =$2;}
print "$file $num_cluster zoom $zoom\n";
#print "file=$file\n";
my @zlist;
@zlist=split(" ",$zoom);
# Convert each zoom word to features and merge them into $zhash, which
# txt2arr() uses later to boost matching terms by +0.8.
foreach my $zw (@zlist)
{
my $inzw="0 ".$zw;
my ($zash,$tt_line)=txt2arr($inzw);
#$hashleft[$n_line]=clone(\$my_hash2);
my $k,$v;
while(($k,$v)=each(%{$zash}))
{
$zhash->{$k}=$v;
}
}
print Data::Dumper->Dump([$zhash]);
my $f=0;
my $n=0;
my $kk;
my $thash=();
if(! -e "$file")
{return "file is't exist\n"};
# There must be strictly more articles than clusters.
my @input_ar=`cat $file`;
if(scalar(@input_ar)<=$num_cluster)
{
return "input article must more then class number\n";
}
my %my_hash;
my %my_hash2;
my %thash;
my $j;
#my $rand2=rand();
#open(FD_hash,">$file.txt");
#print "openfile $file\n";
# Copy the input to /tmp/<rand>.sctxt, prefixing each line with its
# 1-based line number so results can be mapped back to articles.
open(FD_arr,"<$file");
my $first=0;
my $rand=rand();
open(FD_aro,">/tmp/$rand.sctxt");
my $j=1;
while(<FD_arr>)
{
#print FD_log $j,"----------\n";
#print $j,"----------\n";
my $in=$_;
chomp($in);
# if(length($in)>500)
# { $in=substr $in,0,500;$in.=".";}
# $in=~s/\@.*?\:/ /g;
# $in=~s/\@.*?\s/ /g;
# $in=~s/机场|航班|深圳/ /g;
print FD_aro "$j ",$in,"\n";
$j++;
#print FD_log $j,"-p----",length($in),"\n";
#print $j,"-p----",length($in),"\n";
}
close FD_arr;
close FD_aro;
# Run the scws Chinese word segmenter over the numbered file.
system("/home/wuyabo/xs/bin/scws -I -E -N -d /home/lzj/shell/dict.xdb -c utf-8 /tmp/$rand.sctxt -o /tmp/$rand.sctxt2");
unlink("/tmp/$rand.sctxt");
#print "/tmp/$rand.sctxt\n";
open(FD_scws,"/tmp/$rand.sctxt2");
my @hashleft;
my $hashleft;
my $alln=1;
my $sy_list;
# my $kmp = Algorithm::Kmeanspp->new;
my $itn=0;
my $sy_list;
# Build one feature vector per segmented article; $sy_list counts the
# document frequency of each feature so singletons can be dropped.
while(<FD_scws>)
#foreach my $in (@output)
{
my $in=$_;
# NOTE(review): FD_log is never opened (its open() is commented out at
# the top of the file), so this print goes nowhere.
print FD_log "line ttt $kk $in \n";
chomp($in);
#$input_ar[$f]=$in;
my $n_line;
my ($my_hash2,$n_line)=txt2arr($in,$zhash);
#$hashleft[$n_line]=clone(\$my_hash2);
$hashleft[$n_line]=clone($my_hash2);
# print Data::Dumper->Dump([$my_hash2]);
#print Data::Dumper->Dump([$hashleft[$n_line]]);
my $k,$v;
while(($k,$v)=each(%{$my_hash2}))
{
if(exists($sy_list->{$k}))
{$sy_list->{$k}++;}
else
{$itn++;$sy_list->{$k}=1;}
# $hashleft[$n_line]->{$k}=$v;
}
$alln++;
}
my $itnn=$itn;
# Write bayon's input file: one "<id>\t<feature>\t<weight>..." line per
# article.  Features occurring in fewer than 2 documents are dropped.
open (FD,">/tmp/$rand.ck");
for(1 .. scalar(@hashleft))
{
# NOTE(review): indices start at 1 because article numbers are 1-based
# (slot 0 is unused); the final iteration addresses one slot past the
# filled range, which only emits an empty feature line.
my $ttn=$_;
my $k,$v;
my $inp;
print FD "$ttn";
while(($k,$v)=each(%{$hashleft[$ttn]}))
{
if($sy_list->{$k}<2)
{
# $hashleft[$ttn]->{$k}=0;
# print "remove $k\n";
#delete($hashleft[$ttn]->{$k});
#delete($hashleft[$ttn]->{$k});
# print FD "\t".$k."\t".$hashleft[$ttn]->{$k};
# $itnn--;
}
else{
# print "has $k\n";
# $inp->{$k}=$hashleft[$ttn]->{$k}*10;
print FD "\t".$k."\t".$hashleft[$ttn]->{$k};
}
}
print FD "\n";
#$kmp->add_document($ttn, $inp);
}
close FD;
my $num_iter = $itnn;
# $kmp->do_clustering($num_cluster, $num_iter);
# Run bayon: cluster membership on stdout, cluster centers to .ckc.
print "bayon -n $num_cluster -c /tmp/$rand.ckc /tmp/$rand.ck ";
my @m_ress=`bayon -n $num_cluster -c /tmp/$rand.ckc /tmp/$rand.ck `;
my @m_res;
# First result section: per-cluster membership (leading id stripped).
foreach my $mss (@m_ress)
{
$mss=~/(.*?)\t(.*)/;
my $m2=$2;
push (@m_res,$m2);
}
push @m_res,"----------------\n";
my $cenhash;
# Count how many cluster centers each term appears in; terms present
# in more than half the clusters are considered unspecific and are
# skipped as labels below.
my @mm_res=`cat /tmp/$rand.ckc`;
foreach my $ttmm (@mm_res)
{
my @sttm=split("\t",$ttmm);
foreach my $tsttm(@sttm)
{
$cenhash->{$tsttm}++;
}
}
# Second result section: up to 4 "term-term" labels per cluster.
foreach my $ttmm (@mm_res)
{
my @sttm=split("\t",$ttmm);
#delete hot word from the result;
# for( 0 .. 6 )
# {
# my $tna=$_;
# if(exists($zhash->{$sttm[$tna]}))
# {
# delete($sttm[$tna]);
# delete($sttm[$tna+1]);
# }
# }
my $hn=0;
my $stt;
for(0 .. (scalar(@sttm)-1))
{
my $nown=$_;
# Skip numeric fields (cluster id and feature weights).
if($sttm[$nown]=~/\d/)
{next;}
if(($hn==0)&&($cenhash->{$sttm[$nown]} <= ($num_cluster/2) ))
{
$stt=$sttm[$nown]."-".$sttm[$nown];
$hn++;
}
elsif(($hn <4)&&($cenhash->{$sttm[$nown]} <= ($num_cluster/2) ))
{
$stt=$stt."\t".$sttm[$nown]."-".$sttm[$nown];
$hn++;
}
}
# my $stt=$sttm[1]."-".$sttm[2]."\t".$sttm[3]."-".$sttm[4]."\t".$sttm[5]."-".$sttm[6]."\t".$sttm[7]."-".$sttm[8];
push (@m_res,$stt);
}
#my @key =sort {$cluster->{$b} <=> $cluster->{$a}} keys %{$cluster};
# print "$itn $itnn\n";
unlink("/tmp/$rand.sctxt2");
close(FD_arr);
# Implicit return of the assembled result lines.
@m_res;
}
# }
# );
#$poe_kernel->run();
sub txt2arr
{
    # Turn one scws-segmented line into a sparse feature vector.
    #
    # Arguments:
    #   $_[0] - "<line-number> term1 term2 ..." (space separated; the
    #           first token is the 1-based article/line number).
    #   $_[1] - optional hashref of user-supplied emphasis words whose
    #           feature weights get boosted by +0.8.
    # Returns: (\%feature_vector, $line_number) where the vector maps
    #           concept => weight.
    my $input_t  = $_[0];
    my $exp_h    = $_[1];
    my $tmp_hash = {};
    my @list = split(" ", $input_t);

    # Track the strongest noun and verb terms seen in the line so their
    # feature weights can be doubled afterwards (topic amplification).
    # NOTE(review): after the swap below, the running maximum lives in
    # ($n_b,$n2)/($v_b,$v2) and the previous maximum in ($n_a,$n1)/
    # ($v_a,$v1); both pairs are used for the boost — kept as original.
    my ($n_a, $n_b, $v_a, $v_b) = (0, 0, 0, 0);
    my ($n1, $n2, $v1, $v2);
    my $nline;
    my $tmp_first = 0;

    foreach my $tmp_res (@list)
    {
        # The first token is the line number, not a term.
        if ($tmp_first == 0)
        {
            $nline = $tmp_res;
            $tmp_first++;
            next;
        }
        # Skip stopwords.
        next if exists($hash_st->{$tmp_res});
        # Skip very short tokens (<= 3 bytes; one UTF-8 CJK character is
        # 3 bytes, so this drops single-character words).
        my $l = length($tmp_res);
        next if $l <= 3;

        # Topic amplification bookkeeping: remember the heaviest nouns
        # and verbs according to the scws dictionary.
        if (exists($hash_sc->{$tmp_res}))
        {
            if ($hash_sc->{$tmp_res}->{'a'} =~ /n/)
            {
                if ($hash_sc->{$tmp_res}->{'h'} > $n_a)
                {
                    $n_a = $hash_sc->{$tmp_res}->{'h'};
                    $n1  = $tmp_res;
                    if ($n_a > $n_b)
                    {
                        # Keep the larger weight in ($n_b, $n2).
                        my $swtmpn  = $n_b;
                        my $swtmpnw = $n2;
                        $n_b = $n_a;    $n2 = $n1;
                        $n_a = $swtmpn; $n1 = $swtmpnw;
                    }
                }
            }
            elsif ($hash_sc->{$tmp_res}->{'a'} =~ /v/)
            {
                if ($hash_sc->{$tmp_res}->{'h'} > $v_a)
                {
                    $v_a = $hash_sc->{$tmp_res}->{'h'};
                    $v1  = $tmp_res;
                    # (The original printed a debug line to the
                    # never-opened FD_log handle here; removed.)
                    if ($v_a > $v_b)
                    {
                        my $swtmpn  = $v_b;
                        my $swtmpnw = $v2;
                        $v_b = $v_a;    $v2 = $v1;
                        $v_a = $swtmpn; $v1 = $swtmpnw;
                    }
                }
            }
        }

        my $w = $tmp_res;
        if (exists($hash->{$w}))
        {
            # Known lexicon entry: spread a small weight over each of
            # the word's concepts, skipping stopword concepts.
            # BUGFIX: each() on a hash reference was an experimental
            # feature and is fatal on Perl >= 5.24; dereference
            # explicitly with %{...}.
            while (my ($k, $v) = each(%{ $hash->{$w} }))
            {
                next if exists($hash_st->{$k});
                $tmp_hash->{$k} += 0.1;
            }
        }
        elsif (exists($hash_sc->{$w}))
        {
            # In the scws dictionary but not in the lexicon map.
            $tmp_hash->{$w} += 0.2;
        }
        else
        {
            # Unknown word: fixed small weight.
            $tmp_hash->{$w} = 0.1;
        }
    }

    # Double the weight of detected topic words, and boost any
    # user-supplied emphasis words.
    while (my ($k, $v) = each(%{$tmp_hash}))
    {
        if (($k eq $n1) || ($k eq $n2) || ($k eq $v1) || ($k eq $v2))
        {
            $tmp_hash->{$k} = $v * 2;
        }
        if (exists($exp_h->{$k}))
        {
            $tmp_hash->{$k} += 0.8;
        }
    }
    return ($tmp_hash, $nline);
}
#print "output = $res\n";
# $heap->{client}->stop();
# $session->stop();
# Start the server.
sub comp_hash
{
    # Similarity score between two feature vectors (hashrefs of
    # term => weight) as produced by txt2arr().  For every term present
    # in BOTH vectors, the score averages a joint-mass ratio and the
    # mean of the per-vector normalized weights; the per-term scores are
    # summed.  Returns 0 when the vectors share no terms.
    #
    # Fixes over the original: $_[0]/$_[1] instead of @_[0] slices,
    # lexical loop variables (the original redeclared "my $k" and left
    # $v/$kv/$kv2/$kv3 as globals), and the unused %all was removed.
    my $h1 = $_[0];
    my $h2 = $_[1];
    my $akv = 0;
    my $all = {};           # term => number of vectors containing it
    my $vall  = 0.01;       # total mass of $h1 (0.01 smoothing avoids
    my $vall2 = 0.01;       # division by zero for empty vectors)
    while (my ($k, $v) = each(%{$h1})) {
        $all->{$k}++;
        $vall += $v;
    }
    while (my ($k, $v) = each(%{$h2})) {
        $all->{$k}++;
        $vall2 += $v;
    }
    while (my ($k, $v) = each(%{$all})) {
        if ($v >= 2) {      # term occurs in both vectors
            my $kv  = ($h1->{$k} + $h2->{$k}) / ($vall + $vall2);
            my $kv2 = ((($h1->{$k}) / $vall) + ($h2->{$k} / $vall2)) / 2;
            my $kv3 = ($kv + $kv2) / 2;
            $akv += $kv3;
        }
    }
    return $akv;
}
sub ndate
{
    # Current local time formatted as "YYYY-MM-DD HH:MM:SS".
    # The time fields stay in the same package globals the original
    # assigned, in case other code reads them.
    ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst)=localtime(time());
    # BUGFIX: localtime() returns years since 1900.  The original did
    # $year =~ s/1(.*)/20$1/ which only produces correct years for
    # 2000-2099 (e.g. "1125" for 3025 would become "20125"); adding
    # 1900 is correct for every year and identical this century.
    $year += 1900;
    $mon  += 1;                         # localtime months are 0-based
    # sprintf zero-pads every field, replacing the five regex checks.
    my $res = sprintf("%04d-%02d-%02d %02d:%02d:%02d",
                      $year, $mon, $mday, $hour, $min, $sec);
    return $res;
}
sub txt2arr_one
{
    # Build a feature hash from a comma-separated "word weight" list,
    # e.g. "foo 3,bar 5".  For each word known to the lexicon map
    # $hash, every concept of the word gets the supplied weight; the
    # word itself is also recorded with itself as the value (kept as in
    # the original).  Items without a space print "input error" and are
    # skipped.  Returns the flattened hash (list of key/value pairs).
    my $w     = $_[0];
    my $exp_h = $_[1];    # accepted for call compatibility; unused
    my $tmp_hash = {};
    my @w_arr = split(",", $w);
    foreach my $w_tmp (@w_arr)
    {
        if ($w_tmp =~ /(.*?) (.*)/)
        {
            my $word = $1;
            my $vh   = $2;
            if (exists($hash->{$word}))
            {
                # BUGFIX: each() on a hash reference is fatal on Perl
                # >= 5.24; dereference explicitly.  (The original also
                # rewrote $v==0 to 1 inside this loop, but $v was never
                # used afterwards — dead code, removed.)
                while (my ($k, $v) = each(%{ $hash->{$word} }))
                {
                    $tmp_hash->{$k} = $vh;
                }
            }
            $tmp_hash->{$word} = $word;
        }
        else
        {
            print "input error\n";
        }
    }
    return (%{$tmp_hash});
}
sub search_article2
{
# Pairwise-similarity ("gra") mode: segment each article in the file
# with scws, build a feature vector per article, score every article
# pair with comp_hash(), and return "n1-n2-score" lines sorted by
# descending score (score is scaled x100).
$file=$_[0];
chomp($file);
#print "file=$file\n";
# NOTE(review): the emphasis word list comes from the command line
# ($ARGV[1]), not from the client request — in server context this is
# normally undef; confirm whether it should be parsed from the input.
$expword=$ARGV[1];
$exp_hash=();
@exp_w=split(",",$expword);
foreach $exp (@exp_w)
{
$exp_hash->{$exp}=1;
}
my $f=0;
my $n=0;
my $kk;
my $thash=();
my @input_ar=`cat $file`;
my %my_hash;
my %my_hash2;
my %thash;
my $j;
# Number each input line into /tmp/<rand>.sctxt for segmentation.
open(FD_arr,"<$file");
my $first=0;
my $rand=rand();
open(FD_aro,">/tmp/$rand.sctxt");
my $j=1;
while(<FD_arr>)
{
my $in=$_;
chomp($in);
print FD_aro "$j ",$in,"\n";
$j++;
}
close FD_arr;
close FD_aro;
# Segment with scws using the UTF-8 dictionary plus a user dictionary.
system("/home/wuyabo/xs/bin/scws -I -E -N -d /home/wuyabo/xs/etc/dict.utf8.xdb:/home/wuyabo/stanford/dict_user.txt -c utf-8 /tmp/$rand.sctxt -o /tmp/$rand.sctxt2");
unlink("/tmp/$rand.sctxt");
#print "/tmp/$rand.sctxt\n";
open(FD_scws,"/tmp/$rand.sctxt2");
my $hashleft;
my $alln=1;
# One feature vector per article, indexed by its 1-based line number.
while(<FD_scws>)
{
my $in=$_;
#print FD_log "line ttt $kk $in \n";
chomp($in);
#$input_ar[$f]=$in;
my $n_line;
my ($my_hash2,$n_line)=txt2arr($in,$exp_hash);
$hashleft[$n_line]=clone($my_hash2);
$alln++;
}
my $j=0;
# NOTE(review): only $cluster is made lexical here — ", $cln" is a
# stray expression, and @cln below is a package global.
my $cluster,$cln;
# Score every unordered article pair.
for( 1 .. $alln-1)
{
my $nn=$_;
for ($nn .. $alln-1)
{
$nn2=$_;
if($nn==$nn2)
{next;}
$n=comp_hash($hashleft[$nn],$hashleft[$nn2]);
$n=$n * 100;
# print "$n $nn $nn2 \n";
$cluster->{"$nn-$nn2"}=$n;
$cln[$j]="$nn-$nn2";$j++;
# print FD_log "$n $nn $nn2 \n";
}
}
my @m_res;
# Sort pair keys by descending similarity and emit result lines.
my @key =sort {$cluster->{$b} <=> $cluster->{$a}} keys %{$cluster};
for(0 .. (scalar(@key))-1)
{
my $itmp=$_;
# print FD_log $key[$itmp],"-",$cluster->{$key[$itmp]},"\n";
push @m_res,"$key[$itmp]-$cluster->{$key[$itmp]}\n";
}
unlink("/tmp/$rand.sctxt2");
close(FD_arr);
# Implicit return of result lines.
@m_res;
}