Group
Extension

zhangbo-NLP-plugin_eng/lib/zhangbo/NLP/get_baike_com_all.pl

#!/usr/bin/perl
use URI::Escape;
use lib "./";
#use Save;
use HTML::TokeParser;
use LWP::Simple;
use LWP::UserAgent;
use Data::Dumper;


use HTTP::Cookies;
#use HTTP::Cookies::Guess;
#use  MongoDB;
use Encode;
use Data::Dumper;
#use DBI;
#use SQL::Translator::Parser::SQLServer;

use POSIX;
#use Protocal;
use Data::Dumper;
#my $t = $mongo_dbh->get_collection("art");
# Shared HTTP client; get_song() issues its page request through $ua.
my $ua = LWP::UserAgent->new;
{
    # Browser-like identification string sent with each request.
    my $agent_string = "Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727;.NET CLR 3.0.04506.30; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)";

    # Cookie store backed by ./cookie.lwp, with autosave switched on.
    my $jar = HTTP::Cookies->new(file => './cookie.lwp', autosave => 1);

    $ua->timeout(10);
    $ua->env_proxy;
    $ua->default_header('Accept-Language' => "zh-cn,cn");
    $ua->agent($agent_string);
    $ua->cookie_jar($jar);
}

# ---------------------------------------------------------------------------
# Main script body.
#
# Usage: get_baike_com_all.pl <term>
#
# Builds the baike.com wiki URL for <term> (URI-escaped) and passes it to
# get_song(), which fetches the page and appends a record to zhidao_qa.txt
# and zhidao_q.txt.
# ---------------------------------------------------------------------------

# Accumulators filled by get_song(): @acou receives the text collected while
# its "start" flag is set, @bcou the text collected while its category flag
# is set.
my @acou;
my @bcou;

# Without a term the original code requested the bare wiki root and appended
# a record with an empty name, so fail early with a usage message instead.
die "Usage: $0 <term>\n"
    unless (defined $ARGV[0] && length $ARGV[0]);

# Legacy counters. Nothing in the visible code reads them; they are kept with
# the values they ended up with ($id was set to 1 and incremented once).
$id      = 2;
$song_id = 1;
$hasnext = 1;

# $nname is read by get_song() when it writes the output records.
$nname = $ARGV[0];
$uri   = uri_escape($ARGV[0]);

# Entry URL on baike.com. Bases used by earlier revisions, for reference:
#   http://www.baidu.com/s?wd=
#   http://baike.baidu.com/search?word=
#   http://www.12530.com/newweb/jsp/v4_rank/e_sing_top50.jsp?pageNo=
$base = 'http://www.baike.com/wiki/' . $uri;
$url  = $base;

# Raw (undecoded) term. The previous decode('utf-8', ...) result was
# overwritten by this assignment on the very next line, so it was dropped.
$ena = $ARGV[0];

# Disabled step, kept for reference: look the term up in the jieba dictionary
# and append it (frequency 25, tag "n") when grep did not find it.
#$list=system("grep \"^$ena\\s\" /usr/local/lib//python2.7/dist-packages/jieba-0.31-py2.7.egg/jieba/dict.txt");
#if($list ==256)
#{
#open(FD ,">>/usr/local/lib//python2.7/dist-packages/jieba-0.31-py2.7.egg/jieba/dict.txt");
#print FD "$ARGV[0] 25 n\n";
#close FD;
#}

get_song($url);
# get_song($page_url)
#
# Fetch one entry page through the shared $ua client, walk its tags with
# HTML::TokeParser, and append one record for the entry to zhidao_qa.txt and
# zhidao_q.txt in the current directory.
#
# For every tag the text that follows it is read (whitespace-trimmed) and
# encoded to UTF-8 bytes, then matched against section markers:
#   * a tag matching /^p/ whose text contains "开放分类:" (open categories)
#     turns category collection on; a tag matching /^a/ whose text contains
#     "图片" (pictures) turns it off again.
#   * a "/em" tag whose text contains "移动说客" turns body collection on;
#     an "h3" tag with "相关文献" (related literature) turns it off.
#   * a "dt" tag with "参考资料" (references) or a "span" tag with
#     "万方数据" (Wanfang Data) ends the scan.
#   * any text containing "本词条尚需完善" (this entry still needs
#     improvement) aborts: nothing is written and 0 is returned.
# While category collection is on, text is pushed onto the file-level @bcou
# (with a trailing space); otherwise, while body collection is on, onto @acou.
#
# Reads the file-level $ua, $nname, @acou and @bcou.
# Dies with the HTTP status line if the request fails, and with an error
# message if either output file cannot be opened or closed.
# Returns 1 after the records have been written, 0 for an unfinished entry.
sub get_song
{
    my ($tturl) = @_;

    my $response = $ua->get($tturl);
    die $response->status_line unless $response->is_success;

    my $cont = $response->decoded_content;
    my $p    = HTML::TokeParser->new(\$cont);

    my $start    = 0;    # 1 while collecting body text into @acou
    my $typstart = 0;    # 1 while collecting category text into @bcou

    while (my $token = $p->get_tag()) {
        my $typ = $token->[0];

        # The marker literals below are raw bytes (this file has no
        # "use utf8"), so the text is encoded to bytes before matching.
        my $text = encode("utf-8", $p->get_trimmed_text());

        if (($text =~ /开放分类:/) && ($typ =~ /^p/)) {
            $typstart = 1;
            next;
        }

        next if $typ =~ /script/;

        if (($text =~ /图片/) && ($typ =~ /^a/) && ($typstart == 1)) {
            $typstart = 0;
            next;
        }
        if (($text =~ /移动说客/) && ($typ =~ /^\/em/)) {
            $start = 1;
            next;
        }
        if (($text =~ /相关文献/) && ($typ =~ /^h3/)) {
            $start = 0;
            next;
        }

        last if ($text =~ /参考资料/) && ($typ =~ /^dt/);
        last if ($text =~ /万方数据/) && ($typ =~ /span/);

        # Unfinished entry: give up without writing anything.
        return 0 if $text =~ /本词条尚需完善/;

        if ($typstart == 1) {
            push(@bcou, "$text ");
        }
        elsif ($start == 1) {
            push(@acou, $text);
        }
    }

    my $bc = join("", @acou);
    print "----分类 @bcou ------------\n";

    # The body text used to be written to a rand()-named scratch file and
    # read back with `cat`; the value read back was simply "$bc\n".
    my $res = "$bc\n";

    my $lid = _next_qa_id('zhidao_qa.txt');
    print length($res)." res---------$res------\n";

    my $type = join("/", @bcou);

    open(my $qa_fh, '>>', 'zhidao_qa.txt')
        or die "cannot append to zhidao_qa.txt: $!";
    print {$qa_fh} "id=$lid\t$nname\t$type\t@bcou\n";
    close($qa_fh) or die "cannot close zhidao_qa.txt: $!";

    open(my $q_fh, '>>', 'zhidao_q.txt')
        or die "cannot append to zhidao_q.txt: $!";
    print {$q_fh} "id=$lid $nname\n";
    close($q_fh) or die "cannot close zhidao_q.txt: $!";

    return 1;
}

# Return the id for the next record in $qa_file: one more than the id on the
# file's last line, or 0 when the file cannot be opened, is empty, or its
# last line does not begin with "id=<digits><TAB>".
sub _next_qa_id
{
    my ($qa_file) = @_;

    # A missing file just means no record has been written yet.
    open(my $fh, '<', $qa_file) or return 0;

    my $last_line;
    while (my $line = <$fh>) {
        $last_line = $line;
    }
    close($fh);

    return 0 unless defined $last_line;
    return $1 + 1 if $last_line =~ /\Aid=(\d+)\t/;
    return 0;
}



#token  万方数据 |span|


#token  移动说客 |/em|
#token  相关文献 |h3|
#token  参考资料: |dt|

#print $text,"\n";
#print "--------------------\n";

#		if($ab_url=~/baike.baidu.com/)
		#if(($text=~/百度百科/)&&(length($ab_url)>10))
#		{
#		print $ab_url,"\n";
#print("./parser_baike.pl $ab_url $nname\n");
#$rand=rand(10);
#system("./parser_baike.pl $ab_url $nname >$rand;");
#		$j=`cat $rund`;
#		if(length($j)>50)
#		{
#system("./sub_key.pl $rand babel_add.txt $nname" );
#unlink($rand);
#		exit;

#		}
#unlink($rand);


		#}


#print "$typ\t$ab_url\t$text\t$more\t$more2\n";


Powered by Groonga
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.