zhangbo-NLP-plugin_eng/lib/zhangbo/NLP/get_baike_com_all.pl
#!/usr/bin/perl
use URI::Escape;
use lib "./";
#use Save;
use HTML::TokeParser;
use LWP::Simple;
use LWP::UserAgent;
use Data::Dumper;
use HTTP::Cookies;
#use HTTP::Cookies::Guess;
#use MongoDB;
use Encode;
use Data::Dumper;
#use DBI;
#use SQL::Translator::Parser::SQLServer;
use POSIX;
#use Protocal;
use Data::Dumper;
#my $t = $mongo_dbh->get_collection("art");
# Shared HTTP client used by get_song(): 10-second timeout, proxy settings
# taken from the environment, a fixed Accept-Language header, an MSIE-style
# User-Agent string, and a cookie jar that is auto-saved to ./cookie.lwp.
my $ua = LWP::UserAgent->new;
$ua->timeout(10);
$ua->env_proxy;
$ua->default_header('Accept-Language' => "zh-cn,cn");
$ua->agent("Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727;.NET CLR 3.0.04506.30; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)");
$ua->cookie_jar(HTTP::Cookies->new('file'=>'./cookie.lwp','autosave'=>1));

# Accumulators filled by get_song(): @acou receives article-body text,
# @bcou receives the text collected from the category section.
my @acou;
my @bcou;

# The entry name to look up is the first command-line argument. Without it
# the script would fetch the bare wiki prefix and append a record with an
# empty name, so refuse to run instead.
if (!defined $ARGV[0] || $ARGV[0] eq '') {
    die "Usage: $0 <entry name>\n";
}

# Deliberately left as a package variable (no "my"): get_song() reads
# $nname by name when it writes its output records.
$nname = $ARGV[0];

# Entry page URL: the baike.com wiki prefix followed by the percent-encoded
# entry name. (Earlier, now-disabled variants of this script pointed at
# Baidu search endpoints instead.)
my $url = 'http://www.baike.com/wiki/' . uri_escape($nname);

# Disabled legacy step, kept here only as a note: the script used to grep the
# jieba dict.txt for the entry name and append "<name> 25 n" when it was
# missing.

get_song($url);
# get_song($page_url)
#
# Downloads $page_url with the file-level $ua, walks the page tag by tag with
# HTML::TokeParser, and sorts the trimmed text that follows each tag into two
# file-level accumulators:
#   @bcou - text seen while inside the category section ("开放分类:" ... "图片")
#   @acou - text seen while inside the article body (after "移动说客")
#
# It then prints the categories and the article text to STDOUT and appends
# one record to zhidao_qa.txt ("id=N<TAB>name<TAB>cat/cat<TAB>cats") and one
# to zhidao_q.txt ("id=N name"), where N is one more than the id found on the
# last line of zhidao_qa.txt (0 when there is none) and name is the
# file-level $nname.
#
# Returns 0 without writing anything when the page carries the
# "本词条尚需完善" marker; returns 1 after the records were written.
# Dies with the HTTP status line when the request fails, and with an
# explanatory message when an output file cannot be opened or closed.
sub get_song
{
    my ($tturl) = @_;

    my $response = $ua->get($tturl);
    die $response->status_line unless $response->is_success;

    my $cont = $response->decoded_content;
    my $p    = HTML::TokeParser->new(\$cont);

    my $start    = 0;    # 1 while inside the article body
    my $typstart = 0;    # 1 while inside the category section

    while (my $token = $p->get_tag()) {
        # NOTE: $token->[1] is an attribute hash only for start tags; for end
        # tags it is the tag's literal text, so it must not be dereferenced.
        my $typ = $token->[0];

        # The marker literals below are matched as bytes (this file does not
        # enable "use utf8"), so the decoded page text is re-encoded to UTF-8.
        my $text = encode("utf-8", $p->get_trimmed_text());

        # A <p> introducing the category list opens the category section.
        if (($text =~ /开放分类:/) && ($typ =~ /^p/)) {
            $typstart = 1;
            next;
        }

        # Script tags never contribute text.
        if ($typ =~ /script/) {
            next;
        }

        # The "图片" link closes the category section.
        if (($text =~ /图片/) && ($typ =~ /^a/) && ($typstart == 1)) {
            $typstart = 0;
            next;
        }

        # The closing </em> after "移动说客" marks the start of the body.
        if (($text =~ /移动说客/) && ($typ =~ /^\/em/)) {
            $start = 1;
            next;
        }

        # The "相关文献" heading pauses body collection.
        if (($text =~ /相关文献/) && ($typ =~ /^h3/)) {
            $start = 0;
            next;
        }

        # "参考资料" and "万方数据" end the useful part of the page.
        if (($text =~ /参考资料/) && ($typ =~ /^dt/)) {
            $start = 0;
            last;
        }
        if (($text =~ /万方数据/) && ($typ =~ /span/)) {
            $start = 0;
            last;
        }

        # Pages carrying this marker are skipped: nothing is recorded.
        if ($text =~ /本词条尚需完善/) {
            return 0;
        }

        if ($typstart == 1) {
            push(@bcou, "$text ");
        }
        elsif ($start == 1) {
            push(@acou, $text);
        }
    }

    my $bc = join("", @acou);
    print "----分类 @bcou ------------\n";

    # The article text used to be written to a randomly named scratch file
    # and read back through `cat`; the result is simply the text plus a
    # trailing newline, so build it in memory instead.
    my $res = "$bc\n";

    # Next record id: one more than the id on the last line of the QA file.
    my $lid  = 0;
    my $nlin = _last_line_of('zhidao_qa.txt');
    if (defined $nlin && $nlin =~ /id=(.*?)\t/) {
        $lid = $1;
        $lid++;
    }

    print length($res)." res---------$res------\n";

    my $type = join("/", @bcou);

    open(my $qa_fh, '>>', 'zhidao_qa.txt')
        or die "Cannot append to zhidao_qa.txt: $!";
    print {$qa_fh} "id=$lid\t$nname\t$type\t@bcou\n";
    close($qa_fh) or die "Cannot close zhidao_qa.txt: $!";

    open(my $q_fh, '>>', 'zhidao_q.txt')
        or die "Cannot append to zhidao_q.txt: $!";
    print {$q_fh} "id=$lid $nname\n";
    close($q_fh) or die "Cannot close zhidao_q.txt: $!";

    return 1;
}

# Returns the last line of the file at $path (with its newline, if any), or
# undef when the file cannot be opened or contains no lines.
sub _last_line_of
{
    my ($path) = @_;

    open(my $fh, '<', $path) or return;
    my $last;
    while (my $line = <$fh>) {
        $last = $line;
    }
    close($fh);

    return $last;
}
#token 万方数据 |span|
#token 移动说客 |/em|
#token 相关文献 |h3|
#token 参考资料: |dt|
#print $text,"\n";
#print "--------------------\n";
# if($ab_url=~/baike.baidu.com/)
#if(($text=~/百度百科/)&&(length($ab_url)>10))
# {
# print $ab_url,"\n";
#print("./parser_baike.pl $ab_url $nname\n");
#$rand=rand(10);
#system("./parser_baike.pl $ab_url $nname >$rand;");
# $j=`cat $rund`;
# if(length($j)>50)
# {
#system("./sub_key.pl $rand babel_add.txt $nname" );
#unlink($rand);
# exit;
# }
#unlink($rand);
#}
#print "$typ\t$ab_url\t$text\t$more\t$more2\n";