Yet Another CPAN Grep

Bib-Tools/lib/Bib/Tools.pm

############################################################
#
#   Bib::Tools - For managing collections of Bib::CrossRef references.
#
############################################################

package Bib::Tools;

use 5.8.8;
use strict;
use warnings;
no warnings 'uninitialized';

require Exporter;
use Bib::CrossRef;
use LWP::UserAgent;
use JSON qw/decode_json/;
use URI::Escape qw(uri_escape_utf8 uri_unescape);
use HTML::Entities qw(decode_entities encode_entities);
use HTML::TreeBuilder::XPath;
use XML::Simple qw(XMLin);
use BibTeX::Parser qw(new next);
use IO::File;
use vars qw($VERSION @EXPORT @EXPORT_OK %EXPORT_TAGS @ISA);

#use LWP::Protocol::https;
#use Data::Dumper;

$VERSION = '0.17';
@ISA = qw(Exporter);
@EXPORT = qw();
@EXPORT_OK = qw(
sethtml clearhtml add_details add_google add_google_search add_orcid add_fromfile add_dblp add_pubmed
send_resp print print_nodoi num num_nodoi getref getref_nodoi append add_bibtex
);
%EXPORT_TAGS = (all => \@EXPORT_OK);

####################################################################################
sub new {
    my $self;
    # defaults
    $self->{refs} = []; # the references
    $self->{nodoi_refs} = []; 
    $self->{duprefs} = [];
    $self->{html}=0;
    $self->{ratelimit}=5; # limit of 5 crossref queries per sec
    $self->{last} = {};
    bless $self;
  
    my $ratelimit = $_[1];
    if (defined($ratelimit) && ($ratelimit>=0)) {$self->{ratelimit}=$ratelimit};
    return $self;
}

####################################################################################
sub sethtml {
  my $self = shift @_;
  $self->{html}=1;
}

####################################################################################
sub clearhtml {
  my $self = shift @_;
  $self->{html}=0;
}

####################################################################################
sub _err {
  my ($self, $str) = @_;
  if ($self->{html}) {
    print "<p style='color:red'>",$str,"</p>";
  } else {
    print $str,"\n";
  }
}

####################################################################################
sub _split_duplicates {
  # split list of references into three lists: one with refs that have unique doi's, one with no doi's
  #and one with all the rest (with duplicate doi's)
  my $self = shift @_;
  my @refs=@{$self->{refs}};
  
  my @newrefs;
  foreach my $ref (@refs) {
    my $doi = $ref->doi();
    if (!defined($doi) || length($doi)==0) {push @{$self->{nodoi_refs}}, $ref; next; }# skip entries with no DOI
    my $found = 0;
    foreach my $ref2 (@newrefs) {
      if ($ref2->doi() eq $doi) {
        $found = 1;
      }
    }
    if (!$found) {
      push @newrefs, $ref;
    } else {
      push @{$self->{duprefs}}, $ref;
    }
  }
  $self->{refs} = \@newrefs;
}

####################################################################################
sub append {
  # add new reference to end of existing list
  my $self = shift @_;
  my $ref = shift @_;
  push @{$self->{refs}}, $ref;
}

####################################################################################
sub add_details {
  # given an array of raw strings, try to convert into paper references
  
  my $self = shift @_;
  foreach my $cites (@_) {
    $self->{last} = Bib::CrossRef->new();
    $self->{last}->parse_text($cites);
    $self->append($self->{last});
    sleep 1/(0.001+$self->{ratelimit}); # rate limit queries to crossref
  }
  $self->_split_duplicates();
}

####################################################################################
sub add_google {
  # scrape paper details from google scholar personal page -- nb: no doi info on google, so use crossref.org to obtain this
  # nb: doesn't work with google scholar search results
  
  my $self = shift @_;
  my $url = shift @_;
  my $ua = LWP::UserAgent->new;
  $ua->agent('Mozilla/5.0');
  my $req = HTTP::Request->new(GET => $url);
  my $res = $ua->request($req);
  if ($res->is_success) {
    my $tree= HTML::TreeBuilder::XPath->new;
    $tree->parse($res->decoded_content);
    my @atitles=$tree->findvalues('//tr[@class="gsc_a_tr"]/td/a[@class="gsc_a_at"]');
    my @authors=$tree->findvalues('//tr[@class="gsc_a_tr"]/td/div[@class="gs_gray"][1]');
    my @jtitles=$tree->findvalues('//tr[@class="gsc_a_tr"]/td/div[@class="gs_gray"][2]');
    my $len1 = @atitles; my $len2 = @authors; my $len3 = @jtitles;
    if (($len1 != $len2) || ($len1 != $len3) || ($len2 != $len3)) {$self->_err("Problem parsing google page: mismatched $len1 titles/$len2 authors/$len3 journals.");return []}
    for (my $i = 0; $i<$len1; $i++) {
      # these are already utf8
      $authors[$i] = decode_entities($authors[$i]);
      $atitles[$i] = decode_entities($atitles[$i]);
      $jtitles[$i] = decode_entities($jtitles[$i]);
      my $temp = $authors[$i].", ".$atitles[$i].", ".$jtitles[$i];
      my $r = Bib::CrossRef->new;
      $r->parse_text($temp);
      $jtitles[$i] =~ m/\s([0-9][0-9][0-9][0-9])$/;
      my $year=$1;
      if ((length($year)==4) && ($r->date ne $1)) {
        $r->_setscore(0.5); # mismatch in year, probably bad
      }
      $self->append($r);
    }
  } else {
    $self->_err("Problem with $url: ".$res->status_line);
  }
}

####################################################################################
sub add_google_search {
  # scrape paper details from google scholar search results -- *not* from persons scholar home page

  my $self = shift @_;
  my $url = shift @_;
  my $ua = LWP::UserAgent->new;
  $ua->agent('Mozilla/5.0');
  my $req = HTTP::Request->new(GET => $url);
  my $res = $ua->request($req);
  if ($res->is_success) {
    my $tree= HTML::TreeBuilder::XPath->new;
    $tree->parse($res->decoded_content);
    my @atitles=$tree->findvalues('//div[@class="gs_ri"]/h3/a');
    my @authors=$tree->findvalues('//div[@class="gs_a"]');
    my $len1 = @atitles; my $len2 = @authors;
    if ($len1 != $len2) {$self->_err("Problem parsing google page: mismatched $len1 titles/$len2 authors.");return [];}
    my @cites=();
    for (my $i = 0; $i<$len1; $i++) {
      $authors[$i] = decode_entities($authors[$i]);
      $atitles[$i] = decode_entities($atitles[$i]);
      my $str = $authors[$i].", ".$atitles[$i];
      if (length($str)>5) { # a potentially useful entry ?
        push @cites, $authors[$i].", ".$atitles[$i];
      }
    }
    $self->add_details(@cites);
  } else {
    $self->_err("Problem with $url: ".$res->status_line);
  }
}

####################################################################################
sub _dblp_setauth {
  my $self = shift @_; my $r = shift @_; my $cite = shift @_;
  
  if (ref($cite->{'author'}) eq "HASH") {
    $r->_setauthcount(1);
    $r->_setauth(1,$cite->{'author'});
  } else {
    my $count = 0;
    foreach my $au (@{$cite->{'author'}}) {
      $count++;
      $r->_setauth($count, $au);
    }
    $r->_setauthcount($count);
  }
}

####################################################################################
sub add_dblp {
  # get details using DBLP XML API
  
  my $self = shift @_;
  my $url = shift @_;
  my $maxnum = shift @_; if (!defined($maxnum)) {$maxnum=-1;}
  
  my $ua = LWP::UserAgent->new;
  $ua->agent('Mozilla/5.0');
  my $req = HTTP::Request->new(GET => $url);
  my $res = $ua->request($req);
  if ($res->is_success) {
    my $xs = XML::Simple->new();
    my $data = $xs->XMLin($res->decoded_content);
    my @cites; my @ctemp;
    if (defined $data->{'r'}) {
       # a person page
       @cites = $data->{'r'};
    } elsif (defined $data->{'article'}) { 
       # its xml for a single article
       $ctemp[0] = $data;
       push @cites, \@ctemp;
    }
    my $num=0;
    foreach my $c (@{$cites[0]}) {
      $num++; if ($maxnum>0 && $num>$maxnum) {last;}  # mainly for testing
      my @k = keys %{$c};
      my $cite = $c->{$k[0]};
      my $ee = $cite->{'ee'};
      if ($ee =~ m/dx.doi.org/) {
        # we have a DOI, lets call crossref
        $ee =~ s/http:\/\/dx.doi.org\///;
        my $r = Bib::CrossRef->new;
        $r->parse_text($ee);
        if ($r->score >=1) {
          if (!defined $r->authcount || $r->authcount==0) {
            # shouldn't happen, but sometimes doi data lacks authors so use dblp data
            $self->_dblp_setauth($r,$cite);
          }
          $self->append($r);
          next; # move on to next record
        }
      }
      my $jtitle='';
      if (defined $cite->{'journal'}) {
        $jtitle = $cite->{'journal'};
      } elsif (defined $cite->{'booktitle'}) {
        $jtitle = $cite->{'booktitle'};
      }
      my $temp = $cite->{'year'}.' '.$cite->{'title'}.' '.$jtitle. ' ';
      if (ref($cite->{'author'}) eq "HASH") {
        $temp .= $cite->{'author'};
      } else {
        foreach my $au (@{$cite->{'author'}}) { $temp .= $au.", ";}
      }
      my $r = Bib::CrossRef->new;
      $r->parse_text($temp);
      if ($r->score >= 1) {
        # found an ok match, lets use it
        $self->append($r);
        next; # move on
      }
      
      # we got a poor match, lets use the rest of the dblp data
      $r = Bib::CrossRef->new;
      if (exists $cite->{'publtype'}) {
        $r->_setgenre($cite->{'publtype'});
      } elsif ($k[0] =~ m/article/) {
        $r->_setgenre('article');
      } elsif ($k[0] =~ m/inproceedings/) {
        $r->_setgenre('proceeding');
      } elsif ($k[0] =~ m/informal/) {
        $r->_setgenre('preprint');
      } else {
        $r->_setgenre($k[0]);
      }
      $r->_setdate($cite->{'year'}); $r->_setatitle($cite->{'title'}); $r->_setjtitle($jtitle);
      $self->_dblp_setauth($r,$cite);
      if (defined $cite->{'volume'}) {$r->_setvolume($cite->{'volume'});}
      if (defined $cite->{'number'}) {$r->_setissue($cite->{'number'});}
      if (defined $cite->{'pages'}) {
        my @bits = split('-',$cite->{'pages'});
        if (defined $bits[0]) {$r->_setspage($bits[0]);}
        if (defined $bits[1]) {$r->_setepage($bits[1]);}
      }
      if (($cite->{'ee'} =~ m/^http:\/\//))  {$r->_seturl($cite->{'ee'});}
      $r->_setscore(1);
      $r->_setquery($temp);
      # add manually constructed record
      $self->append($r);
    }
    $self->_split_duplicates();
  } else {
    $self->_err("Problem with $url: ".$res->status_line);
  }
}

####################################################################################
sub _orcid_getdoi {
  # extract DOI from an orcid entry
  my $cite = shift @_;
  
  my $doi='';
  my $c = $cite->{'work-external-identifiers'}->{'work-external-identifier'};
  if (ref($c) eq "HASH") {
    # a single value
    if ($c->{'work-external-identifier-type'} =~ m/doi/) {
      # and its a DOI
      $doi =  $c->{'work-external-identifier-id'};
    }
  } else {
    # multiple values
    foreach my $id (@{$c}) {
      if ($id->{'work-external-identifier-type'} =~ m/doi/) {
        # its a DOI
        $doi =  $id->{'work-external-identifier-id'};
        last; # exit loop
      }
    }
  }
  return $doi;
}

####################################################################################
sub _orcid_getauth {
  # generate an author string from an orcid entry
  my $cite = shift @_;

  my $auth='';
  my $c = $cite->{'work-contributors'}->{'contributor'};
  if (ref($c) eq "HASH") {
    # single author
    if ($c->{'contributor-attributes'}->{'contributor-role'} =~ m/author/) {$auth = $c->{'credit-name'}->{'content'};}
  } else {
    # multiple authors
    foreach my $au (@{$c}) {
      if ($au->{'contributor-attributes'}->{'contributor-role'} =~ m/author/) {$auth .= $au->{'credit-name'}->{'content'}.", ";}
    }
  }
  return $auth;
}

####################################################################################
sub _orcid_setauth {
  # use an orcid entry to set citation author list (using orcid bibtex data if appropriate)
  my $self = shift @_; my $r = shift @_;
  my $cite = shift @_; my $entry = shift @_;
  
  my $c = $cite->{'work-contributors'}->{'contributor'};
  if (defined $c) {
    # we have an orcid author entry
    my $authcount=0;
    if (ref($c) eq "HASH") {
      # single author
      if ($c->{'contributor-attributes'}->{'contributor-role'} =~ m/author/) {
        $authcount++;
        $r->_setauth($authcount,$c->{'credit-name'}->{'content'});
      }
    } else {
      # multiple authors
      foreach my $au (@{$c}) {
        if ($au->{'contributor-attributes'}->{'contributor-role'} =~ m/author/) {
          $authcount++;
          $r->_setauth($authcount,$au->{'credit-name'}->{'content'});
        }
      }
    }
    $r->_setauthcount($authcount);
    if ($authcount>0) {return;} # found some authors, finish up
  }
  # no author info, lets see if bibtex has any author info
  if (defined $entry) {
    $self->_bibtex_setauth($r,$entry);
  }
}

####################################################################################
sub add_orcid {
  # get paper details from orcid using API
  
  my $self = shift @_; my $orcid_id = shift @_;
  
  my $ua = LWP::UserAgent->new;
  my $req = HTTP::Request->new(GET => "http://pub.orcid.org/$orcid_id/orcid-works/");
  my $res = $ua->request($req);
  if ($res->is_success) {
    my $xs = XML::Simple->new();
    # the orcid response is utf8 xml
    my $data = $xs->XMLin($res->decoded_content);
    my @cites = $data->{'orcid-profile'}->{'orcid-activities'}->{'orcid-works'}->{'orcid-work'};
    foreach my $cite (@{$cites[0]}) {
      my $entry = undef;
      if ($cite->{'work-citation'}->{'work-citation-type'} =~ m/bibtex/) {
        # we have a bibtex reference, extract some extra info
        my $bibtex = $cite->{'work-citation'}->{'citation'};
        open my $fh, '<', \$bibtex;
        my $parser = BibTeX::Parser->new($fh);
        $entry = $parser->next;
        if (!$entry->parse_ok) {$entry = undef;}
      }
      my $doi = _orcid_getdoi($cite);
      if ((defined $doi) && (length($doi)>5)) { # we seem to have a DOI
        # use DOI to search.crossref.org
        my $r = Bib::CrossRef->new;
        $r->parse_text($doi);
        if ($r->score>=1) {
          if (!defined $r->authcount || $r->authcount==0) {
            # shouldn't happen, but sometimes doi data lacks authors so use orcid data
            $self->_orcid_setauth($r,$cite,$entry);
          }
          $self->append($r);
          next;  # move on
        }
      }
      # use title etc to search.crossref.org
      my $date; my $atitle; my $jtitle;
      if (exists $cite->{'publication-date'}->{'year'}) {$date = $cite->{'publication-date'}->{'year'};}
      if (exists $cite->{'work-title'}->{'title'}) {$atitle = $cite->{'work-title'}->{'title'};}
      if (exists $cite->{'journal-title'}) {$jtitle = $cite->{'journal-title'};}
      my $auth = _orcid_getauth($cite);
      my $temp=$auth.' '.$date.' '.$atitle.' '.$jtitle;
      if (length($temp)>10 && length($date)>0 && length($atitle)+length($auth)>0) { # we have a potentially useful search string
        my $r = Bib::CrossRef->new;
        $r->parse_text($temp);
        if ($r->score >= 1) {
          # found an ok match, lets use it
          $self->append($r);
          next; # move on
        }
      }
      # for a poor match, try to extract rest of info from orcid
      my $r = Bib::CrossRef->new;
      $r->_setdate($date); $r->_setatitle($atitle); $r->_setjtitle($jtitle);
      if (exists $cite->{'work-type'}) {$r->_setgenre($cite->{'work-type'});}
      $self->_orcid_setauth($r,$cite);
      $self->_bibtex_parse($r,$entry);
      $r->_setscore(1);
      $r->_setquery($temp);
      # add manually constructed record
      $self->append($r);
    }
    $self->_split_duplicates();
  } else {
    $self->_err("Problem with orcid.org: ".$res->status_line);
  }
}

####################################################################################
sub _find_pubmed {
  my $c = shift @_;
  my $name = shift @_;
  my $term = shift @_;
  foreach my $item (@{$c}) {
    if ($item->{'Name'} eq $name) {
      return $item->{$term};
    }
  }
  return undef;
}

####################################################################################
sub add_pubmed {
  # add results from a pubmed query
  my ($self,$q) = @_;

  my $ua = LWP::UserAgent->new;
  $q =~ s/\s+/+/g;
  my $req = HTTP::Request->new(GET => "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?usehistory=y&db=pubmed&term=".$q);
  my $res = $ua->request($req);
  if ($res->is_success) {
    my $web = $1 if ($res->decoded_content =~ /<WebEnv>(\S+)<\/WebEnv>/);
    my $key = $1 if ($res->decoded_content =~ /<QueryKey>(\d+)<\/QueryKey>/);
    $req = HTTP::Request->new(GET => "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&query_key=$key&WebEnv=$web");
    $res = $ua->request($req);
    if ($res->is_success) {
      my $xs = XML::Simple->new();
      my $data = $xs->XMLin($res->decoded_content);
      my @cites = $data->{'DocSum'};
      foreach my $cite (@{$cites[0]}) {
        my $c = $cite->{'Item'};
        if (ref($c) ne "ARRAY") {next;}
        my $r = Bib::CrossRef->new;
        my $doi = _find_pubmed($c,'DOI','content');
        if (defined $doi) {
          # PubMed is reliable, no need to call crossref
        #  my $r = Bib::CrossRef->new;
        #  $r->parse_text($doi);
        #  $self->append($r);
        #  next; # move on
          $r->_setdoi($doi);
          $r->_seturl('http://dx.doi.org/'.$doi);
        }
        $r->_setjtitle(_find_pubmed($c,'FullJournalName','content'));
        $r->_setatitle(_find_pubmed($c,'Title','content'));
        my $date = _find_pubmed($c,'PubDate','content');
        $date =~ m/^([0-9][0-9][0-9][0-9])/;
        $r->_setdate($1); # extract the year
        $r->_setvolume(_find_pubmed($c,'Volume','content'));
        $r->_setissue(_find_pubmed($c,'Issue','content'));
        my $p = _find_pubmed($c,'Pages','content');
        my @bits = split('-',$p);
        $r->_setspage($bits[0]); $r->_setepage($bits[1]);
        
        my $aulist = _find_pubmed($c,'AuthorList','Item');
        my $authcount=0;
        if (ref($aulist) ne "ARRAY") {
          $authcount = 1;
          $r->_setauth($authcount,$aulist->{'content'});
        } else {
          foreach my $au (@{$aulist}) {
            $authcount++;
            $r->_setauth($authcount,$au->{'content'});
          }
        }
        $r->_setauthcount($authcount);
        my $g = _find_pubmed($c,'FullJournalName','Item');
        $r->_setgenre($g->{'content'});
        $r->_setscore(1);
        #$r->_setquery($auth." ".$temp);
        # add manually constructed record
        $self->append($r);
      }
    }
    $self->_split_duplicates();
    return;
  }
  $self->_err("Problem with http://eutils.ncbi.nlm.nih.gov: ".$res->status_line);
}

####################################################################################
sub _rem_brackets {
  # remove {} brackets from bibtex entry
  my $self = shift @_; my $str = shift @_;
  
  $str =~ s/[\{\}]//g;
  $str =~ s/\\textquotesingle/\\'/g;
  return $str
}

####################################################################################
sub _bibtex_setauth {
  # use author list array to populate our author details
  my $self = shift @_; my $r = shift @_; my $entry = shift @_;

  if ((!defined $r) || (!defined $entry) || (!$entry->parse_ok)) {return;}

  my @authors = $entry->author;
  my $count = 0;
  foreach my $author (@authors) {
    $count++;
    $r->_setauth($count,$self->_rem_brackets($author->first).' '.$self->_rem_brackets($author->last));
  };
  $r->_setauthcount($count);
}

####################################################################################
sub _bibtex_parse {
  # use data in a bibtext entry to populate citation
  my $self = shift @_; my $r = shift @_; my $entry = shift @_;
  
  if ((!defined $r) || (!defined $entry) || (!$entry->parse_ok)) {return;}
  
  my $genre = lc($entry->type);
  if (($genre eq "inproceedings") || ($genre eq "proceedings")) {
    $r->_setgenre("proceeding");
  } else {
    $r->_setgenre($genre);
  }
  if (defined $entry->field('title')) { $r->_setatitle($self->_rem_brackets($entry->field('title')));}
  if (defined $entry->author) {$self->_bibtex_setauth($r,$entry);}
  if (defined $entry->field('url')) {$r->_seturl($entry->field('url'))};
  if (defined $entry->field('year')) {$r->_setdate($entry->field('year'))};
  if (defined $entry->field('volume')) {$r->_setvolume($entry->field('volume'))};
  if (defined $entry->field('issue')) {$r->_setissue($entry->field('issue'))};
  if (defined $entry->field('pages')) {
    my $pages = $entry->field('pages');
    (my $s, my $e) = ($pages =~ /([0-9]+)-+([0-9]+)/ );
    if (defined $s) { $r->_setspage($s); }
    if (defined $e) { $r->_setepage($e); }
  }
  if (defined $entry->field('journal')) {
    $r->_setjtitle($self->_rem_brackets($entry->field('journal')));
  } elsif (defined $entry->field('booktitle')) {
    $r->_setjtitle($self->_rem_brackets($entry->field('booktitle')));
  }
  if (defined $entry->field('doi')) {$r->_setdoi($entry->field('doi'));}
}

####################################################################################
sub add_bibtex {
  # read references from a bibtex file, takes file handle as input
  my $self = shift @_; my $fh = shift @_;
  my $opt = shift @_;  # options, =1 then use crossref to try to resolve DOIs
  if (!defined $opt) {$opt=1;}  # $opt defaults to 1
  
  my $parser = BibTeX::Parser->new($fh);
  while (my $entry = $parser->next ) {
    if (!$entry->parse_ok) { next; } # problem, move on
    my $doi = $entry->field('doi');
    if ((defined $doi) && (length($doi)>5) && ($opt)) {
      # we have a DOI and $opt != 0, use DOI to get rest of citation
      my $r = Bib::CrossRef->new;
      $r->parse_text($doi);
      if ($r->score >=1) {
        if (!defined $r->authcount || $r->authcount==0) {
          # shouldn't happen, but sometimes doi data lacks authors.  since we have bibtex data, use it
          $self->_bibtex_setauth($r,$entry);
        }
        $self->append($r);
        next;  # move on
      }
    }
    my $r = Bib::CrossRef->new;
    # use bibtex to create citation
    $self->_bibtex_parse($r,$entry);
    $r->_setscore(1);
    $r->_setquery($r->date." ".$r->atitle." ".$r->jtitle);
    # add manually constructed record
    $self->append($r);
  }
  $self->_split_duplicates();
}

####################################################################################
sub add_fromfile {
  # read free text references from a file, one reference per line
  # takes file handle as input
  my $self = shift @_;
  my $fh = shift @_;
  my @cites;
  while (my $line=<$fh>) {
    chomp($line);
    if (length($line)<5) {next;}  # skip non-informative lines
    push @cites, $line;
  }
  $self->add_details(@cites);
}

####################################################################################
sub num {
  # number of references with DOIs
  my $self = shift @_;
  
  my $len = @{$self->{refs}};
  return $len;
}

sub num_nodoi {
  # number of references without DOIs
  my $self = shift @_;
  
  my $len = @{$self->{nodoi_refs}};
  return $len;
}

####################################################################################
sub getref {
 # get i'th reference with a DOI
 my ($self, $i) = @_;
 return ${$self->{refs}}[$i];
}

####################################################################################
sub getref_nodoi {
 # get i'th reference without a DOI
 my ($self, $i) = @_;
 
 return ${$self->{nodoi_refs}}[$i];
}

####################################################################################
sub print {
  # display a list of references
  my $self = shift @_;
  my $id = shift @_;
  
  if ($self->num==0) {return ''};
  my $out='';
  if ($self->{html}) {$out.=$self->getref(0)->printheader($id);}
  for (my $i=0; $i< $self->num; $i++) {
    if ($self->{html}) {$self->getref($i)->sethtml;} else {$self->getref($i)->clearhtml;}
    $out .= $self->getref($i)->print($i+1);
    $out .= "\n";
  }
  if ($self->{html}) {$out.=$self->getref(0)->printfooter;}
  return $out;
}

####################################################################################
sub print_nodoi {
  # display a list of references
  my $self = shift @_;
  my $id = shift @_;
  
  if ($self->num_nodoi==0) {return ''};
  my $out='';
  if ($self->{html}) {$out.=$self->getref_nodoi(0)->printheader($id);}
  for (my $i=0; $i< $self->num_nodoi; $i++) {
    if ($self->{html}) {$self->getref_nodoi($i)->sethtml;} else {$self->getref_nodoi($i)->clearhtml;}
    $out .= $self->getref_nodoi($i)->print($i+1);
  }
  if ($self->{html}) {$out.=$self->getref_nodoi(0)->printfooter;}
  return $out;
}

####################################################################################
sub send_resp {
  # generate simple web page with results ...
  
  my $self = shift @_;

  if ($self->num==0 && $self->num_nodoi==0) {return 'No Results'};
  my $html = $self->{html};
  $self->sethtml; # force use of html
  my $out='';
  #$out.="Content-Type: text/html;\n\n"; # html header
  $out.=sprintf "%s", '<!DOCTYPE HTML>',"\n";
  $out.=sprintf "%s", '<html><head><meta charset="utf-8"><meta http-equiv="Content-Type">';
  $out.=sprintf "%s", '<script src="post.js"></script></head><body>',"\n";
  $out.=sprintf "%s", $self->print('doi');
  if ($self->num_nodoi>0) {
    $out.=sprintf "%s", '<h3>These have no DOIs:</h3>',"\n";
    $out.=sprintf "%s", $self->print_nodoi('nodoi');
  }
  $out.=sprintf "%s", '<input id="Submit" type="button" value="Submit" onclick="GetCellValues(\'doi\');GetCellValues(\'nodoi\');" /><div id="out"></div>';
  $out.=sprintf "%s", '</body></html>';
  $self->{html} = $html; # restore previous setting
  return $out;
}

1;

=pod
 
=head1 NAME
 
Bib::Tools - For managing collections of Bib::CrossRef references.
 
=head1 SYNOPSIS

 use strict;
 use Bib::Tools;
 
# Create a new object

 my $refs = Bib::Tools->new();
 
# Add some bibliometric info e.g. as text, one reference per line

 $text=<<"END";
 10.1109/lcomm.2011.040111.102111
 10.1109/tnet.2010.2051038
 END
 open $fh, '<', \$text;
 $refs->add_fromfile($fh);
 
 or 
 
 $text=<<"END";
 Dangerfield, I., Malone, D., Leith, D.J., 2011, Incentivising fairness and policing nodes in WiFi, IEEE Communications Letters, 15(5), pp500-502
 D. Giustiniano, D. Malone, D.J. Leith and K. Papagiannaki, 2010. Measuring transmission opportunities in 802.11 links. IEEE/ACM Transactions on Networking, 18(5), pp1516-1529
 END
 open $fh, '<', \$text;
 $refs->add_fromfile($fh);

# or as text scraped from a google scholar personal home page

 $refs->add_google('http://scholar.google.com/citations?user=n8dX1fUAAAAJ');

# or as text obtained from ORCID (www.orcid.org)

 $refs->add_orcid('0000-0003-4056-4014');

# or as text from PubMed

 $refs->add_pubmed('mills kh[author]');
 
# or as text from DBLP

 $refs->add_dblp('http://www.informatik.uni-trier.de/~ley/pers/xx/l/Leith:Douglas_J=');

# Bib:Tools will use Bib:CrossRef to try to resolve the supplied text into full citations.  It will try to 
detect duplicates using DOI information, so its fairly safe to import from multiple sources without creating 
clashes.  Full citations without DOI information are kept separately from those with a DOI for better quality
control.

# The resulting list of full citations containing DOI's can be printed out in human readable form using

 print $refs->print;

# and the list of full citations without DOI's

 print $refs->print_nodoi;

# or the complete citation list can also be output as a simple web page using

 print $refs->send_resp;

=head1 METHODS
 
=head2 new

 my $refs = Bib::Tools->new();

Creates a new Bib::Tools object.  Queries to crossref via Bib::CrossRef are rate limited.  To change the ratelimit pass this as
an option to new e.g $refs = Bib::Tools->new(3) sets the rate limit to 3 queries per second.

=head2 add_google

 $refs->add_google($url);
 
Scrapes citation information from a google scholar personal home page (*not* a search page, see below) and tries 
resolve it into full citations using crossref.

=head2 add_google_search

 $refs->add_google_search($url);
 
Scrapes citation information from a google scholar search page and tries to resolve into full citations.  A different
method is needed for search and home pages due to the different html tags used.

=head2 add_orcid

 $refs->add_orcid($orcid_id);
 
Uses the ORCID API to extract citations for the specified user identifier.  If possible, the DOI is obtained and then resolved using crossref.

=head2 add_dblp

 $refs->add_dblp($url);
 
Uses DBLP XML API to extract citations.  If possible, the DOI is obtained and then resolved using crossref.   E.g.

 $refs->add_dblp('http://www.informatik.uni-trier.de/~ley/pers/xx/l/Leith:Douglas_J=');
 
=head2 add_pubmed
 
 $refs->add_dblp($query);
 
Uses PubMed API to extract citations listed in response to a query. E.g.

 $refs->add_pubmed('mills kh[author]');
 
=head2 add_details

 $refs->add_details(@lines);
 
Given a array of strings, one per citation, tries to resolve these into full citations.

=head2 add_bibtex

 $refs->add_bibtex($fh, $opt);
 
Given a file handle to a file containing bibtex entries, imports these citations.  If a citation has a DOI and $opt is non-zero (the default), this
will be used to try to obtain the full citation from crossref.org.

=head2 add_fromfile

 $refs->add_fromfile($fh);
 
Given a file handle to a text file, with one citation per line, tries to resolve these into full citations.  
 
=head2 print

 my $info = $refs->print;

Display the list of full citations that have DOIs in human readable form.

=head2 print_nodoi

 my $info = $refs->print_nodoi;

Display the list of full citations without DOIs in human readable form.

=head2 sethtml

 $refs->sethtml
 
Set the output format to be html

=head2 clearhtml

 $refs->clearhtml
 
Set the output format to be plain text

=head2 send_resp

 my $info = $refs->send_resp;

=head2 num

 my $num = $refs->num;
 
Returns the number of full citations that have DOIs

=head2 num_nodoi

 my $num = $refs->num_nodoi;
 
Returns the number of full citations without DOIs

=head2 getref

 my $ref = $refs->getref($i);
 
Returns the $i citation from the list with DOIs.  This can be used to walk the list of citations.

=head2 getref_nodoi

 my $ref = $refs->getref_nodoi($i);

Returns the $i citation from the list without DOIs

=head2 append

  my $ref = Bib::CrossRef->new;
  $refs->append($ref);

Adds a Bib::CrossRef to end of a Bib::Tools list of objects

=head1 EXPORTS
 
You can export the following functions if you do not want to use the object orientated interface:

sethtml clearhtml add_details add_google add_google_search add_orcid add_fromfile add_dblp add_pubmed
send_resp print print_nodoi num num_nodoi getref getref_nodoi append add_bibtex

The tag C<all> is available to easily export everything:
 
 use Bib::Tools qw(:all);

=head1 WEB INTERFACE

A simple web interface to Bib::Tools is contained in the examples folder.  This consists of three files: query.html, handle_query.pl and post.js.

=head2 query.html

 <!DOCTYPE HTML>
 <html><head><meta http-equiv="Content-Type" content="text/html;charset=UTF-8"></head><body>
 <div style="position:relative; height:600px; padding:10px; border:1px solid">
 <h3>Import References</h3>
 <form action="handle_query.pl" method="POST" id="in" enctype="multipart/form-data">
 <table>
 <tr><td>Use ORCID id:<br><small style="color:#C0C0C0">e.g. 0000-0003-4056-4014</small></td>
 <td><INPUT type="text" name="orcid" size="128"></p></td></tr>
 <tr><td colspan=2>(to import from Scopus, follow these <a href="http://orcid.scopusfeedback.com/">instructions</a>, and for Web of Science/ResearcherId follow <a href="http://wokinfo.com/researcherid/integration/">these</a>)</td></tr>

 <tr><td width="150px">Use DBLP XML page:<br>
 <small style="color:#C0C0C0">e.g. http://www.informatik.uni-trier.de/~ley/pers/xx/l/Leith:Douglas_J=</small></td>
 <td> <INPUT type="text" name="dblp" size="128"></td></tr>

 <tr><td>Use PubMed query:<br><small style="color:#C0C0C0">e.g. mills kh[author]</small></td>
 <td> <INPUT type="text" name="query" size="128"></td></tr>

 <tr><td>BibTeX file:</td>
 <td> <INPUT type="file" name="bibtex"></td></tr>

 <tr><td>Use Google Scholar personal page:<br>
 <small style="color:#C0C0C0">e.g. http://scholar.google.com/citations?user=n8dX1fUAAAAJ</small></td>
 <td><INPUT type="text" name="google" size="128"></p></td></tr>

 <tr><td>Use Google Scholar search page:<br>
 <small style="color:#C0C0C0">e.g. http://scholar.google.com/scholar?q=andr%C3%A9s+garcia+saavedra</small></td>
 <td><INPUT type="text" name="google2" size="128"></p></td></tr>

 </table>
 <p>Enter references, one per line (free form text):</p>
 <textarea name="refs" rows="10" cols="128" form="in"></textarea><br>
 <INPUT type="submit" value="Submit">(can be slow, be patient)
 </form></div>
 <div style="position: absolute; bottom: 5px;"><small>Source: <a href="http://search.cpan.org/~dougleith/Bib-Tools/">Bib::Tools</a></div>
 </body></html>
 
=head2 handle_query.pl

 #!/usr/bin/perl
 use Bib::CrossRef;
 use Bib::Tools;
 use CGI;

 # send html header
 print "Content-Type: text/html;\n\n";

 my $q = CGI->new;
 my $refs = Bib::Tools->new;
 my $orcid = scalar $q->param('orcid');
 $orcid =~ /([0-9\-]+)$/; # extract id out of url
 $orcid = $1;
 if (length($orcid) > 5) {
   $refs->add_orcid($1);
 }

 my $google = scalar $q->param('google'); #NB: CGI has already carried out URL decoding
 if (length($google) > 5) {
   if (!($google =~ m/^http/)) { $google = "http://".$google;}
   $refs->add_google($google);
 }

 my $google2 = scalar $q->param('google2'); #NB: CGI has already carried out URL decoding
 if (length($google2) > 5) {
   if (!($google2 =~ m/^http/)) { $google2 = "http://".$google2;}
   $refs->add_google_search($google2);
 }

 my $dblp = scalar $q->param('dblp');
 if (length($dblp) > 5) {
   if (!($dblp =~ m/^http/)) { $dblp = "http://".$dblp;}
   if ($dblp =~ m/http:\/\/dblp.uni-trier.de\/pers\/xx\/l\/.+/) {
      # looks like a valid dblp url
      $refs->add_dblp($dblp);
   } else {
      print "<p style='color:red'>DBLP url looks invalid: ", $dblp,"</p>";
   }
 }

 my $pubmed = scalar $q->param('pubmed');
 if (length($pubmed) > 5) {
   $refs->add_pubmed($pubmed);
 }

 $filename = scalar $q->param('bibtex');
 $tmpfilename = $q->tmpFileName($filename);
 open my $fh, "<", $tmpfilename;
 $refs->add_bibtex($fh);
 
 my @values = $q->multi_param('refs');
 foreach my $value (@values) {
   open my $fh, "<", \$value; #NB: CGI has already carried out URL decoding
   $refs->add_fromfile($fh);
 }

 $refs->sethtml;
 print $refs->send_resp;

=head2 post.js

 function GetCellValues(dataTable) {
    var table = document.getElementById(dataTable);
    if (table == null) return;
    var i = 0; var Obj = [];
    var names = table.rows[0];
    for (var r = 1; r < table.rows.length; r++) {
        if (table.rows[r].id == 'cite') {
          var row = table.rows[r].cells;
          var check = table.rows[r].getElementsByTagName('Input');
          if (check.length>0){
            Obj[i] = {};
            for (var c = 3; c < row.length; c++){
              var tag = names.cells[c].textContent;
              Obj[i][tag] =row[c].textContent;
            }
            i = i+1;
          }
        }
    }
    var jsonString = JSON.stringify(Obj);
    document.getElementById('out').innerHTML = document.getElementById('out').innerHTML+jsonString;
    // or POST using ajax
 }

=head1 VERSION
 
Ver 0.15
 
=head1 AUTHOR
 
Doug Leith 
    
=head1 BUGS
 
Please report any bugs or feature requests to C<bug-rrd-db at rt.cpan.org>, or through the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Bib-Tools>.  I will be notified, and then you'll automatically be notified of progress on your bug as I make changes.
 
=head1 COPYRIGHT
 
Copyright 2015 D.J.Leith.
 
This program is free software; you can redistribute it and/or modify it under the terms of either: the GNU General Public License as published by the Free Software Foundation; or the Artistic License.
 
See http://dev.perl.org/licenses/ for more information.
 
=cut


__END__