Group
Extension

Bib-CrossRef/lib/Bib/CrossRef.pm

############################################################
#
#   Bib::CrossRef - Uses crossref to robustly parse bibliometric references.
#
############################################################

package Bib::CrossRef;

use 5.8.8;
use strict;
use warnings;
no warnings 'uninitialized';

require Exporter;
use LWP::UserAgent;
use JSON qw/decode_json/;
use URI::Escape qw(uri_escape_utf8 uri_unescape);
use HTML::Entities qw(decode_entities encode_entities);
use XML::Simple;
use vars qw($VERSION @EXPORT @EXPORT_OK %EXPORT_TAGS @ISA);

#use Data::Dumper;

$VERSION = '0.12';
@ISA = qw(Exporter);
@EXPORT = qw();
@EXPORT_OK = qw(
sethtml clearhtml parse_text parse_doi print printheader printfooter
doi score date atitle jtitle volume issue genre spage epage authcount auth query
);
%EXPORT_TAGS = (all => \@EXPORT_OK);
# disable SSL cert checking
$ENV{'PERL_LWP_SSL_VERIFY_HOSTNAME'} = 0;

sub new {
    my $self;
    $self->{html} = 0; # use html for error messages ?
    $self->{ref} = {}; # the reference itself
    bless $self;
    return $self;
}

sub sethtml {
  $_[0]->{html} = 1;
}

sub clearhtml {
  $_[0]->{html} = 0;
}

sub _err {
  my ($self, $str) = @_;
  if ($self->{html}) {
    print "<p style='color:red'>",$str,"</p>";
  } else {
    print $str,"\n";
  }
}

sub doi {
  my $self = shift @_;
  return $self->{ref}->{'doi'};
}

sub _setdoi {
  my $self = shift @_;
  my $val = shift @_;
  $self->{ref}->{'doi'}=$val;
}

sub url {
  my $self = shift @_;
  return $self->{ref}->{'url'};
}

sub _seturl {
  my $self = shift @_;
  my $val = shift @_;
  $self->{ref}->{'url'}=$val;
}

sub score {
  my $self = shift @_;
  return $self->{ref}->{'score'};
}

sub _setscore {
  my $self = shift @_;
  my $val = shift @_;
  $self->{ref}->{'score'}=$val;
}

sub atitle {
  my $self = shift @_;
  return $self->{ref}->{'atitle'};
}

sub _setatitle {
  my $self = shift @_;
  my $val = shift @_;
  $self->{ref}->{'atitle'}=$val;
}

sub jtitle {
  my $self = shift @_;
  return $self->{ref}->{'jtitle'};
}

sub _setjtitle {
  my $self = shift @_;
  my $val = shift @_;
  $self->{ref}->{'jtitle'}=$val;
}

sub volume {
  my $self = shift @_;
  return $self->{ref}->{'volume'};
}

sub _setvolume {
  my $self = shift @_;
  my $val = shift @_;
  $self->{ref}->{'volume'}=$val;
}

sub issue {
  my $self = shift @_;
  return $self->{ref}->{'issue'};
}

sub _setissue {
  my $self = shift @_;
  my $val = shift @_;
  $self->{ref}->{'issue'}=$val;
}

sub date {
  my $self = shift @_;
  return $self->{ref}->{'date'};
}

sub _setdate {
  my $self = shift @_;
  my $val = shift @_;
  $self->{ref}->{'date'}=$val;
}

sub genre {
  my $self = shift @_;
  return $self->{ref}->{'genre'};
}

sub _setgenre {
  my $self = shift @_;
  my $val = shift @_;
  $self->{ref}->{'genre'}=$val;
}

sub spage {
  my $self = shift @_;
  return $self->{ref}->{'spage'};
}

sub _setspage {
  my $self = shift @_;
  my $val = shift @_;
  $self->{ref}->{'spage'}=$val;
}

sub epage {
  my $self = shift @_;
  return $self->{ref}->{'epage'};
}

sub _setepage {
  my $self = shift @_;
  my $val = shift @_;
  $self->{ref}->{'epage'}=$val;
}

sub authcount {
  my $self = shift @_;
  return $self->{ref}->{'authcount'};
}

sub _setauthcount {
  my $self = shift @_;
  my $val = shift @_;
  $self->{ref}->{'authcount'}=$val;
}

sub auth {
  my ($self, $num) = @_;
  return $self->{ref}->{'au'.$num};
}

sub _setauth {
  my $self = shift @_;
  my $i = shift @_;
  my $val = shift @_;
  $self->{ref}->{'au'.$i}=$val;
}

sub query {
  my $self = shift @_;
  return $self->{ref}->{'query'};
}

sub _setquery {
  my $self = shift @_;
  my $val = shift @_;
  $self->{ref}->{'query'}=$val;
}

sub parse_text {
  # given free format text, use crossref.org to try to convert into a paper reference and doi
  my ($self, $cites) = @_;
  
  my $cites_clean = $cites;
  # tidy up string, escape nasty characters etc.
  $cites_clean =~ s/\s+/+/g; #$cites_clean = uri_escape_utf8($cites_clean);
  #print $cites_clean;
  # crossref like us to give a mailto email when making request so they can get in touch if the script is generating errors,
  # feel free to change the email address here to something more appropriate
  # change to using /works API instead of /dois API
  my $req = HTTP::Request->new(GET => 'https://api.crossref.org/works?mailto=doug@leith.ie&rows=1&query='.$cites_clean);
  my $ua = LWP::UserAgent->new;
  my $res = $ua->request($req);
  if ($res->is_success) {
    # extract json response
    my $json = decode_json($res->decoded_content);
    #use Data::Dumper;
    #print Dumper($json->{'message'}{'items'});
    my $ref={};
    # keep a record of the query string we used
    $ref->{'query'} = $cites;
    my $response = $json->{'message'}{'items'}[0];
    #print Dumper($response);
    # extract doi and matching score
    $ref->{'doi'} = $response->{'DOI'};
    $ref->{'doi'} =~ s/http:\/\/dx.doi.org\///;  # remove any http header
    $ref->{'url'} = $response->{'URL'};
    $ref->{'score'} = $response->{'score'}; #$json->[0]{'normalizedScore'};
    $ref->{'issue'} = $response->{'journal-issue'}{'issue'};
    $ref->{'genre'} = $response->{'type'};
    if ($ref->{'genre'} =~ m/journal-article/) { $ref->{'genre'}='article';} # for backward compatibility
    #print $ref->{'genre'};
    $ref->{'jtitle'} = $response->{'container-title'}[0];
    $ref->{'atitle'} = $response->{'title'}[0];
    $ref->{'volume'} = $response->{'volume'};
    my @bits = split /-/, $response->{'page'};
    $ref->{'spage'} = $bits[0];
    $ref->{'epage'} = $bits[1];
    $ref->{'date'} = $response->{'issued'}{'date-parts'}[0][0];
    $ref->{'authcount'}=0;
    while (defined $response->{'author'}[$ref->{'authcount'}]) {
       my $val = $response->{'author'}[$ref->{'authcount'}];
       #print Dumper($val);
       $ref->{'authcount'}++;
       $ref->{'au'.$ref->{'authcount'}} = $val->{'given'}.' '.$val->{'family'};
    }
    
    $self->{ref} = $ref;
  } else {
    $self->_err("Problem with search.crossref.org: ".$res->status_line);
  }
}

sub parse_doi {
  # given a DOI, use unixref interface to convert into a full citation
  my ($self, $doi) = @_;
  
  my $req = HTTP::Request->new(GET =>'http://dx.doi.org/'.$doi,['Accept' =>'application/vnd.crossref.unixsd+xml']);
  my $ua = LWP::UserAgent->new;
  my $res = $ua->request($req);
  if ($res->is_success) {
    # now parse the xml
    my $xs = XML::Simple->new();
    my $data = $xs->XMLin($res->decoded_content);
    my $cite =  $data->{'query_result'}->{'body'}->{'query'}->{'doi_record'}->{'crossref'};
    my $cc = undef;
    if (exists($cite->{'conference'})) {
      $self->_setgenre('proceeding');
      if (exists($cite->{'conference'}->{'proceedings_metadata'})) {
        $self->_setjtitle($cite->{'conference'}->{'proceedings_metadata'}->{'proceedings_title'});
        if (exists $cite->{'conference'}->{'proceedings_metadata'}->{'publication_date'}) {$self->_setdate($cite->{'conference'}->{'proceedings_metadata'}->{'publication_date'}->{'year'});}
      } else {
        $self->_setjtitle($cite->{'conference'}->{'proceedings_series_metadata'}->{'series_metadata'}->{'proceedings_title'});
        $self->_setvolume($cite->{'conference'}->{'proceedings_series_metadata'}->{'series_metadata'}->{'volume'});
        if (exists $cite->{'conference'}->{'proceedings_series_metadata'}->{'publication_date'}) {$self->_setdate($cite->{'conference'}->{'proceedings_series_metadata'}->{'publication_date'}->{'year'});}
      }
      $cc = $cite->{'conference'}->{'conference_paper'};
    } elsif (exists($cite->{'journal'})) {
      $self->_setgenre('article');
      $self->_setjtitle($cite->{'journal'}->{'journal_metadata'}->{'full_title'});
      $cc = $cite->{'journal'}->{'journal_issue'};
      if (exists($cc->{'journal_volume'})) {$self->_setvolume($cc->{'journal_volume'}->{'volume'});}
      if (exists($cc->{'issue'})) {$self->_setissue($cc->{'issue'});}
      $cc = $cite->{'journal'}->{'journal_article'};
    } elsif (exists($cite->{'book'})) {
      if ($cite->{'book'}->{'book_type'} ne 'other' ) {
        $self->_setgenre('book');
      } else {
        $self->_setgenre('bookitem');
      }
      my $jtitle = '';
      if (exists($cite->{'book'}->{'book_series_metadata'})) {
        if (exists($cite->{'book'}->{'book_series_metadata'}->{'titles'}->{'title'})) {
          $jtitle .= $cite->{'book'}->{'book_series_metadata'}->{'titles'}->{'title'}.': ';
        }
        $jtitle .= $cite->{'book'}->{'book_series_metadata'}->{'series_metadata'}->{'titles'}->{'title'};
        $self->_setvolume($cite->{'book'}->{'book_series_metadata'}->{'volume'});
        if (exists $cite->{'book'}->{'book_series_metadata'}->{'publication_date'}) {$self->_setdate($cite->{'book'}->{'book_series_metadata'}->{'publication_date'}->{'year'});}
      } elsif (exists($cite->{'book'}->{'book_metadata'})) {
        $jtitle .= $cite->{'book'}->{'book_metadata'}->{'titles'}->{'title'};
        if (exists($cite->{'book'}->{'book_metadata'}->{'series_metadata'}->{'titles'}->{'title'})) {
          $jtitle .= ": ".$cite->{'book'}->{'book_metadata'}->{'series_metadata'}->{'titles'}->{'title'};
          if (exists $cite->{'book'}->{'book_metadata'}->{'series_metadata'}->{'volume'}) {$self->_setvolume($cite->{'book'}->{'book_metadata'}->{'series_metadata'}->{'volume'});}
        }
        if (exists $cite->{'book'}->{'book_metadata'}->{'volume'}) {$self->_setvolume($cite->{'book'}->{'book_metadata'}->{'volume'});}
        if (exists $cite->{'book'}->{'book_metadata'}->{'publication_date'}) {$self->_setdate($cite->{'book'}->{'book_metadata'}->{'publication_date'}->{'year'});}
      } else {
        if (exists($cite->{'book'}->{'book_set_metadata'}->{'titles'}->{'title'})) {
          $jtitle .= $cite->{'book'}->{'book_set_metadata'}->{'titles'}->{'title'}.': ';
        }
        $jtitle .= $cite->{'book'}->{'book_set_metadata'}->{'set_metadata'}->{'titles'}->{'title'};
      }
      $self->_setjtitle($jtitle);
      $cc = $cite->{'book'}->{'content_item'};
    } else {
        # something else -- might be dissertation, report-paper, standard, sa-component, database
        # fall back to alternative interface for now
        $self->parse_text($doi);
        return; # stop here
    }
    $self->_setscore(1);
    $self->_setquery($doi);
    $self->_setdoi($doi);
    if (!defined $cc) {
      # seems like an incomplete entry
      return;
    }
    #$self->_setatitle($cc->{'titles'}->{'title'});
    my $title;
    if  (ref $cc->{titles} ne "HASH") {
      $title =  $cc->{titles}->[0];
    } else {
      $title = $cc->{'titles'}->{'title'};
    }
    $self->_setatitle($title);
    $self->_setdoi($cc->{'doi_data'}->{'doi'});
    if (ref($cc->{'publication_date'}) eq "HASH") {
      $self->_setdate($cc->{'publication_date'}->{'year'});
    } else { # we have multiple dates, lets try and pick put the print date
      my $found = 0; my $count=0;
      foreach my $d (@{$cc->{'publication_date'}}) {
        if ($d->{'media_type'} eq 'print') {
          $self->_setdate($d->{'year'}); $found = 1;
        }
        $count++;
      }
      if ((!$found) && ($count>0)) {$self->_setdate(${$cc->{'publication_date'}}[0]->{'year'});}
    }
    if (exists(${$cc}{'pages'})) {
      if (exists(${$cc->{'pages'}}{'first_page'})) {$self->_setspage($cc->{'pages'}->{'first_page'});}
      if (exists(${$cc->{'pages'}}{'last_page'})) {$self->_setepage($cc->{'pages'}->{'last_page'});}
    }
    $cc = $cc->{'contributors'}->{'person_name'};
    if (ref($cc) eq "HASH") {
      $self->_setauthcount(1);
      $self->_setauth(1,$cc->{'given_name'}.' '.$cc->{'surname'});
    } else {
      my $count = 0;
      foreach my $au (@{$cc}) {
        if ($au->{'contributor_role'} ne 'author') {next;}
        $count++;
        $self->_setauth($count, $au->{'given_name'}.' '.$au->{'surname'});
      }
      $self->_setauthcount($count);
    }
  } else {
    $self->_err("Problem with search.crossref.org/guestquery: ".$res->status_line);
  }
}

sub printheader {
  my $self = shift @_;
  my $id = shift @_;
  my $str = '';
  if (defined $id) {$str = 'id="'.$id.'"';}
  return  '<table '.$str.'><tr style="font-weight:bold"><td></td><td>Use</td><td></td><td>Type</td><td>Year</td><td>Authors</td><td>Title</td><td>Journal</td><td>Volume</td><td>Issue</td><td>Pages</td><td>DOI</td><td>url</td><td></td></tr>'."\n";
}

sub printfooter {
  return "</table>\n";
}

sub _authstring {
  my $self = shift @_;
  my $out='';
  if ($self->authcount > 0) {
    $out = $self->auth(1);
    for (my $j = 2; $j <= $self->authcount; $j++) {
      $out.=' and '.$self->auth($j);
    }
  }
  return $out;
}

sub print {
  # return a reference in human readable form
  my ($self, $id, $add) = @_;
  my $ref = $self->{ref};
  if (!defined $id) {$id='';}
  if (!defined $add) {$add='';}
  
  my $out='';
  if ($self->{html}) {
    $out.=sprintf "%s", '<tr id="cite">';
    $out.=sprintf "%s",  '<td>'.$id.'</td>';
    if ($self->score<1) {
      $out.=sprintf "%s",  '<td><input type="checkbox" name="'.$id.'" value=""></td>';
      $out.=sprintf "%s",  '<td style="color:red">Poor match</td>';
    } else {
      $out.=sprintf "%s",  '<td><input type="checkbox" name="'.$id.'" value="" checked></td><td></td>';
    }
    $out.=sprintf "%s",  '<td contenteditable="true">'.$self->genre.'</td><td contenteditable="true">'.$self->date.'</td>';
    $out.=sprintf "%s",  '<td contenteditable="true">'.$self->_authstring.'</td>';
    $out.=sprintf "%s",  '<td contenteditable="true">'.$self->atitle.'</td><td contenteditable="true">'.encode_entities($self->jtitle).'</td>';
    $out.=sprintf "%s",  '<td contenteditable="true">';
    if (defined $self->volume) {
      $out.=sprintf "%s",  $self->volume;
    }
    $out.=sprintf "%s",  '</td><td contenteditable="true">';
    if (defined $self->issue) {
      $out.=sprintf "%s",  $self->issue;
    }
    $out.=sprintf "%s",  '</td><td contenteditable="true">';
    if (defined $self->spage) {
      $out.=sprintf "%s",  $self->spage;
    }
    if (defined $self->epage) {
      $out.=sprintf "%s",  '-'.$self->epage;
    }
    $out.=sprintf "%s",  '</td><td contenteditable="true">';
    if (defined $self->doi) {
      my $doi = $self->doi;
      $doi =~ s/http:\/\/dx.doi.org\///;
      $out.=$doi;
    }
    $out.=sprintf "%s",  '</td><td contenteditable="true">';
    if (defined $self->url) {
      $out.=sprintf "%s",  '<a href='.$self->url.'>'.$self->url.'</a>';
    }
    $out.= '</td><td>'.$add;
    $out.=sprintf "%s",  '</td></tr>'."\n";
    $out.=sprintf "%s",  '<tr><td colspan=12 style="color:#C0C0C0">'.encode_entities($self->query).'</td></tr>'."\n";
  } else {
    if (length($id)>0) {$out .= $id.". ";}
    if ($self->score<1) {
      $out.=sprintf "%s",  'Poor match (score='.$self->score."):\n";
      $out.=sprintf "%s", $self->query."\n";
    } else {
      #print "$count. ";
    }
    $out.=sprintf "%s",  $self->genre.': '.$self->date.", ".$self->_authstring.", ";
    $out.=sprintf "%s",  "\'".$self->atitle."\'. ".$self->jtitle;
    if (defined $self->volume) {
      $out.=sprintf "%s",  ", ".$self->volume;
      if (defined $self->issue) {
        $out.=sprintf "%s",  "(".$self->issue.")";
      }
    }
    if (defined $self->spage) {
      $out.=sprintf "%s",  ",pp".$self->spage;
    }
    if (defined $self->epage) {
      $out.=sprintf "%s",  '-'.$self->epage;
    }
    if (defined $self->doi) {
      my $doi = $self->doi;
      $doi =~ s/http:\/\/dx.doi.org\///;
      $out.=sprintf "%s",  ", DOI: ".$doi;
    }
    if (defined $self->url) {
      $out.=sprintf "%s",  ", ".$self->url;
    }
  }
  return $out;
}

1;

=pod
 
=head1 NAME
 
Bib::CrossRef - Uses crossref to robustly parse bibliometric references.
 
=head1 SYNOPSIS

 use strict;
 use Bib::CrossRef;

# Create a new object

 my $ref = Bib::CrossRef->new();

# Supply some details, Bib::CrossRef will do its best to use this to derive full citation details e.g. the DOI of a document ...

 $ref->parse_text('10.1109/jstsp.2013.2251604');
 
# Show the full citation details, in human readable form

 print $ref->print();

 article: 2013, Alessandro Checco and Douglas J. Leith, 'Learning-Based Constraint Satisfaction With Sensing Restrictions'. IEEE Journal of Selected Topics in Signal Processing, 7(5),pp811-820, DOI: http://dx.doi.org/10.1109/jstsp.2013.2251604

# Show the full citation details, in html format

 $ref->sethtml;
 print $ref->printheader;
 print $ref->print;
 print $ref->printfooter;


=head1 EXAMPLES

A valid DOI will always be resolved to a full citation
e.g.

 $ref->sparse_text('10.1109/jstsp.2013.2251604');
 print $ref->print();
 
 article: 2013, Alessandro Checco and Douglas J. Leith, 'Learning-Based Constraint Satisfaction With Sensing Restrictions'. IEEE Journal of Selected Topics in Signal Processing, 7(5),pp811-820, DOI: http://dx.doi.org/10.1109/jstsp.2013.2251604

An attempt will be made to resolve almost any text containing citation info 
e.g. article title only

 $ref->parse_text('Learning-Based Constraint Satisfaction With Sensing Restrictions');

e.g. author and journal

 $ref->parse_text('Alessandro Checco, Douglas J. Leith, IEEE Journal of Selected Topics in Signal Processing, 7(5)');

Please bear in mind that crossref provides a great service for free -- don't abuse it by making excessive queries.  If making many queries, be
sure to rate limit them to a sensible level or you will likely get blocked.

=head1 METHODS
 
=head2 new

 my $ref = Bib::CrossRef->new();

Creates a new Bib::CrossRef object

=head2 parse_text

 $ref->parse_text($string)

Given a text string, Bib::CrossRef will try to resolve into a full citation with the help of crossref.org

=head2 parse_doi

 $ref->parse_doi($doi)

Given a string containing a DOI e.g. 10.1109/tnet.2012.2202686, resolve into a full citation with the unixref
interface of crossref.org.  This should be definitive publishers DOI and is usually
much as the same as calling $ref->parse_text($doi), but is included completeness.

=head2 doi

 my $info = $ref->doi

Returns a string containg the DOI (digital object identifier) field from a full citation.  If present, this 
should be unique to the document.

=head2 score

 my $info = $ref->score

Returns a matching score from crossref.org.  If less than 1, the text provided to set_details() was likely
insufficient to allow the correct full citation to be obtained.

=head2 genre

 my $info = $ref->genre

Returns the type of publication e.g. jounal paper, conference paper etc

=head2 date

 my $info = $ref->date

Returns the year of publication

=head2 atitle

 my $info = $ref->atitle

Returns the article title

=head2 jtitle

 my $info = $ref->jtitle

Returns the name of the journal (in long form)

=head2 authcount

 my $info = $ref->authcount

Returns the number of authors

=head2 auth

 my $info = $ref->auth($num)

Get the name of author number $num (first author is $ref->auth(1))

=head2 volume

 my $info = $ref->volume

Returns the volume number in which paper appeared

=head2 issue

 my $info = $ref->issue

Returns the issue number in which paper appeared

=head2 spage

 my $info = $ref->spage

Returns the start page

=head2 epage

 my $info = $ref->epage

Returns the end page

=head2 url

Return the url, if any

=head2 query

 my $info = $ref->query
 
Returns the free form string from which full citation is derived

=head2 print

 print $ref->printheader;

Prints full citation in human readable form.

=head2 sethtml

 $ref->sethtml

Set output format to be html

=head2 clearhtml

 $ref->clearhtml

Set output format to be plain text

=head2 printheader

 print $ref->printheader;

When html formatting is enabled, prints some html header tags

=head2 printfooter

 print $ref->printfooter;

When html formatting is enabled, prints some html footer tags

=head1 EXPORTS
 
You can export the following functions if you do not want to use the object orientated interface:

parse_text parse_doi sethtml clearhtml set_details print printheader printfooter
doi score date atitle jtitle volume issue genre spage epage authcount auth query url

The tag C<all> is available to easily export everything:
 
use Bib::CrossRef qw(:all);

=head1 VERSION
 
Ver 0.09
 
=head1 AUTHOR
 
Doug Leith 
    
=head1 BUGS
 
Please report any bugs or feature requests to C<bug-rrd-db at rt.cpan.org>, or through the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Bib-CrossRef>.  I will be notified, and then you'll automatically be notified of progress on your bug as I make changes.
 
=head1 COPYRIGHT
 
Copyright 2015 D.J.Leith.
 
This program is free software; you can redistribute it and/or modify it under the terms of either: the GNU General Public License as published by the Free Software Foundation; or the Artistic License.
 
See http://dev.perl.org/licenses/ for more information.
 
=cut


__END__


Powered by Groonga
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.