Yet Another CPAN Grep

GitHub-RSS/lib/GitHub/RSS.pm

package GitHub::RSS;
use strict;
use 5.010;
use Moo 2;
use Filter::signatures;
use feature 'signatures';
no warnings 'experimental::signatures';
use PerlX::Maybe;

use IO::Socket::SSL;
use Net::GitHub;
use DBI;
use JSON;

use Data::Dumper;

our $VERSION = '0.05';

=head1 NAME

GitHub::RSS - collect data from Github.com for feeding into RSS

=head1 SYNOPSIS

    my $gh = GitHub::RSS->new(
        dbh => {
            dsn => "dbi:SQLite:dbname=$store",
        },
    );

    my $last_updated = $gh->last_check;
    $gh->fetch_and_store( $github_user => $github_repo, $last_updated );
    if( $verbose ) {
        print "Updated from $last_updated to " . $gh->last_check, "\n";
    };

=head1 DESCRIPTION

This module provides a cache database for GitHub issues and scripts to
periodically update the database from GitHub.

This is mainly used for creating an RSS feed from the database, hence the
name.

=head1 METHODS

=head2 C<< ->new >>

  my $gh = GitHub::RSS->new(
      dbh => {
          dsn => 'dbi:SQLite:dbname=db/issues.sqlite',
      },
  );

Constructs a new GitHub::RSS instance

=over 4

=item *

B<gh> - instance of L<Net::GitHub>

=cut

has 'gh' => (
    is => 'ro',
    default => sub( $self ) {
        Net::GitHub->new(
            maybe access_token => $self->token
        ),
    },
);

=item *

B<token_file> - name and path of the JSON-format token file containing the
GitHub API token By default, that file is searched for under the name
C<github.credentials> in C<.>, C<$ENV{XDG_DATA_HOME}>, C<$ENV{USERPROFILE}>
and C<$ENV{HOME}>.

=cut

has 'token_file' => (
    is => 'lazy',
    default => \&_find_gh_token_file,
);

=item *

B<token> - GitHub API token. If this is missing, it will be attempted to read
it from the C<token_file>.

=cut

has 'token' => (
    is => 'lazy',
    default => \&_read_gh_token,
);

=item *

B<default_user> - name of the GitHub user whose repos will be read

=cut

has default_user => (
    is => 'ro',
);

=item *

B<default_repo> - name of the GitHub repo whose issues will be read

=cut

has default_repo => (
    is => 'ro',
);

=item *

B<dbh> - premade database handle or alternatively a hashref containing
the L<DBI> arguments

  dbh => $dbh,

or alternatively

  dbh => {
      user     => 'scott',
      password => 'tiger',
      dsn      => 'dbi:SQLite:dbname=db/issues.sqlite',
  }

=cut

has dbh => (
    is       => 'ro',
    required => 1,
    coerce   => \&_build_dbh,
);

sub _build_dbh( $args ) {
    return $args if ref($args) eq 'DBI::db';
    ref($args) eq 'HASH' or die 'Not a DB handle nor a hashref';
    return DBI->connect( @{$args}{qw/dsn user password options/} );
}

=item *

B<fetch_additional_pages> - number of additional pages to fetch from GitHub.
This is relevant when catching up a database for a repository with many issues.

=back

=cut

has fetch_additional_pages => (
    is => 'ro',
    default => '1',
);

sub _find_gh_token_file( $self, $env=undef ) {
    $env //= \%ENV;

    my $token_file;

    # This should use File::User
    for my $candidate_dir ('.',
                           $ENV{XDG_DATA_HOME},
                           $ENV{USERPROFILE},
                           $ENV{HOME}
    ) {
        next unless defined $candidate_dir;
        if( -f "$candidate_dir/github.credentials" ) {
            $token_file = "$candidate_dir/github.credentials";
            last
        };
    };

    return $token_file
}

sub _read_gh_token( $self, $token_file=undef ) {
    my $file = $token_file // $self->token_file;

    if( $file ) {
        open my $fh, '<', $file
            or die "Couldn't open file '$file': $!";
        binmode $fh;
        local $/;
        my $json = <$fh>;
        my $token_json = decode_json( $json );
        return $token_json->{token};
    } else {
        # We'll run without a known account
        return
    }
}

sub fetch_all_issues( $self,
    $user = $self->default_user,
    $repo = $self->default_repo,
    $since=undef ) {
    my @issues = $self->fetch_issues( $user, $repo, $since );
    my $gh = $self->gh;
    while ($gh->issue->has_next_page) {
        push @issues, $gh->issue->next_page;
    }
    @issues
}

sub fetch_issues( $self,
    $user = $self->default_user,
    $repo = $self->default_repo,
    $since=undef ) {
    my $gh = $self->gh;
    my @issues = $gh->issue->repos_issues($user => $repo,
                                          { sort => 'updated',
                                          direction => 'asc', # so we can interrupt any time
                                          state => 'all', # so we find issues that got closed
                                          maybe since => $since,
                                          }
                                         );
};

=head2 C<< ->fetch_issue_comments >>

=cut

sub fetch_issue_comments( $self, $issue_number,
        $user=$self->default_user,
        $repo=$self->default_repo
    ) {
    # Shouldn't this loop as well, just like with the issues?!
    return $self->gh->issue->comments($user, $repo, $issue_number );
}

sub write_data( $self, $table, @rows) {
    my @columns = sort keys %{ $rows[0] };
    if( ! @columns ) {
        # we have an empty row?!
        return;
    };
    my $statement = sprintf q{replace into "%s" (%s) values (%s)},
                        $table,
                        join( ",", map qq{"$_"}, @columns ),
                        join( ",", ('?') x (0+@columns))
                        ;
    my $sth = $self->dbh->prepare( $statement );
    eval {
        $sth->execute_for_fetch(sub { @rows ? [ @{ shift @rows }{@columns} ] : () }, \my @errors);
    } or die Dumper \@rows;
    #if( @errors ) {
    #    warn Dumper \@errors if (0+@errors);
    #};
}

sub store_issues_comments( $self, $user, $repo, $issues ) {
    # Munge some columns:
    for (@$issues) {
        my $u = $_->{user};
        @{ $_ }{qw( user_id user_login user_gravatar_id )}
            = @{ $u }{qw( id login gravatar_id )};

        # Squish all structure into JSON, for later
        for (values %$_) {
            if( ref($_) ) { $_ = encode_json($_) };
        };
    };

    for my $issue (@$issues) {
        #$|=1;
        #print sprintf "% 6d %s\r", $issue->{number}, $issue->{updated_at};
        my @comments = $self->fetch_issue_comments( $issue->{number}, $user => $repo );

        # Squish all structure into JSON, for later
        for (@comments) {
            for (values %$_) {
                if( ref($_) ) { $_ = encode_json($_) };
            };
        };
        $self->write_data( 'comment' => @comments )
            if @comments;
    };

    # We wrote the comments first so we will refetch them if there is a problem
    # when writing the issue
    $self->write_data( 'issue' => @$issues );
};

=head2 C<< ->fetch_and_store($user, $repo, $since) >>

  my $since = $gh->last_check;
  $gh->fetch_and_store($user, $repo, $since)

Fetches all issues and comments modified after the C<$since> timestamp.
If C<$since> is missing or C<undef>, all issues will be retrieved.

=cut

sub fetch_and_store( $self,
                     $user  = $self->default_user,
                     $repo  = $self->default_repo,
                     $since = undef) {
    my $dbh = $self->dbh;
    my $gh = $self->gh;

    my $can_fetch_more = $self->fetch_additional_pages;

FETCH:
    my @issues = $self->fetch_issues( $user => $repo, $since );
    my $has_more = $gh->issue->has_next_page;
    $self->store_issues_comments( $user => $repo, \@issues );

    if( $has_more and (!defined($can_fetch_more) or $can_fetch_more-- > 0)) {
        $since = $issues[-1]->{updated_at};
        goto FETCH;
    }
}

sub refetch_issues( $self,
                     $user  = $self->default_user,
                     $repo  = $self->default_repo,
                     @issue_numbers) {
    my $dbh = $self->dbh;
    my $gh = $self->gh;

    my @issues = map { scalar $gh->issue->issue($user => $repo, $_) } @issue_numbers;
    $self->store_issues_comments( $user => $repo, \@issues );
}

sub inflate_fields( $self, $item, @fields ) {
    for (@fields) {
        $item->{$_} = $item->{$_} ? decode_json( $item->{$_} ) : $item->{$_};
    }
}

sub issues_and_comments( $self, $since ) {
    map {
        $self->inflate_fields( $_, qw(user closed_by));
        $_
    }
    @{ $self->dbh->selectall_arrayref(<<'SQL', { Slice => {}}, $since, $since) }
        select
               i.id
             , i.user
             , i.html_url
             , i.body
             , i.created_at
             , i.updated_at
             , i.title as issue_title
             , i.number as issue_number
          from issue i
         where i.updated_at >= ?
      union all
        select
               c.id
             , c.user
             , c.html_url
             , c.body
             , c.created_at
             , c.updated_at
             , i.title as issue_title
             , i.number as issue_number
          from comment c
          join issue i on c.issue_url=i.url
         where c.updated_at >= ?
      order by i.updated_at, html_url
SQL
}

sub issues_with_patches( $self ) {
    map {
        $self->inflate_fields( $_, qw(user closed_by));
        $_
    }
    @{ $self->dbh->selectall_arrayref(<<'SQL', { Slice => {}}) }
        select distinct
               i.* -- later, expand to explicit list
          from issue i
          join comment c on c.issue_url=i.url
         where c.body like '%```diff%'
           and i.state = 'open'
      order by i.url
SQL
}

sub issue( $self, $issue ) {
    $self->dbh->selectall_arrayref(<<'SQL', { Slice => {}}, $issue)->[0]
        select
               * -- later, expand to explicit list
          from issue i
         where i.number = ?
      order by i.url
SQL
}

sub comments( $self, $issue ) {
    @{ $self->dbh->selectall_arrayref(<<'SQL', { Slice => {}}, $issue) }
        select
               c.* -- later, expand to explicit list
          from comment c
          join issue i on c.issue_url=i.url
         where i.number = ?
      order by c.url
SQL
}

=head2 C<< ->last_check >>

  my $since = $gh->last_check;

Returns the timestamp of the last stored modification or C<undef>
if no issue or comment is stored.

=cut


sub last_check( $self,
                $user = $self->default_user,
                $repo = $self->default_repo ) {
    my $last = $self->dbh->selectall_arrayref(<<'SQL', { Slice => {} });
        select
            max(updated_at) as updated_at
          from issue
SQL
    if( @$last ) {
        return $last->[0]->{updated_at}
    } else {
        return undef # empty DB
    }
}

1;
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.