Yet Another CPAN Grep

Dezi-App/lib/Dezi/Utils.pm

package Dezi::Utils;
use Moose;
use Carp;
use Data::Dump qw( dump );
use File::Basename;
use Search::Tools::XML;
use SWISH::3 qw( :constants );

# this class differs from SWISH::Prog::Utils chiefly in that
# it uses SWISH::3::Config rather than hardcoding mime types
# and parser mappings. This is to ensure consistency with
# the SWISH::3 parser used in Indexer and Aggregator.

# singletons
my $SWISH3 = SWISH::3->new();
my $XML    = Search::Tools::XML->new;

our $VERSION = '0.016';

=pod

=head1 NAME

Dezi::Utils - utility variables and methods

=head1 SYNOPSIS

 use Dezi::Utils;

 my $ext = Dezi::Utils->get_file_ext( $filename );
 my $mime = Dezi::Utils->get_mime( $filename );
 if (Dezi::Utils->looks_like_gz( $filename )) {
     $mime = Dezi::Utils->get_real_mime( $filename );
 }
 my $parser = Dezi::Utils->get_parser_for_mime( $mime );

=head1 DESCRIPTION

This class provides commonly used variables and methods
shared by many classes in the Dezi project.

=head1 VARIABLES

=over

=item $ExtRE

Regular expression of common file type extensions.

=item %ParserTypes

Hash of MIME types to their equivalent parser. This hash is
used to cache lookups in get_parser_for_mime().
You really don't want to mess with this, but documented
in case you're brave or foolish.

=item $DefaultExtension

Defaults to C<html>.

=item $DefaultMIME

Defaults to C<text/html>.

=back

=cut

our $ExtRE            = qr{\.(\w+)(\.gz)?$}io;
our %ParserTypes      = ();
our $DefaultExtension = 'html';
our $DefaultMIME      = 'text/html';

# internal cache to avoid hitting SWISH::3 each time
# and to map common extensions that SWISH::3 may not define
my %ext2mime = (
    doc  => 'application/msword',
    pdf  => 'application/pdf',
    ppt  => 'application/vnd.ms-powerpoint',
    html => 'text/html',
    htm  => 'text/html',
    txt  => 'text/plain',
    text => 'text/plain',
    xml  => 'application/xml',
    mp3  => 'audio/mpeg',
    gz   => 'application/x-gzip',
    xls  => 'application/vnd.ms-excel',
    zip  => 'application/zip',
    json => 'application/json',
    yml  => 'application/x-yaml',
    php  => 'text/html',

);

=head1 METHODS

=head2 get_mime( I<url> [, I<swish3>] )

Returns MIME type for I<url>, using optional I<swish3> instance to look it up.
If I<swish3> is missing, will use the L<SWISH::3> default mapping.

=cut

sub get_mime {
    my $self = shift;
    my $url  = shift;
    confess "url required" unless defined $url;
    my $s3 = shift;
    if ($s3) {

        if ( !$s3->isa('SWISH::3') ) {
            confess "s3 object must be instance of SWISH::3, not " . ref($s3);
        }

        # look it up
        my $ext = $s3->get_file_ext($url) || $DefaultExtension;
        return
               $s3->get_mime($url)
            || $ext2mime{$ext}
            || $DefaultMIME;
    }
    else {
        # check our cache first
        my $ext = $SWISH3->get_file_ext($url) || $DefaultExtension;
        if ( exists $ext2mime{$ext} ) {
            return $ext2mime{$ext};
        }

        # no cache? look it up and cache
        my $mime = $SWISH3->get_mime($url);
        $ext2mime{$ext} = $mime;
        return $mime || $DefaultMIME;
    }
}

=head2 mime_type( I<url> [, I<ext> ] )

Backcompat for SWISH::Prog::Utils. Use get_mime() instead,
which is what this does internally.

=cut

sub mime_type {
    my $self = shift;
    my $url = shift or return;
    return $self->get_mime($url);
}

=head2 get_parser_for_mime( I<mime> [, I<swish3_object>] )

Returns the SWISH::3 parser type for I<mime>. This can be
configured via the C<%ParserTypes> class variable.

=cut

sub get_parser_for_mime {
    my $self = shift;
    my $mime = shift;
    confess "mime required" unless defined($mime);
    my $s3 = shift;
    if ($s3) {
        return
               $s3->config->get_parsers->get($mime)
            || $s3->config->get_parsers->get( SWISH_DEFAULT_PARSER() )
            || $ParserTypes{$mime};
    }
    else {
        return $ParserTypes{$mime} if exists $ParserTypes{$mime};
        $ParserTypes{$mime} = $SWISH3->config->get_parsers->get($mime)
            || $SWISH3->config->get_parsers->get( SWISH_DEFAULT_PARSER() );
        return $ParserTypes{$mime};
    }
}

=head2 parser_for( I<url> )

Backcompat for SWISH::Prog::Utils. Use get_parser_for_mime() instead,
which is what this does internally.

=cut

sub parser_for {
    my $self = shift;
    my $url  = shift;
    confess "url required" unless defined($url);
    return $self->get_parser_for_mime( $self->get_mime($url) );
}

=head2 path_parts( I<url> [, I<regex> ] )

Returns array of I<path>, I<file> and I<extension> using the
File::Basename module. If I<regex> is missing or false,
uses $ExtRE.

=cut

sub path_parts {
    my $self = shift;
    my $url  = shift;
    my $re   = shift || $ExtRE;

    # TODO build regex from ->config
    my ( $file, $path, $ext ) = fileparse( $url, $re );
    return ( $path, $file, $ext );
}

=head2 merge_swish3_config( I<key> => I<value> [, I<swish3>] )

The L<SWISH::3> class currently does not allow for modification
of the internal C structs from Perl space. Instead,
the SWISH::3::Config->merge method can be used to parse
XML strings. Since hand-crafting XML is tedious,
this method eases the pain.

I<key> should be a SWISH::3::Config reserved word. Use
the SWISH::3::Constants for safety.

I<value> is passed through perl_to_xml().
If I<value> is a hashref, it should be a simple key/value set with strings.
You may use arrayref values, where items in the array are strings.

The optional I<swish3> object is modified, or the internal
singleton SWISH::3 object will be modified if I<swish3>
is missing.

Example:

 use SWISH::3 qw( :constants );
 $utils->merge_swish3_config(
     SWISH_PARSERS() => {
         'XML'  => [ 'application/x-bar', 'application/x-foo' ],
         'HTML' => [ 'application/x-blue', 'application/x-red' ]
     }
 );
 $utils->merge_swish3_config(
     'foo' => 'bar'
 );
 $utils->get_parser_for_mime( 'application/x-foo' );   # returns 'XML'

=cut

sub merge_swish3_config {
    my $self    = shift;
    my $key     = shift or confess "key required";
    my $hashref = shift or confess "hashref required";
    my $s3      = shift || $SWISH3;
    my $xml     = $XML->perl_to_xml( { $key => $hashref },
        { root => 'swish', wrap_array => 0 } );

    #warn "xml=" . $XML->tidy($xml) . "\n";
    $s3->config->merge($xml);
    return $xml;
}

=head2 get_swish3

Returns the class singleton.

=cut

sub get_swish3 {$SWISH3}

=head2 perl_to_xml( I<ref>, I<root_element> [, I<strip_plural> ] )

Similar to the XML::Simple XMLout() feature, perl_to_xml()
will take a Perl data structure I<ref> and convert it to XML,
using I<root_element> as the top-level element.

As of version 0.38 this method is now part of Search::Tools
and included here simply as a backcompat feature.

=cut

sub perl_to_xml {
    my $self = shift;
    return $XML->perl_to_xml(@_);
}

=head2 write_log( I<args> )

Logging method. By default writes to stderr via warn().

I<args> is a key/value pair hash, with keys B<uri> and B<msg>.

=cut

sub write_log {
    my $self = shift;
    my %args = @_;
    my $uri  = delete $args{uri} or croak "uri required";
    my $msg  = delete $args{msg} or croak "msg required";
    warn sprintf( "[%s][%s] %s [%s]\n", scalar localtime(), $$, $uri, $msg );
}

=head2 write_log_line([I<char>, I<width>])

Writes I<char> x I<width> to stderr, to provide some visual separation when viewing logs.
I<char> defaults to C<-> and I<width> to C<80>.

=cut

sub write_log_line {
    my $self  = shift;
    my $char  = shift || '-';
    my $width = shift || 80;
    warn $char x $width, "\n";
}

__PACKAGE__->meta->make_immutable;

1;

__END__

=head1 AUTHOR

Peter Karman, E<lt>karpet@dezi.orgE<gt>

=head1 BUGS

Please report any bugs or feature requests to C<bug-dezi-app at rt.cpan.org>, or through
the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Dezi-App>.
I will be notified, and then you'll automatically be notified of progress on your bug as I make changes.

=head1 SUPPORT

You can find documentation for this module with the perldoc command.

    perldoc Dezi::Utils

You can also look for information at:

=over 4

=item * Website

L<http://dezi.org/>

=item * IRC

#dezisearch at freenode

=item * Mailing list

L<https://groups.google.com/forum/#!forum/dezi-search>

=item * RT: CPAN's request tracker

L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Dezi-App>

=item * AnnoCPAN: Annotated CPAN documentation

L<http://annocpan.org/dist/Dezi-App>

=item * CPAN Ratings

L<http://cpanratings.perl.org/d/Dezi-App>

=item * Search CPAN

L<https://metacpan.org/dist/Dezi-App/>

=back

=head1 COPYRIGHT AND LICENSE

Copyright 2018 by Peter Karman

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=head1 SEE ALSO

L<http://dezi.org/>, L<http://swish-e.org/>, L<http://lucy.apache.org/>
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.