EBook-Ishmael/lib/EBook/Ishmael.pm
package EBook::Ishmael;
use 5.016;
our $VERSION = '1.09';
use strict;
use warnings;
use Encode qw(find_encoding encode);
use File::Basename;
use File::Path qw(remove_tree);
use File::Temp qw(tempfile);
use Getopt::Long;
use JSON::PP;
use List::Util qw(max);
use XML::LibXML;
use EBook::Ishmael::EBook;
use EBook::Ishmael::ImageID;
use EBook::Ishmael::TextBrowserDump;
use constant {
MODE_TEXT => 0,
MODE_META => 1,
MODE_ID => 2,
MODE_HTML => 3,
MODE_RAW_TIME => 4,
MODE_COVER => 5,
MODE_IMAGE => 6,
};
# TODO: Temporary files are not cleaned up if ishmael is piped into another
# program like less
# TODO: It would be nice if we had a way to automatically determine an ebook's
# encoding...
$0 =~ s!^.*[/\\]!!;
my $PRGNAM = 'ishmael';
my $PRGVER = $VERSION;
my $HELP = <<"HERE";
$PRGNAM - $PRGVER
Usage:
$0 [options] file [output]
Options:
-d|--dumper=<dumper> Specify dumper to use for formatting text
-e|--encoding=<enc> Print text output in specified encoding
-I|--file-encoding=<enc> Specify ebook character encoding
-f|--format=<format> Specify ebook format
-w|--width=<width> Specify output line width
-N|--no-network Disable fetching remove resources
-t|--text Dump formatted ebook text
-H|--html Dump ebook HTML
-c|--cover Dump ebook cover image
-g|--image Dump ebook images
-i|--identify Identify ebook format
-m|--metadata[=<form>] Print ebook metadata
-r|--raw Dump the raw, unformatted ebook text
-h|--help Print help message
-v|--version Print version/copyright info
HERE
my $VERSION_MSG = <<"HERE";
$PRGNAM - $PRGVER
Copyright (C) 2025 Samuel Young
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
HERE
my $STDIN = '-';
my $STDOUT = '-';
my %FORMAT_ALTS = (
'fb2' => 'fictionbook2',
'azw' => 'mobi',
'azw3' => 'kf8',
);
# TODO: At some point in the future, remove pjson and pxml
my %META_MODES = map { $_ => 1 } qw(
ishmael json pjson xml pxml
);
# Replace characters that cannot be encoded with empty strings.
my $ENC_SUBST = sub { q[] };
# If reading from stdin, write stdin to temporary file and dump that.
sub _get_in_path {
my $file = shift;
if ($file eq $STDIN) {
return do {
my ($h, $p) = tempfile(UNLINK => 1);
binmode $h;
print { $h } do { local $/; <STDIN> };
close $h;
$p;
};
} else {
return $file;
}
}
sub _get_out {
my $file = shift;
if ($file ne $STDOUT) {
open my $fh, '>', $file
or die "Failed to open $file for writing: $!\n";
return $fh;
} else {
return *STDOUT;
}
}
sub init {
my $class = shift;
my $self = {
Ebook => undef,
Mode => MODE_TEXT,
Dumper => $ENV{ISHMAEL_DUMPER},
Encode => $ENV{ISHMAEL_ENCODING},
FileEnc => undef,
Format => undef,
Output => undef,
Width => 80,
Meta => undef,
Network => 1,
};
Getopt::Long::config('bundling');
GetOptions(
'dumper|d=s' => \$self->{Dumper},
'encoding|e=s' => \$self->{Encode},
'file-encoding|I=s' => \$self->{FileEnc},
'format|f=s' => \$self->{Format},
'width|w=i' => \$self->{Width},
'no-network|N' => sub { $self->{Network} = 0 },
'text|t' => sub { $self->{Mode} = MODE_TEXT },
'html|H' => sub { $self->{Mode} = MODE_HTML },
'cover|c' => sub { $self->{Mode} = MODE_COVER },
'image|g' => sub { $self->{Mode} = MODE_IMAGE },
'identify|i' => sub { $self->{Mode} = MODE_ID },
'metadata|m:s' => sub {
# Some DWIMery that if the given argument is not a valid metadata
# format, assume the user meant for it be a file argument and put
# it back into @ARGV.
$self->{Mode} = MODE_META;
if (!$_[1] or exists $META_MODES{ lc $_[1] }) {
$self->{Meta} = lc $_[1] || 'ishmael';
} else {
$self->{Meta} = 'ishmael';
unshift @ARGV, $_[1];
}
},
'raw|r' => sub { $self->{Mode} = MODE_RAW_TIME },
'help|h' => sub { print $HELP; exit 0; },
'version|v' => sub { print $VERSION_MSG; exit 0; },
) or die "Error in command line arguments\n$HELP";
$self->{Ebook} = shift @ARGV or die $HELP;
$self->{Output} = shift @ARGV;
$self->{Ebook} = _get_in_path($self->{Ebook});
if ($self->{Mode} == MODE_COVER) {
$self->{Output} //= (fileparse($self->{Ebook}, qr/\.[^.]*/))[0] . '.-';
} elsif ($self->{Mode} == MODE_IMAGE) {
$self->{Output} //= (fileparse($self->{Ebook}, qr/\.[^.]*/))[0];
} else {
$self->{Output} //= $STDOUT;
}
if (defined $self->{Format}) {
$self->{Format} = lc $self->{Format};
if (exists $FORMAT_ALTS{ $self->{Format} }) {
$self->{Format} = $FORMAT_ALTS{ $self->{Format} };
}
unless (exists $EBOOK_FORMATS{ $self->{Format} }) {
die "$self->{Format} is not a recognized ebook format\n";
}
}
if (defined $self->{Encode} and not defined find_encoding($self->{Encode})) {
die "'$self->{Encode}' is an invalid character encoding\n";
}
if (defined $self->{FileEnc} and not defined find_encoding($self->{FileEnc})) {
die "'$self->{FileEnc}' is an invalid character encoding\n";
}
bless $self, $class;
return $self;
}
sub text {
my $self = shift;
my $ebook = EBook::Ishmael::EBook->new(
$self->{Ebook},
$self->{Format},
$self->{FileEnc},
$self->{Network},
);
my $tmp = do {
my ($tf, $tp) = tempfile(UNLINK => 1);
close $tf;
$tp;
};
$ebook->html($tmp);
my $oh = _get_out($self->{Output});
unless (defined $self->{Encode}) {
binmode $oh, ':utf8';
}
my $dump = browser_dump(
$tmp,
{
browser => $self->{Dumper},
width => $self->{Width},
}
);
if (defined $self->{Encode}) {
print { $oh } encode($self->{Encode}, $dump, $ENC_SUBST);
} else {
print { $oh } $dump;
}
close $oh unless $self->{Output} eq $STDOUT;
1;
}
sub meta {
my $self = shift;
if ($self->{Meta} eq 'pxml' or $self->{Meta} eq 'pjson') {
warn "Using 'pxml' or 'pjson' as a metadata format is deprecated and will be removed in a future release\n";
}
if ($self->{Meta} eq 'ishmael') {
$self->meta_ishmael;
} elsif ($self->{Meta} eq 'json' or $self->{Meta} eq 'pjson') {
$self->meta_json;
} elsif ($self->{Meta} eq 'xml' or $self->{Meta} eq 'pxml') {
$self->meta_xml;
} else {
die "'$self->{Meta}' is not a valid metadata format\n";
}
1;
}
sub meta_ishmael {
my $self = shift;
my $ebook = EBook::Ishmael::EBook->new(
$self->{Ebook},
$self->{Format},
$self->{FileEnc},
$self->{Network},
);
my %meta = %{ $ebook->metadata };
my $oh = _get_out($self->{Output});
binmode $oh, ':utf8';
my $klen = max(map { length } keys %meta) + 1;
for my $k (sort keys %meta) {
printf { $oh } "%-*s %s\n", $klen, "$k:", join ", ", @{ $meta{ $k } };
}
close $oh unless $self->{Output} eq $STDOUT;
1;
}
sub meta_json {
my $self = shift;
my $ebook = EBook::Ishmael::EBook->new(
$self->{Ebook},
$self->{Format},
$self->{FileEnc},
$self->{Network},
);
my $meta = $ebook->metadata;
my $oh = _get_out($self->{Output});
for my $k (keys %{ $meta }) {
# Flatten arrays that contain a single item
if (@{ $meta->{ $k } } == 1) {
$meta->{ $k } = $meta->{ $k }->[0];
}
}
my $json = JSON::PP->new->utf8->pretty->canonical;
print { $oh } $json->encode($meta);
close $oh unless $self->{Output} eq $STDOUT;
1;
}
sub meta_xml {
my $self = shift;
my $ebook = EBook::Ishmael::EBook->new(
$self->{Ebook},
$self->{Format},
$self->{FileEnc},
$self->{Network},
);
my $meta = $ebook->metadata;
my $oh = _get_out($self->{Output});
my $dom = XML::LibXML::Document->new('1.0', 'UTF-8');
my $root = XML::LibXML::Element->new('ishmael');
$dom->setDocumentElement($root);
$root->setAttribute('version', $PRGVER);
my $metan = $root->appendChild(
XML::LibXML::Element->new('metadata')
);
for my $k (sort keys %$meta) {
my $n = $metan->appendChild(
XML::LibXML::Element->new(lc $k)
);
for my $i (@{ $meta->{ $k } }) {
my $in = $n->appendChild(
XML::LibXML::Element->new('item')
);
$in->appendChild(
XML::LibXML::Text->new($i)
);
}
}
$dom->toFH($oh, 1);
close $oh unless $self->{Output} eq $STDOUT;
1;
}
sub id {
my $self = shift;
my $id = ebook_id($self->{Ebook});
say defined $id ? $id : "Could not identify format for $self->{Ebook}";
1;
}
sub html {
my $self = shift;
my $ebook = EBook::Ishmael::EBook->new(
$self->{Ebook},
$self->{Format},
$self->{FileEnc},
$self->{Network},
);
my $oh = _get_out($self->{Output});
unless (defined $self->{Encode}) {
binmode $oh, ':utf8';
}
my $html = $ebook->html;
if (defined $self->{Encode}) {
say { $oh } encode($self->{Encode}, $html, $ENC_SUBST);
} else {
say { $oh } $html;
}
close $oh unless $self->{Output} eq $STDOUT;
1;
}
sub raw {
my $self = shift;
my $ebook = EBook::Ishmael::EBook->new(
$self->{Ebook},
$self->{Format},
$self->{FileEnc},
$self->{Network},
);
my $oh = _get_out($self->{Output});
unless (defined $self->{Encode}) {
binmode $oh, ':utf8';
}
my $raw = $ebook->raw;
if (defined $self->{Encode}) {
say { $oh } encode($self->{Encode}, $raw, $ENC_SUBST);
} else {
say { $oh } $raw;
}
close $oh unless $self->{Output} eq $STDOUT;
1;
}
sub cover {
my $self = shift;
my $ebook = EBook::Ishmael::EBook->new(
$self->{Ebook},
$self->{Format},
$self->{FileEnc},
$self->{Network},
);
unless ($ebook->has_cover) {
say "$self->{Ebook} does not have a cover";
return;
}
my $cover = $ebook->cover;
my $fmt = image_id(\$cover);
unless (defined $fmt) {
die "Could not dump $self->{Ebook} cover; could not identify cover image format\n";
}
$self->{Output} =~ s/\.-$/.$fmt/;
my $oh = _get_out($self->{Output});
binmode $oh;
print { $oh } $ebook->cover;
close $oh unless $self->{Output} eq $STDOUT;
1;
}
sub image {
my $self = shift;
if ($self->{Output} eq $STDOUT) {
die "Cannot dump images to stdout\n";
}
my $ebook = EBook::Ishmael::EBook->new(
$self->{Ebook},
$self->{Format},
$self->{FileEnc},
$self->{Network},
);
my $num = $ebook->image_num;
unless ($num) {
say "$self->{Ebook} has no images";
return;
}
my $base = basename($self->{Output});
my $pad = length $num;
my $mkdir = 0;
unless (-d $self->{Output}) {
mkdir $self->{Output}
or die "Failed to mkdir $self->{Output}: $!\n";
$mkdir = 1;
}
my @created;
eval {
for my $i (0 .. $num - 1) {
my $ii = $i + 1;
my $img = $ebook->image($i);
my $id = image_id($img);
unless (defined $id) {
warn "Could not identify image #$ii\'s format, skipping\n";
next;
}
my $b = sprintf "%s-%0*d.%s", $base, $pad, $ii, $id;
my $p = File::Spec->catfile($self->{Output}, $b);
open my $fh, '>', $p
or die "Failed to open $p for writing: $!\n";
binmode $fh;
print { $fh } $$img;
close $fh;
push @created, $p;
}
1;
} or do {
for my $c (@created) {
unlink $c;
}
rmdir $self->{Output} if $mkdir;
die $@;
};
unless (@created) {
rmdir $self->{Output} if $mkdir;
die "Could not dump any images in $self->{Output}\n";
}
say $self->{Output};
for my $c (map { basename($_) } @created) {
say " $c";
}
1;
}
sub run {
my $self = shift;
if ($self->{Mode} == MODE_TEXT) {
$self->text;
} elsif ($self->{Mode} == MODE_META) {
$self->meta;
} elsif ($self->{Mode} == MODE_ID) {
$self->id;
} elsif ($self->{Mode} == MODE_HTML) {
$self->html;
} elsif ($self->{Mode} == MODE_RAW_TIME) {
$self->raw;
} elsif ($self->{Mode} == MODE_COVER) {
$self->cover;
} elsif ($self->{Mode} == MODE_IMAGE) {
$self->image;
}
1;
}
1;
=head1 NAME
EBook::Ishmael - EBook dumper
=head1 SYNOPSIS
use EBook::Ishmael;
my $ishmael = EBook::Ishmael->init();
$ishmael->run();
=head1 DESCRIPTION
B<EBook::Ishmael> is the workhorse module for L<ishmael>. If you're looking for
user documentation, you should consult its manual instead of this (this is
developer documentation).
=head1 METHODS
=head2 $i = EBook::Ishmael->init()
Reads C<@ARGV> and returns a blessed C<EBook::Ishmael> object. Consult the
manual for L<ishmael> for a list of options that are available.
=head2 $i->text()
Dumps ebook file to text, default run mode.
=head2 $i->meta()
Dumps ebook metadata, C<--metadata> mode.
=head2 $i->meta_ishmael()
Dumps ebook metadata, C<--metadata=ishmael> mode.
=head2 $i->meta_json()
Dumps ebook metadata in JSON form, C<--metadata=json> mode.
=head2 $i->meta_xml()
Dumps ebook metadata in XML form, C<--metadata=xml> mode.
=head2 $i->id()
Identify the format of the given ebook, C<--identify> mode.
=head2 $i->html()
Dump the HTML-ified contents of a given ebook, C<--html> mode.
=head2 $i->raw()
Dump the raw, unformatted text contents of a given ebook, C<--raw> mode.
=head2 $i->cover()
Dump the binary data of the cover image of a given ebook, if one is present,
C<--cover> mode.
=head2 $i->run()
Runs L<ishmael> based on the parameters processed during C<init()>.
=head1 AUTHOR
Written by Samuel Young, E<lt>samyoung12788@gmail.comE<gt>.
This project's source can be found on its
L<Codeberg Page|https://codeberg.org/1-1sam/ishmael>. Comments and pull
requests are welcome!
=head1 COPYRIGHT
Copyright (C) 2025 Samuel Young
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
=head1 SEE ALSO
L<ishmael>
=cut