FB3-Convert/bin/fb3_2_json.pl
#!/usr/local/bin/perl
use strict;
use XML::LibXML;
use OPC::Node;
use FB3;
use utf8;
use Encode;
use open qw(:std :utf8);
use Image::Info;
use JSON::Path;
use JSON::PP;
use File::Copy;
use File::Basename;
use MIME::Base64;
use TeX::Hyphen;
use Getopt::Long;
my $FB3 = '';
my $Out = '';
my $Version = '1.0';
my $Lang = 'ru';
my $ArtID = undef;
my $Dictionary = undef;
my $Hyp = undef;
GetOptions ('in|from|src|fb3=s' => \$FB3,
'out|to|dst|json=s' => \$Out,
'lang=s' => \$Lang,
'art|art-id=s' => \$ArtID,
'dict=s' => \$Dictionary,
'version=s' => \$Version) or print join('', <DATA>) and die("Error in command line arguments\n");
print join('', <DATA>) and die "ERROR: source directory not specified, use --fb3 parameter\n" unless $FB3;
print join('', <DATA>) and die "ERROR: destination directory not specified, use --json parameter\n" unless $Out;
die "\nERROR: source directory `$FB3' not found\n" unless -d $FB3;
die "\nERROR: destination directory `$Out' not found\n" unless -d $Out;
$Out = $Out.'/' unless $Out =~ /\/$/;
unless ($Version =~ /^\d+\.\d+$/) {
$Version = ($Version =~ /^\d+$/) ? "1.$Version" : "1.0"
}
my %HyphenMin = (
'ru' => [2,2],
'uk' => [2,2],
'pl' => [2,2],
'en' => [2,3],
'de' => [2,2],
'fr' => [2,3],
'es' => [2,2],
'et' => [2,3],
'ka' => [2,2],
'lt' => [2,2],
'lv' => [2,2],
);
my $CannotHyph;
if ($Dictionary) {
if (-e $Dictionary) {
$Hyp = new TeX::Hyphen 'file' => $Dictionary,
'style' => 'utf8', leftmin => ($HyphenMin{$Lang}[0] ? $HyphenMin{$Lang}[0] : 2),
rightmin => ($HyphenMin{$Lang}[1] ? $HyphenMin{$Lang}[1] : 2);
} else {
die "\nERROR: dictionary file `$Dictionary' not found\n"
}
} elsif ($Lang eq 'pl') {
$CannotHyph = 1;
}
my $PartLimit = 20000;
my $IsTrial = 0;
use constant {
RELATION_TYPE_FB3_BOOK =>
'http://www.fictionbook.org/FictionBook3/relationships/Book',
RELATION_TYPE_FB3_BODY =>
'http://www.fictionbook.org/FictionBook3/relationships/body',
RELATION_TYPE_OPC_THUMBNAIL =>
'http://schemas.openxmlformats.org/package/2006/relationships/metadata/thumbnail',
RELATION_TYPE_CORE_PROP =>
'http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties',
RELATION_TYPE_FB3_IMAGES =>
'http://www.fictionbook.org/FictionBook3/relationships/image',
};
use constant {
NS_XLINK => 'http://www.w3.org/1999/xlink',
NS_FB3_DESCRIPTION => 'http://www.fictionbook.org/FictionBook3/description',
NS_FB3_BODY => 'http://www.fictionbook.org/FictionBook3/body'
};
my %AttrHash = (
'article' => 'artc',
'float' => 'fl',
'align' => 'aln',
'valign' => 'valn',
'bindto' => 'bnd',
'border' => 'brd',
'on-one-page' => 'op',
'colspan' => 'csp',
'rowspan' => 'rsp',
'class' => 'nc',
'autotext' => 'att',
'page-before' => 'pb',
'page-after' => 'pa',
'width' => 'wth',
'min-width' => 'minw',
'max-width' => 'maxw',
'clipped' => 'cl',
'first-char-pos' => 'fcp',
);
my %LangDependentStr = (
'ru' => ['Конец ознакомительного фрагмента'],
'uk' => ['Кінець ознайомчого фрагмента'],
'en' => ['The end of the free preview'],
'de' => ['Ende des Fragments'],
'fr' => ['Fin du fragment'],
'az' => ['Fragmanın sonu'],
'be' => ['Канец азнаямленчага фрагмента'],
'bn' => ['ফ্র্যাগমেন্ট সম্পূর্ণ হয়েছে'],
'bg' => ['Край на фрагмента'],
'el' => ['Τέλος του θραύσματος'],
'ka' => ['ფრაგმენტის დასასრული'],
'da' => ['Fragmentet afsluttet'],
'es' => ['Fin del fragmento'],
'it' => ['Fine del frammento'],
'kk' => ['Фрагменттің соңы'],
'ca' => ['Fi del fragment'],
'ky' => ['Сот жазманын'],
'zn' => ['片段結束'],
'ko' => ['프래그먼트의 끝'],
'la' => ['Finis iudicii fragmentum'],
'lv' => ['Fragments beigas'],
'lt' => ['Fragmento pabaiga'],
'mk' => ['Крајот на фрагментот'],
'no' => ['Slutten av fragmentet'],
'pl' => ['Koniec fragmentu'],
#'pt' => ['Koniec fragmentu'],
'ro' => ['Sfârșitul fragmentului'],
'sr' => ['Фрагмент завршен'],
'sk' => ['Konec fragmenta'],
'fi' => ['Fragmentin loppu'],
'hbs' => ['Kraj fragmenta'],
'cs' => ['Konec fragmentu'],
'sv' => ['Fragmentet slutfört'],
'eo' => ['Fino de fragmento'],
'et' => ['Fragment lõpp'],
'ja' => ['断片の終わり'],
'nl' => ['Einde fragment'],
);
$Lang = 'en' unless $LangDependentStr{$Lang}[0];
my @AuthorsPriority = qw(author co_author dubious_author lecturer compiler screenwriter translator contributing_editor managing_editor editor editorial_board_member adapter conceptor rendering associated commentator consultant scientific_advisor recipient_of_letters corrector composer);
my $NoHyphRe = qr/^(poem|epigraph|subtitle|title)$/; # Ноды, текст которых(и их потомков) нельзя переносить
my $Semiblocks = qr/^(epigraph|annotation|poem|stanza)$/;
my $BlockPParents = qr/^(section|annotation|epigraph|notebody)$/;
my $LineBreakChars = qr/[\-\/…\?\!\}\|–—]/;
my $SubtitleParents = qr/^(section)$/;
# ------------------------ Hyphenation settings ------------------------------
use constant HYPHEN => "\x{AD}"; #visible in, e.g., Komodo Edit.
my ($hyphenPatterns, $hyphenRegexPattern, $soglasnie, $glasnie, $znaki, $RgxSoglasnie, $RgxGlasnie, $RgxZnaki, $RgxNonChar);
$hyphenPatterns = {
GSS => 'GS' . &HYPHEN . 'S',
SGSG => 'SG' . &HYPHEN . 'SG',
SQS => 'SQ' . &HYPHEN . 'S',
GG => 'G' . &HYPHEN . 'G',
SS => 'S' . &HYPHEN . 'S'
};
$hyphenRegexPattern = join "|",keys %{$hyphenPatterns};
$hyphenRegexPattern = qr/(.*)($hyphenRegexPattern){1}(.*)/o;
$soglasnie = "bcdfghjklmnpqrstvwxzбвгджзйклмнпрстфхцчшщłćżźśńżçßґñšžčģķļņბგდვზთკლმნპჟრსტჳფქღყშჩცძწჭხჴჯჰჶ";
$glasnie = "aeiouyАОУЮИЫЕЭЯЁєіїўóąęéàèùâêîôûëïüÿæœäöõāēīūėįųაეჱიჲოუჷჵ";
$znaki = "ъьყ";
$RgxSoglasnie = qr/[$soglasnie]/oi;
$RgxGlasnie = qr/[$glasnie]/oi;
$RgxZnaki = qr/[$znaki]/oi;
$RgxNonChar = qr/([^$soglasnie$glasnie$znaki]+)/oi; #в скобках, чтобы оно возвращалось при сплите.
# /----------------------- Hyphenation settings ------------------------------
my $jsonC = JSON::PP->new->pretty->allow_barekey;
my $FB3Package = FB3->new( from_dir => $FB3 );
my $FB3Body = $FB3Package->Body;
my $Parser = XML::LibXML->new();
my $BodyDoc = $Parser->load_xml( string => $FB3Body->Content, huge => 1 );
my $TOCHeader = ProceedDescr($FB3Package->Meta->Content);
my @Img = $FB3Body->Relations( type => RELATION_TYPE_FB3_IMAGES );
my @ImgFiles;
for (@Img) {
push @ImgFiles, basename($_->{'TargetFullName'});
}
my $ImgFileNum = 0; # номер файла для замены
my $ImgHash;
for my $Image (@Img) {
$ImgHash->{$Image->{Id}}->{name} = basename($Image->{'TargetFullName'});
unless ($ImgHash->{$Image->{Id}}->{name} =~ /^[A-Za-z0-9_\-]+\..+$/) { # Если имя файла странное - переименуем
$ImgHash->{$Image->{Id}}->{name} =~ /(\.[^.]+)$/;
while (grep('renamed_img_'.$ImgFileNum.$1 eq $_, @ImgFiles)) { # проверка, что такого имени файла у нас еще нет(если есть - следующий номер и т.д.)
$ImgFileNum++;
}
$ImgHash->{$Image->{Id}}->{name} = 'renamed_img_'.$ImgFileNum.$1;
$ImgFileNum++;
}
my $FN = $Out.$ImgHash->{$Image->{Id}}->{name};
my $PhysicalName = $FB3Package->{opc}->PhysicalNameByPartName($Image->{'TargetFullName'});
File::Copy::copy( $PhysicalName, $FN );
($ImgHash->{$Image->{Id}}->{height}, $ImgHash->{$Image->{Id}}->{width}) = GetImgSize($FN);
}
ProceedBody($FB3Body->Content);
my $BlockN;
my @Parts;
sub ProceedBody {
my $BodyXML = shift;
$BodyXML = DecodeUtf8($BodyXML);
$BodyXML =~ s/\r?\n\r?/ /g;
$BodyXML =~ s/([\s>])([^\s<>]+)(<note\s+[^>]*?role="(foot|end)note"[^>]*?>[^<]{1,10}<\/note>[,\.\?"'“”«»‘’;:\)…\/]?)/$1.HypheNOBR($2,$3)/ges;
my $JsonStr;
my $TOCStr;
my $TotalBlocks;
# Начинаем обход всех узлов дерева
my $Node = $BodyDoc->getDocumentElement();
$Node = PrepareBodyXML($Node, $BodyDoc);
my $NodeTree = ProceedNode($Node);
my $RootLevelTOC = DumpRootLevelTOC($NodeTree);
$JsonStr = DumpTree($NodeTree);
$TOCStr = '{'.$TOCHeader.'Body: ['.$RootLevelTOC.'],Parts:['.(join ",\n", @Parts).']}';
ProceedJsonBodyPart($TOCStr, 'toc.js');
return;
}
my %InnerRefsHash;
sub ProceedNode {
my $Node = shift || return;
my $ParentName = shift || undef;
my $NoHyph = shift || 0;
my $Xp = shift || undef;
my $NoCut = shift;
my $NoteType = shift;
my $InEmptySection = shift || 0;
my $NodeName = $Node->nodeName;
my $NodeHash;
if ($NoHyph || $NodeName =~ /$NoHyphRe/) {
$NodeHash->{no_hyph} = 1;
}
$Xp = 1 if $NodeName eq 'fb3-body';
if ($NodeName eq '#text') {
my $Text = $Node->nodeValue;
return if $Text =~ /^\s+$/;
return unless defined $Text;
$Text = '['.$Text.']' if $NoteType eq 'footnote';
$Text = '{'.$Text.'}' if $NoteType eq 'endnote';
$NodeHash->{text} = $Text;
$NodeHash->{xp} = $Xp;
return $NodeHash;
}
for my $Attr ($Node->attributes) {
my $AttrName = $Attr->getName;
$AttrName =~ s/^.+://; # Откусим ns
if ($AttrName eq 'empty') {
$NodeHash->{in_empty_section} = 1;
} else {
$NodeHash->{attr}->{$AttrName} = EscString($Attr->getValue);
}
}
$NodeHash->{in_empty_section} = 1 if $InEmptySection;
$NoteType = $NodeHash->{attr}->{role};
$NodeHash->{xp} = $Xp;
$NodeHash->{name} = $NodeName;
my ($IsBlock, $IsRootBlock, $Printable) = AnalyseNode($Node, $NodeName, $ParentName);
$Printable = 0 if $NodeHash->{in_empty_section};
$NodeHash->{no_cut} = $NoCut;
if ($Printable && !$NodeHash->{no_cut}) {
$NodeHash->{b_id} = $BlockN || '0';
$NoCut = 1;
$BlockN++;
}
$NodeHash->{pr} = $Printable;
$NodeHash->{rb} = $IsRootBlock;
my @Childs;
my $i = 1;
for my $Child ($Node->childNodes) {
my $ChildHash = ProceedNode($Child, $NodeName, $NodeHash->{no_hyph}, $Xp.','.$i, $NoCut, $NoteType, $NodeHash->{in_empty_section});
next unless defined $ChildHash;
push @Childs, $ChildHash;
$i++;
}
$NodeHash->{c} = [@Childs] if scalar @Childs;
my $Id = $Node->getAttribute('id');
if ($Id) {
if ($NodeHash->{pr}) {
$InnerRefsHash{$Id} = $NodeHash->{b_id};
} else {
MoveRefToPrintableChild($NodeHash, $Id); # Если этот блок не отражается в json, то его первому потомку, который отражается
}
}
my $Href = $NodeHash->{attr}->{href};
if (defined $Href) {
$Href =~ s/^\#//g;
$NodeHash->{href} = $Href;
delete $NodeHash->{attr}->{href};
}
return $NodeHash;
}
sub MoveRefToPrintableChild {
my $NodeHash = shift;
my $Id = shift;
foreach (@{$NodeHash->{c}}) {
if ($_->{pr} && $_->{b_id}) { # Потомок нам подходит
$InnerRefsHash{$Id} = $_->{b_id};
return 1;
} else { # Проходим по его потомкам
if (MoveRefToPrintableChild($_, $Id)) {
return 1;
}
}
}
}
my @ResultArr;
my ($Length, $FirstBlockN, $LastBlockN, $FirstXP, $LastXP);
my $FileN = 0;
sub DumpTree {
my $NodeHash = shift || return;
my $JsonStr;
if (defined $NodeHash->{text} || $NodeHash->{pr}) {
$FirstXP = $NodeHash->{xp} if !$FirstXP && $NodeHash->{b_id};
$LastXP = $NodeHash->{xp} if $NodeHash->{b_id};
}
if (defined $NodeHash->{text}) {
$Length += length($NodeHash->{text});
$JsonStr = ProceedTextNode($NodeHash->{text}, $NodeHash->{no_hyph} == 1 ? 0 : 1);
return $JsonStr;
} elsif ($NodeHash->{pr} == 1) {
my $AttrStr = join ( ',', map { ($AttrHash{$_} ? $AttrHash{$_} : $_).':"'.$NodeHash->{attr}->{$_}.'"' } keys %{$NodeHash->{attr}}); # Дампим все атрибуты в строку
$AttrStr = ','.$AttrStr if $AttrStr;
if ($NodeHash->{name} eq 'br') {
$JsonStr = '{t:"'.$NodeHash->{name}.'"'.$AttrStr.',xp:['.$NodeHash->{xp};
} elsif ($NodeHash->{name} eq 'img') {
$JsonStr = '{t:"'.$NodeHash->{name}.'"'.$AttrStr.',xp:['.$NodeHash->{xp}.'],s:"'.$ImgHash->{$NodeHash->{attr}->{src}}->{name}.'"';
$JsonStr .= ',w:' . ($ImgHash->{$NodeHash->{attr}->{src}}->{width}) if ($ImgHash->{$NodeHash->{attr}->{src}}->{width});
$JsonStr .= ',h:' . ($ImgHash->{$NodeHash->{attr}->{src}}->{height}) if ($ImgHash->{$NodeHash->{attr}->{src}}->{height});
$JsonStr .= '}';
} else {
$JsonStr = '{t:"'.$NodeHash->{name}.'"'.$AttrStr.',xp:['.$NodeHash->{xp}.'],c:[';
}
$JsonStr .= "\n" if $NodeHash->{name} =~ /$Semiblocks/;
} else {
$JsonStr = '';
}
my $ChildsCount = scalar @{$NodeHash->{c}} if $NodeHash->{c};
$FirstBlockN = ($NodeHash->{b_id}||0) unless defined $FirstBlockN;
$LastBlockN = defined $NodeHash->{b_id} ? $NodeHash->{b_id} : $LastBlockN ? $LastBlockN : 0;
for my $ChildHash (@{$NodeHash->{c}}) {
$ChildsCount--;
$JsonStr =~ s/,$/],f:/g if $ChildHash->{name} eq 'footnote'; # Лучше бы что другое придумать, но работает
my $ChildStr .= DumpTree($ChildHash);
if ($ChildStr) {
$JsonStr .= $ChildStr;
$JsonStr .= ',' if $ChildsCount && $ChildStr;
}
}
$JsonStr .= "\n" if $NodeHash->{name} =~ /$Semiblocks/;
if ( defined $NodeHash->{href}) {
if ($InnerRefsHash{$NodeHash->{href}}) {
$JsonStr .= '],hr:['.$InnerRefsHash{$NodeHash->{href}}.']';
} else {
$JsonStr .= '],href:["'.$NodeHash->{href}.'"]';
}
}
$JsonStr .= ']}' if ($NodeHash->{pr} == 1 && $NodeHash->{name} ne 'img' && $NodeHash->{name} ne 'note' && $NodeHash->{name} ne 'a');
$JsonStr .= '}' if ($NodeHash->{name} eq 'note' || $NodeHash->{name} eq 'a');
if ($NodeHash->{pr} == 1 && !$NodeHash->{no_cut}) {
push @ResultArr, $JsonStr;
}
if (($NodeHash->{b_id} && $Length > $PartLimit) || ($NodeHash->{name} eq 'fb3-body' && $Length > 0)) {
my $ResultStr = join ",\n",@ResultArr;
$ResultStr =~ s/,\n$//g;
if (trim($ResultStr)) {
$ResultStr = '['.$ResultStr.']';
my $FileName = sprintf("%03i.js",$FileN);
ProceedJsonBodyPart($ResultStr, $FileName);
$FileN++;
@ResultArr = ();
$Length = 0;
push @Parts, '{s:'.$FirstBlockN.',e:'.$LastBlockN.',xps:['.$FirstXP.'],xpe:['.$LastXP.'],url:"'.$FileName.'"}';
$FirstBlockN = undef;
$FirstXP = undef;
}
}
return $JsonStr;
}
my $LastBlock;
sub DumpRootLevelTOC {
my $NodeHash = shift || return;
my $TOCStr;
my @TocStrArr;
$LastBlock = $NodeHash->{b_id} if $NodeHash->{b_id};
if ($NodeHash->{rb}) {
$NodeHash->{b_id} ||= $NodeHash->{c}[0]->{b_id};
$LastBlock = $NodeHash->{b_id} if $NodeHash->{b_id};
push @TocStrArr, 's:'.($LastBlock ? $LastBlock : '0') unless $NodeHash->{in_empty_section};
my @Clilds;
my $ChildsSection;
my $Title;
my $TotalClipped = 0;
if ($NodeHash->{c}[0]->{name} eq 'title') {
$NodeHash->{c}[0]->{'intitle'} = 1;
$Title = 't:"'.ExtractText($NodeHash->{c}[0]).'"';
}
for my $ChildHash (@{$NodeHash->{c}}) {
if ($ChildHash->{name} eq 'clipped') {
$TotalClipped = 1;
}
my $ChildStr = DumpRootLevelTOC($ChildHash);
push @Clilds, $ChildStr if $ChildStr;
}
if (scalar @Clilds) {
$ChildsSection = 'c:[';
$ChildsSection .= join ",",@Clilds;
$ChildsSection .= ']';
}
push @TocStrArr, 'e:'.($LastBlock ? $LastBlock : '0') unless $NodeHash->{in_empty_section};
push @TocStrArr, $Title if $Title;
push @TocStrArr, $AttrHash{'first-char-pos'}.':'.$NodeHash->{attr}->{'first-char-pos'} if $NodeHash->{attr}->{'first-char-pos'};
push @TocStrArr, $AttrHash{clipped}.':"true"' if $NodeHash->{attr}->{clipped} eq 'true';
push @TocStrArr, 'tcl:"true"' if $TotalClipped;
push @TocStrArr, $ChildsSection if $ChildsSection;
$TOCStr = '{';
$TOCStr .= join ",",@TocStrArr;
$TOCStr .= '}';
}
return $TOCStr;
}
sub ExtractText {
my $NodeHash = shift || return;
my @TextArr;
for my $ChildHash (@{$NodeHash->{c}}) {
$ChildHash->{'intitle'} = 1 if exists $NodeHash->{'intitle'} && $NodeHash->{'intitle'};
#note внутри title нам не нужны в оглавлении
next if $ChildHash->{'name'} eq 'note' && exists $ChildHash->{'intitle'} && $ChildHash->{'intitle'};
push @TextArr, $ChildHash->{text} if $ChildHash->{text};
my $Text = ExtractText($ChildHash);
$Text = EscString($Text);
push @TextArr, $Text if $Text;
}
return join ' ', @TextArr;
}
sub PrepareBodyXML {
my $Node = shift || undef;
my $BodyDoc = shift;
my $FootNoteHash;
my $xpc = XML::LibXML::XPathContext->new($Node);
$xpc->registerNs('fbb', &NS_FB3_BODY);
for my $Note ($xpc->findnodes('/fbb:fb3-body/fbb:notes', $Node)) {
if ($Note->getAttribute('show') == 0) {
for my $NoteBody ($xpc->findnodes('./fbb:notebody', $Note)) {
$FootNoteHash->{$NoteBody->getAttribute('id')} = ($xpc->findnodes('./*', $NoteBody));
}
$Note->unbindNode();
}
}
for my $NoteRef ($xpc->findnodes('//fbb:note')) {
if ($NoteRef->getAttributeNS(NS_XLINK, 'role') eq 'footnote') {
my $Id = $NoteRef->getAttribute('href');
$NoteRef->removeAttribute('href');
my $FootNoteNode = $NoteRef->appendChild($BodyDoc->createElement('footnote'));
for my $N (@{$FootNoteHash->{$Id}}) {
$FootNoteNode->appendChild($N);
}
}
}
if ($IsTrial) {
for my $SectionNode ($xpc->findnodes('//fbb:section', $Node)) {
if (IsEmptySection($SectionNode)) {
$SectionNode->setAttribute('empty', 1)
}
}
my $NewSectionNode = $BodyDoc->createElementNS('fbb', 'section');
my $NewPNode = $BodyDoc->createElementNS('fbb', 'p');
$NewPNode->appendChild($BodyDoc->createTextNode($LangDependentStr{$Lang}[0]));
$NewSectionNode->appendChild($NewPNode);
my $FB3BodyNode = $xpc->findnodes('fbb:fb3-body', $BodyDoc)->[0];
$FB3BodyNode->appendChild($NewSectionNode);
}
return $Node;
}
sub IsEmptySection {
my $SectionNode = shift || return;
my $IsEmpty;
for my $Child ($SectionNode->nonBlankChildNodes) {
my $ChildName = $Child->nodeName;
if ($ChildName ne 'clipped' && $ChildName ne 'title' && $ChildName ne 'section') {
return 0;
} elsif ($ChildName eq 'clipped') {
return 1;
} elsif ($ChildName eq 'section') {
$IsEmpty = IsEmptySection($Child);
return 0 unless $IsEmpty;
}
}
return 1 if $IsEmpty;
}
sub ProceedJsonBodyPart {
my $JsonStr = shift;
my $FileName = shift;
my $JData;
eval { $JData = $jsonC->decode($JsonStr); };
if ($@) {
# Такой json нам не нужен
die "$JsonStr\n===============\n$@";
#warn "\n$FileName: $@"; # Для отладки и теста. В боевых условиях падать, если фигня получается.
}
open TMPOUT, ">", $Out.$FileName or die "Cannot open tmp file: `$Out.$FileName'";
print TMPOUT $JsonStr;
close TMPOUT;
return $FileName;
}
sub ProceedDescr {
my $DescrXML = shift;
$DescrXML = DecodeUtf8($DescrXML);
$DescrXML =~ s/\r?\n\r?/ /g;
my @description_data = ();
my $Parser = XML::LibXML->new();
my $xpc = XML::LibXML::XPathContext->new($Parser->load_xml( string => $DescrXML, huge => 1 ));
$xpc->registerNs('fbd', &NS_FB3_DESCRIPTION);
my $UUID = ($xpc->findnodes('/fbd:fb3-description')->[0])->getAttribute('id');
my $SimpleFields = {
'Title' => 'fbd:title/fbd:main',
'Subtitle' => 'fbd:title/fbd:sub',
'Lang' => 'fbd:lang',
'Annotation' => 'fbd:annotation',
'Preamble' => 'fbd:preamble',
'Translated' => 'fbd:translated'
};
for my $field ( sort keys %$SimpleFields ) {
my $node = $xpc->findnodes('/fbd:fb3-description/' . $SimpleFields->{$field})->[0] || next;
my $value = EscString($node->string_value) || next;
push @description_data, sprintf('%s:"%s"', $field, $value);
}
my $description = join ',', @description_data;
my @Sequences = ();
my $getSequence; $getSequence = sub {
my @sequenceNodes = @_;
for my $sequenceNode ( @sequenceNodes ) {
push @Sequences, '"' . EscString($xpc->findnodes('./fbd:title/fbd:main', $sequenceNode)->[0]->string_value) . '"';
$getSequence->( $xpc->findnodes('./fbd:sequence', $sequenceNode) );
}
};
$getSequence->( $xpc->findnodes('/fbd:fb3-description/fbd:sequence') );
$description .= ',Sequences:[' . join(',', @Sequences) . ']' if scalar @Sequences;
my $getParts = sub {
my $node = shift;
my $struct = shift;
my @values = ();
for my $part ( keys %$struct ) {
my $subNode = $xpc->findnodes('./fbd:' . $struct->{$part}, $node)->[0] || next;
my $value = EscString($subNode->string_value) || next;
push @values, sprintf('%s:"%s"', $part, $value);
}
return join(',', @values);
};
my $getAuthorNamePart = sub {
my $node = shift;
my $NameParts = {'First' => 'first-name', 'Last' => 'last-name', 'Middle' => 'middle-name'};
return $getParts->($node, $NameParts);
};
my $Created = $xpc->findnodes('/fbd:fb3-description/fbd:document-info/@created');
$description .= ',Created:"'.$Created.'"' if $Created;
my $Updated = $xpc->findnodes('/fbd:fb3-description/fbd:document-info/@updated');
$description .= ',Updated:"'.$Updated.'"' if $Updated;
if ( my $WrittenNode = $xpc->findnodes('/fbd:fb3-description/fbd:written')->[0] ) {
my $WrittenParts = {'Date' => 'date', 'DatePublic' => 'date-public', 'Lang' => 'lang'};
my $Written = $getParts->($WrittenNode, $WrittenParts);
$description .= ',Written:{' . $Written . '}' if scalar $Written;
}
my @Authors;
foreach (@AuthorsPriority) {
for my $Author ($xpc->findnodes('/fbd:fb3-description/fbd:fb3-relations/fbd:subject[@link="'.$_.'"]')) {
push @Authors, '{Role:"' . $_ . '",' . $getAuthorNamePart->($Author) . '}';
}
}
$description .= ',Authors:[' . (join ",", @Authors) . ']' if scalar @Authors;
my @Translators;
for my $Translator ($xpc->findnodes('/fbd:fb3-description/fbd:fb3-relations/fbd:subject[@link="translator"]')) {
push @Translators, '{Role:"translator",' . $getAuthorNamePart->($Translator) . '}';
}
$description .= ',Translators:[' . (join ",", @Translators) . ']' if scalar @Translators;
my @Relations;
for my $ObjectNode ($xpc->findnodes('/fbd:fb3-description/fbd:fb3-relations/fbd:object')) {
my $ObjectId = EscString( $ObjectNode->getAttribute('id') );
my $ObjectType = EscString( $ObjectNode->getAttribute('link') );
my $ObjectTitle = EscString($xpc->findnodes('./fbd:title/fbd:main', $ObjectNode)->[0]->string_value);
push @Relations, sprintf('{id:"%s",type:"%s",title:"%s"}', $ObjectId, $ObjectType, $ObjectTitle);
}
$description .= ',Relations:[' . (join ",", @Relations) . ']' if scalar @Relations;
$description .= ',ArtID:"' . EscString($ArtID) . '"' if $ArtID;
my $DraftStr = '';
if ( my $FragmentNode = $xpc->findnodes('/fbd:fb3-description/fbd:draft-status')->[0]) {
$DraftStr = ',DraftStatus:{expected_chars:"'.$FragmentNode->getAttribute('expected-chars').'"'; #required
if (my $ExpFreq = $FragmentNode->getAttribute('expected-frequency')) {
$DraftStr .= ',expected_frequency:"'.$ExpFreq.'"';
}
if (my $ExpRelease = $FragmentNode->getAttribute('expected-release')) {
$DraftStr .= ',expected_release:"'.$ExpRelease.'"';
}
$DraftStr .= '}';
$description .= $DraftStr;
}
my $LengthStr = '';
if ( my $FragmentNode = $xpc->findnodes('/fbd:fb3-description/fbd:fb3-fragment')->[0] ) {
$IsTrial = 1;
$LengthStr = ",\n".'"fb3-fragment":{"full_length":'.$FragmentNode->getAttribute('full_length').',"fragment_length":'.$FragmentNode->getAttribute('fragment_length').'}';
} else {
my $RootNode = $BodyDoc->getDocumentElement();
my $CharsFull = length($RootNode->textContent);
$xpc->registerNs('fbb', &NS_FB3_BODY);
foreach my $TrialOnlyNode ($xpc->findnodes('/fbb:fb3-body/fbb:section[@output="trial-only"]',$RootNode)) {
$CharsFull -= (length($TrialOnlyNode->textContent) || 0);
}
$LengthStr = ",\n".'"full_length":'.$CharsFull;
}
return 'Meta:{' . $description . ',UUID:"' . $UUID . '",version:"' . $Version . '"}' . $LengthStr . ",\n";
}
sub DecodeUtf8 {
my $Out = shift;
unless (Encode::is_utf8($Out)) {
$Out = Encode::decode_utf8($Out);
}
return $Out;
}
sub EscString{
my $Esc=shift;
return unless defined $Esc;
$Esc = DecodeUtf8($Esc." ");
$Esc =~ s/(["\\])/\\$1/g;
$Esc =~ s/(?:\r?\n\r?)|(?:\t)/ /g;
$Esc =~ s/ $//;
return $Esc;
}
my %HyphCache;
sub ProceedTextNode{
my $Str=shift;
my $NeedHyph = shift;
return unless defined $Str;
$Str = EscString($Str);
return if $Str =~ /^\s+$/;
my $SRC = $Str;
if ($NeedHyph){
$Str = $HyphCache{$SRC} || HyphString($Str);
}
$Str =~ s/[ \t]+/ ","/g;
$Str =~ s/($LineBreakChars)(?![ "])/$1","/g;
$Str =~ s/"",|,""|","$//g;
if ($NeedHyph){
$HyphCache{$SRC} = $Str;
$Str =~ s/\x{AD}/","\x{AD}/g; # Full version
}
$Str = '"'.$Str.'"';
return $Str;
}
sub AnalyseNode {
my $Node = shift || return;
my $NodeName = shift;
my $ParentName = shift;
my $IsBlock = 0;
my $IsRootBlock = 0;
my $Printable = 1;
if ($NodeName eq 'section' || $NodeName eq 'fb3-body' || $NodeName eq 'notes' || $NodeName eq 'notebody') {
$IsBlock = 1;
$IsRootBlock = 1;
$Printable = 0;
} elsif (($NodeName eq 'p' && $ParentName =~ /$BlockPParents/) || $NodeName eq 'title' || $NodeName eq 'subtitle' || $NodeName eq 'epigraph' ||
$NodeName eq 'annotation' || $NodeName eq 'div' || $NodeName eq 'blockquote' || $NodeName eq 'subscription') {
$IsBlock = 1;
}
return ($IsBlock, $IsRootBlock, $Printable);
}
sub HypheNOBR {
my ($Word, $NOBRCharSeq) = @_;
# $Word = EscString($Word);
my $Esc = $HyphCache{$Word} || HyphString($Word);
unless ($Esc =~ s/\xAD?([^\xAD]+)$/<nobr>$1/s) {
$Esc = '<nobr>'.$Esc;
}
$Esc =~ s/\xAD//gis;
return $Esc . $NOBRCharSeq . '</nobr>';
}
sub GetImgSize {
my $File = shift;
open(my $TempStdErr, ">&STDERR"); # Image::Info много и не по делу говорит, воткнём кляп.
close STDERR;
my $ImgInfo = Image::Info::image_info($File);
open(STDERR, ">&", $TempStdErr); # А вот остальных послушаем.
$ImgInfo->{height} =~ s/\D//g;
my $Height = $ImgInfo->{height};
$ImgInfo->{width} =~ s/\D//g;
my $Width = $ImgInfo->{width};
return ($Height, $Width);
}
sub trim {
my $str = shift;
$str =~ s/^\s+//s;
$str =~ s/\s+$//s;
return $str;
}
# ------------------------ Hyphenation functions ------------------------------
sub HyphString {
use utf8;
my $word = shift;
return $word if $CannotHyph;
my @wordArrayWithUnknownSymbols = split $RgxNonChar , $word; #собрали все слова и неизвестные символы. Для слова "пример!№?;слова" будет содержать "пример", "!№?;", "слова".
for my $word (@wordArrayWithUnknownSymbols) {
next if $word =~ $RgxNonChar;
if (-e $Dictionary) {
$word = $Hyp->visualize($word);
$word =~ s/-/\x{AD}/g;
} else {
next if $Lang eq 'pl';
$word = HyphParticularWord($word);
}
}
return join "", @wordArrayWithUnknownSymbols;
}
sub HyphParticularWord {
use utf8;
my $word = shift;
my $softHyphMinPart = 2;
return $word if ( length($word) < 2 * $softHyphMinPart + 1 || $word eq uc($word));
my $wordCopy = $word; #чтобы сохранить оригинальное слово. А $word заменим структурным эквивалентном
$word =~ s/$RgxSoglasnie/S/g;
$word =~ s/$RgxGlasnie/G/g;
$word =~ s/$RgxZnaki/Q/g;
while ($word =~ s/$hyphenRegexPattern/Hyphenate($1,$2,$3,\$wordCopy,$softHyphMinPart)/ge) {}
return $wordCopy;
}
sub Hyphenate {
use utf8;
my ($leftFromPattern,$pattern,$rightFromPattern,$wordCopyRef,$softHyphMinPart) = @_;
my $leftOffsetOfCurrentHyphen = length($leftFromPattern) + index($hyphenPatterns->{$pattern},&HYPHEN);
my $rightOffsetOfCurrentHyphen = length(${$wordCopyRef}) - $leftOffsetOfCurrentHyphen; #слева дефисы не добавляются. Они добавляются справа
substr(${$wordCopyRef}, 0, $leftOffsetOfCurrentHyphen) .= &HYPHEN
if ($leftOffsetOfCurrentHyphen >= $softHyphMinPart && $rightOffsetOfCurrentHyphen >= $softHyphMinPart);
#переносы ставим только если остается у нас в конце и в начале по softHyphMinPart символов
return $leftFromPattern . $hyphenPatterns->{$pattern} . $rightFromPattern; #новую структуру кидаем в структурный эквивалент
}
1;
__DATA__
Usage:
fb3_2_json.pl --fb3 /path/to/fb3/dir --json /path/to/json/dir [ --version <file version> ] [ --lang <file language> ] [ --art-id <id> ] [ --dict <path> ]
e.g.
fb3_2_json.pl --fb3 /tmp/fb3 --json /tmp/json
fb3_2_json.pl --fb3 /tmp/fb3 --json /tmp/json --lang ru --art-id 1234567
fb3_2_json.pl --fb3 /tmp/fb3 --json /tmp/json --version 2.1 --lang es --art-id 1234567