Yet Another CPAN Grep

EAI-Wrap/lib/EAI/File.pm

package EAI::File 1.919;

use strict; use feature 'unicode_strings'; use warnings; no warnings 'uninitialized';
use Exporter qw(import); use Text::CSV(); use Data::XLSX::Parser(); use Spreadsheet::ParseExcel(); use Spreadsheet::WriteExcel(); use Excel::Writer::XLSX(); use XML::LibXML(); use XML::LibXML::Debugging();
use Carp qw(confess longmess); use Data::Dumper qw(Dumper); use Log::Log4perl qw(get_logger); use Time::localtime; use Scalar::Util qw(looks_like_number); use EAI::DateUtil; use EAI::Common;

our @EXPORT = qw(readText readExcel readXML writeText writeExcel);

# get common read procedure parameters from $File config, used in readText, readExcel and readXML
sub getcommon ($) {
	my ($File) = @_;
	my $lineProcessing = $File->{lineCode};
	my $fieldProcessing = $File->{fieldCode};
	my $firstLineProc = $File->{firstLineProc};
	my $thousandsep = "\\".$File->{format_thousandsep}; # add backslash to quote thousandsep for regexp (in case it is a ".")
	my $decimalsep = "\\".$File->{format_decimalsep}; # add backslash to quote decimalsep for regexp (in case it is a ".")
	my $skip = $File->{format_skip} if $File->{format_skip};
	my $sep = $File->{format_sep} if $File->{format_sep};
	$sep = $File->{format_defaultsep} if !$sep; # use default if not given
	# for fixed format and non existing separator, header and targetheader strings are parsed/split using tab as separator
	my @header = split(($sep =~ /^fix/ || !$sep ? "\t" : $sep), $File->{format_header}) if $File->{format_header};
	my @targetheader = split(($sep =~ /^fix/ || !$sep ? "\t" : $sep), $File->{format_targetheader}) if $File->{format_targetheader};
	$Data::Dumper::Terse = 1;
	get_logger()->debug("\$skip:$skip\n\$sep:".Data::Dumper::qquote($sep)."\n\@header:@header\n\@targetheader:@targetheader\n\$lineProcessing:$lineProcessing\n\$fieldProcessing:$fieldProcessing\n\$firstLineProc:$firstLineProc\n\$thousandsep:$thousandsep\n\$decimalsep:$decimalsep");
	$Data::Dumper::Terse = 0;
	return ($lineProcessing,$fieldProcessing,$firstLineProc,$thousandsep,$decimalsep,$sep,$skip,\@header,\@targetheader);
}

# read text files
sub readText ($$$;$$) {
	my ($File,$data,$filenames,$redoSubDir,$countPercent) = @_;
	my $logger = get_logger();
	my @filenames = @{$filenames} if $filenames;
	if (!@filenames) {
		$logger->error("no filenames passed".longmess());
		return 0;
	}
	# read format configuration
	my ($lineProcessing,$fieldProcessing,$firstLineProc,$thousandsep,$decimalsep,$sep,$skip,$header,$targetheader) = getcommon($File);
	my @header = @$header; my @targetheader = @$targetheader;
	my ($poslen, $isFixLen); 
	if ($sep =~ /^fix/) {
		# positions/length definitions from poslen definition: e.g. "format_poslen => [[0,3],[3,3]]"
		$poslen = $File->{format_poslen};
		if (!$poslen) {
			$logger->error("no format_poslen array given for parsing fix length format".longmess());
			return 0;
		}
		$isFixLen = 1;
	} else {
		if (!$sep) {
			$logger->error("no separator set in ".Dumper($File).longmess());
			return 0;
		}
		if ($File->{format_quotedcsv} and ref($sep) eq "Regexp") {
			$logger->error("no regex separator allowed with format_quotedcsv".longmess());
			return 0;
		}
		$logger->warn("string separator assumed to be a regular expression for splitting textfile without format_quotedcsv, this may produce unexpected results".longmess()) if (!$File->{format_quotedcsv} and !$File->{format_autoheader} and !(ref($sep) eq "Regexp"));
	}
	my $autoheader = $File->{format_autoheader} if $File->{format_autoheader};
	@targetheader = @header if !@targetheader; # if no specific targetheader defined use header instead
	# read all files with same format
	for my $filename (@filenames) {
		my $lines = 0; my $buffer;
		open my($fh), '<:raw', $redoSubDir.$filename;
		while( sysread $fh, $buffer, 4096 ) {
			$lines += ($buffer =~ s!$/!!g);
		}
		close $fh;
		$logger->debug("reading $redoSubDir$filename");
		open (FILE, "<".$File->{format_encoding}, $redoSubDir.$filename) or do { #
			if (! -e $redoSubDir.$filename) {
				$logger->error("no file $redoSubDir$filename to process...".longmess()) unless ($File->{optional});
				$logger->warn("no file $redoSubDir$filename found... ".longmess());
			} else {
				$logger->error("file open error: $!".longmess());
			}
			return 0;
		};
		my $sv = Text::CSV->new ({
			binary    => 1,
			auto_diag => 1,
			sep_char  => $sep,
			eol => ($File->{format_eol} ? $File->{format_eol} : $/),
		});
		# local context for special line record separator
		{
			my $newRecSep;
			if ($File->{format_allowLinefeedInData}) {
				# enable binmode and set line record separator to CRLF, so line feeds in values don't create artificial new lines/records
				binmode(FILE, ":raw".$File->{format_encoding}); # raw so not to swallow CRLF
				$newRecSep = "\015\012";
				$logger->debug("binmode");
			}
			# change record separator (standard CRLF), if needed
			local $/ = $newRecSep if $newRecSep;
			my @layers = PerlIO::get_layers(FILE);
			$logger->info("reading file $redoSubDir$filename, layers: @layers");
			if ($firstLineProc) {
				$_ = <FILE>;
				eval $firstLineProc;
				$logger->error("eval firstLineProc: ".$firstLineProc.$@.longmess()) if ($@);
				$logger->debug("evaled: ".$firstLineProc);
				$lines--;
			}
			if ($skip) {
				$skip-- if $firstLineProc; # if consumed already by firstLineProc skip one row less
				$logger->debug("skipping ".($skip =~ /^\d+$/ ? " $skip lines" : "until line contains $skip (inclusive)"));
				# skip first $skip rows in file (e.g. report header) if $skip is an integer, if $skip is non-integer, skip until the text $skip appears (inclusive)
				if ($skip =~ /^\d+$/) {
					for (1 .. $skip) {$_ = <FILE>};
					$lines-=$skip;
				} else {
					while (<FILE>) {
						$lines--;
						last if /$skip/;
					}
				}
			}
			# assumption: header exists in file and format_header should be derived from there
			if ($autoheader) {
				$sep = "," if !$sep;
				$_ = <FILE>; chomp;
				@header = split $sep;
				@targetheader = @header;
				$logger->debug("autoheader set, sep: [".$sep."], headings: @header");
			}
			# iterate through all rows of file
			my $lineno = 0;
			my (@line,@previousline);
LINE:
			while (<FILE>) {
				chomp;
				# in case lineProcessing or addtlProcessing needs access to whole row -> $rawline
				my $rawline = $_;
				# skip empty rows
				next LINE if $_ eq "";
				@previousline = @line;
				if ($isFixLen) {
					@line = undef;
					for (my $i=0;$i < scalar @header; $i++) {
						$line[$i] = substr ($_, $poslen->[$i][0],$poslen->[$i][1]-$poslen->[$i][0]);
					}
				} else {
					if ($File->{format_quotedcsv}) {
						if ($sv->parse($_)) {
							@line = $sv->fields();
						} else {
							$logger->error("couldn't parse quoted csv row: ".$sv->error_diag().longmess());
						}
					} else {
						@line = split $sep;
					}
				}
				$lineno++;
				print "EAI::File::readText read $lineno of $lines\r" if ($countPercent and ($lineno % (int($lines * ($countPercent / 100)) == 0 ? 1 : int($lines * ($countPercent / 100))) == 0)) or $countPercent >= 100;
				next LINE if $line[0] eq "" and !$lineProcessing;
				readRow($data,\@line,\@header,\@targetheader,$rawline,$lineProcessing,$fieldProcessing,$thousandsep,$decimalsep,$lineno);
			}
		}
		close FILE;
	}
	if (!$data or !@{$data}) {
		if ($File->{emptyOK}) {
			$logger->warn("no data retrieved from file(s): @filenames, will be ignored because \$File{emptyOK}".longmess());
		} else {
			$logger->error("no data retrieved from file(s): @filenames".longmess());
		}
		return 0;
	}
	if ($logger->is_trace) {
		$logger->trace("amount of rows:".scalar(@{$data})) if $data;
		$Data::Dumper::Deepcopy = 1;
		$logger->trace(Dumper($data));
		$Data::Dumper::Deepcopy = 0;
	}
	return 1;
}

# global variables for excel parsing
my $startRowHeader; # header row for check (if format_header is defined), needed globally to avoid accidental date formatting
my %dateColumn; # lookup for columns with date values (key: excel column, numeric, starting with 1, value: 1 (boolean))
my %headerColumn; # lookup for header (key: excel column, numeric, starting with 1, actual column of header field, value: 1 (boolean))
my $worksheet; # worksheet to be read, old format (numeric, starting with 1)
my %dataRows; # intermediate storage for row values
my $maxRow; # bottom most row
my $stoppedOnEmptyValue; 
my $stopOnEmptyValueColumn;

# event handler for readExcel (xls format)
sub cell_handler {
	my $workbook    = $_[0];
	# for the Spreadsheet::ParseExcel index, rows and columns are 0 based, generally row semantics is 1 based
	my $sheet_index = $_[1]+1;
	my $row         = $_[2]+1;
	my $col         = $_[3]+1;
	my $cell        = $_[4];
	my $logger = get_logger();
	return unless $sheet_index eq $worksheet; # only parse desired worksheet
	if ($headerColumn{$col}) {
		if (($stopOnEmptyValueColumn eq $col && !$cell) || $stoppedOnEmptyValue) {
			$logger->warn("empty cell in row $row / column $col and stopOnEmptyValueColumn is set to $col, skipping from here now".longmess()) if !$stoppedOnEmptyValue; # pass warning only once
			$stoppedOnEmptyValue = 1;
		} else {
			$logger->trace("Row $row, Column $col:\n".Dumper($cell)) if $logger->is_trace;
			if ($dateColumn{$col} and $row != $startRowHeader) {
				# with date values need value(), otherwise (unformatted) a julian date (decimal representing date and time) is returned
				# parse from US date format into YYYYMMDD, time parts are still ignored!
				if ($cell) {
					my ($m,$d,$y) = ($cell->value() =~ /(\d+?)\/(\d+?)\/(\d{4})/);
					$dataRows{$row}{$col} = sprintf("%04d%02d%02d",$y,$m,$d);
				}
			} else {
				# non date values are fetched unformatted
				$dataRows{$row}{$col} = $cell->unformatted() if $cell;
			}
			$maxRow = $row if $maxRow < $row;
			#$logger->info(Dumper($cell));
			#my $stopHere = <STDIN>; # for step debugging, uncomment these 2 lines
		}
	}
}

my $countPercentExcel;
# event handler for readExcel (xlsx format)
sub row_handlerXLSX {
	my $rowDetails = $_[1];
	my $logger = get_logger();
	# for the Data::XLSX::Parser index, rows and columns are 1 based
	for my $cellDetail (@$rowDetails) {
		my $row = $cellDetail->{"row"};
		my $col = $cellDetail->{"c"};
		my $value = $cellDetail->{"v"};
		if ($headerColumn{$col}) {
			if (($stopOnEmptyValueColumn eq $col && !$value) || $stoppedOnEmptyValue) {
				$logger->warn("empty cell in row $row / column $col and stopOnEmptyValueColumn is set to $col, skipping from here now".longmess()) if !$stoppedOnEmptyValue; # pass warning only once
				$stoppedOnEmptyValue = 1;
			} else {
				$logger->trace("Row $row, Column $col:\n".Dumper($cellDetail)) if $logger->is_trace;
				if ($dateColumn{$col} and $row != $startRowHeader) {
					# date fields are converted from epoch format !
					$dataRows{$row}{$col} = convertEpochToYYYYMMDD($value);
				} else {
					# non date values taken directly
					$dataRows{$row}{$col} = $value;
				}
				$maxRow = $row if $maxRow < $row;
			}
		}
		print "EAI::File::readExcel read $row rows\r" if $countPercentExcel and ($row % 50 == 0);
	}
}

# read Excel file (format depends on setting)
sub readExcel ($$$;$$) {
	my ($File,$data,$filenames,$redoSubDir,$countPercent) = @_;
	$countPercentExcel = $countPercent if $countPercent;
	my $logger = get_logger();
	$stopOnEmptyValueColumn = $File->{format_stopOnEmptyValueColumn};
	$stoppedOnEmptyValue = 0; # reset
	my @filenames = @{$filenames} if $filenames;
	if (!@filenames) {
		$logger->error("no filenames passed".longmess());
		return 0;
	}
	# reset module global variables
	undef %dateColumn;
	undef %headerColumn;
	# read format configuration
	my ($lineProcessing,$fieldProcessing,$firstLineProc,$thousandsep,$decimalsep,$sep,$skip,$header,$targetheader) = getcommon($File);
	my @header = @$header; my @targetheader = @$targetheader;
	if (!@targetheader) {
		$logger->error("no targetheader defined".longmess()); # targetheader has to be given, excel source header (@header) optional
		return 0;
	}
	$logger->debug("skip: $skip,headerskip: ". $File->{format_headerskip}.", header: @header \ntargetheader: @targetheader\ndateColumns: ".($File->{format_dateColumns} ? @{$File->{format_dateColumns}} : "")."\nheaderColumns: ".($File->{format_headerColumns} ? @{$File->{format_headerColumns}} : ""));
	# prepare dateColumn definition if needed/given
	if ($File->{format_dateColumns} and ref($File->{format_dateColumns}) eq "ARRAY") {
		for my $col (@{$File->{format_dateColumns}}) {
			$dateColumn{$col} = 1;
		}
	}
	# prepare headerColumn definition
	if ($File->{format_headerColumns} and ref($File->{format_headerColumns}) eq "ARRAY") {
		if (@{$File->{format_headerColumns}} != @header or @{$File->{format_headerColumns}} != @targetheader) {
			$logger->error("format_headerColumns has different length than format_header or format_targetheader definitions".longmess());
			return 0;
		}
		for my $col (@{$File->{format_headerColumns}}) {
			$headerColumn{$col} = 1;
		}
	} else {
		if (@header and @header != @targetheader) {
			$logger->error("format_header has different length than format_targetheader definition".longmess());
			return 0;
		}
		$logger->debug("no format_headerColumns given, assuming simple list starting with column 1, having \@header length columns and a header row") if @header;
		$logger->debug("no format_headerColumns and no header definition given, assuming simple list starting with column 1, having \@targetheader length columns and no header row") if !@header;
		for (my $i = 0; $i < @targetheader; $i++) {
			$headerColumn{$i+1} = 1;
		}
	}
	$logger->debug("headerColumn:".Dumper(\%headerColumn).",dateColumn:".Dumper(\%dateColumn));
	@header = @targetheader if !@header; # in the end only target header is important
	# read all files with same format
	for my $filename (@filenames) {
		my $startRow = 1; # starting data row
		$startRowHeader = 1; # starting header row for check (if format_header is defined)
		if ($skip =~ /^\d+$/) {
			$logger->debug("skipping ".$skip." rows for data begin"); 
			$startRow += $skip; # skip additional rows for data begin, row semantics is 1 based
		}
		if ($File->{format_headerskip}) {
			$logger->debug("skipping ".$File->{format_headerskip}." rows for header row (1)"); 
			$startRowHeader += $File->{format_headerskip}; # skip additional rows for header row, row semantics is 1 based
		}
		if (!$skip and $File->{format_header}) {
			$logger->debug("setting data begin to \$startRowHeader ($startRowHeader) + 1 as format_header given and no format_skip found"); 
			$startRow = $startRowHeader + 1; # set to header following row if format_skip not defined and format_header given
		}
		# reset module global variables
		%dataRows = ();
		$maxRow = 1;
		# check excel file existence
		if (! -e $redoSubDir.$filename) {
			$logger->error("no excel file ($filename) to process: $!") unless ($File->{optional});
			$logger->warn("no file $redoSubDir$filename found".longmess()); 
			return 0;
		}
		# read in excel file/sheet completely, both formats utilize read handlers (row_handlerXLSX or cell_handler)
		my $parser;
		if ($File->{format_xlformat} =~ /^xlsx$/i) {
			$logger->debug("open xlsx file $redoSubDir$filename ... ");
			$parser = Data::XLSX::Parser->new;
			$parser->open($redoSubDir.$filename);
			$parser->add_row_event_handler(\&row_handlerXLSX);
			if ($File->{format_worksheet}) {
				$worksheet = $parser->workbook->sheet_id($File->{format_worksheet});
				if (!$worksheet) {
					$logger->error("no xlsx worksheet found named ".$File->{format_worksheet}.", maybe try {format_worksheetID} (numerically ordered place)".longmess());
					return 0;
				}
			} elsif ($File->{format_worksheetID}) {
				$worksheet = $File->{format_worksheetID};
			} else {
				$logger->error("neither worksheetname nor worksheetID (numerically ordered place) given for xlsx workbook".longmess());
				return 0;
			}
			$logger->debug("starting parser for xlsx sheet name: ".$File->{format_worksheet}.", id:".$worksheet);
			eval { $parser->sheet_by_id($worksheet); };
			if ($@) {
				$logger->error("Error parsing xlsx sheet: ".$@.longmess());
				return 0;
			}
		} elsif ($File->{format_xlformat} =~ /^xls$/i) {
			$logger->warn("worksheets can't be found by name for the old xls format, please pass numerically ordered place in {format_worksheetID}".longmess()) if ($File->{format_worksheet});
			$worksheet = $File->{format_worksheetID} if $File->{format_worksheetID};
			$logger->debug("starting parser for xls file $redoSubDir$filename ... ");
			$parser = Spreadsheet::ParseExcel->new(
				CellHandler => \&cell_handler,
				NotSetCell  => 1
			);
			my $workbook = $parser->parse($redoSubDir.$filename);
			if (!defined $workbook) {
				$logger->error("excel xls parsing error: ".$parser->error().longmess());
				return 0;
			}
		} else {
			$logger->error("unrecognised excel format passed in \$File->{format_xlformat}:".$File->{format_xlformat}.longmess());
			return 0;
		}
		# check header row if format_header given
		if ($File->{format_header}) {
			$logger->info("checking header info in row $startRowHeader");
			if ($File->{format_headerColumns}) {
				my $i = 0;
				for (@{$File->{format_headerColumns}}) {
					$logger->error("expected header '".$header[$i]."' not in column ".$_.", instead got:".$dataRows{$startRowHeader}{$_}.longmess()) if $header[$i] ne $dataRows{$startRowHeader}{$_};
					$i++;
				}
			} else {
				for (my $i = 0; $i < @header; $i++) {
					$logger->error("expected header '".$header[$i]."' not in column ".($i+1).", instead got:".$dataRows{$startRowHeader}{$i+1}.longmess()) if $header[$i] ne $dataRows{$startRowHeader}{$i+1};
				}
			}
		}
		# now iterate data rows
		my (@line,@previousline);
		$logger->debug("(data) start row: $startRow, (data) end row: $maxRow");
LINE:
		# $maxRow is being set when reading in the sheet
		my $lines = $maxRow - $startRow;
		for my $lineno ($startRow .. $maxRow) {
			@previousline = @line;
			@line = undef;
			# get @line from stored values
			if ($File->{format_headerColumns}) {
				my $i = 0;
				for (@{$File->{format_headerColumns}}) {
					$line[$i] = $dataRows{$lineno}{$_};
					$i++;
				}
			} else {
				for (my $i = 0; $i < @header; $i++) {
					$line[$i] = $dataRows{$lineno}{$i+1};
				}
			}
			readRow($data,\@line,\@header,\@targetheader,undef,$lineProcessing,$fieldProcessing,$thousandsep,$decimalsep,$lineno);
			print "EAI::File::readExcel parsed $lineno of $lines\r" if $countPercent and ($lineno % (int($lines * ($countPercent / 100)) == 0 ? 1 : int($lines * ($countPercent / 100))) == 0);
		}
		close FILE;
		if (scalar(@{$data}) == 0 and !$File->{emptyOK}) {
			$logger->error("Empty file: $filename, no data returned !!".longmess());
			return 0;
		}
	}
	$logger->trace("amount of rows: ".scalar(@{$data})) if $logger->is_trace;
	$logger->trace(Dumper($data)) if $logger->is_trace;
	return 1;
}

# read XML file
sub readXML ($$$;$) {
	my ($File,$data,$filenames,$redoSubDir) = @_;
	my $logger = get_logger();
	my @filenames = @{$filenames} if $filenames;
	if (!@filenames) {
		$logger->error("no filenames passed".longmess());
		return 0;
	}
	# read format configuration
	my ($lineProcessing,$fieldProcessing,$firstLineProc,$thousandsep,$decimalsep,$sep,$skip,$header,$targetheader) = getcommon($File);
	my @header = @$header; my @targetheader = @$targetheader;
	if (!@header) {
		$logger->error("no header defined".longmess()); # targetheader has to be given, excel source header (@header) optional
		return 0;
	}
	$Data::Dumper::Terse = 1;
	$logger->debug("sep:".Data::Dumper::qquote($sep).",header:@header\ntargetheader:@targetheader");
	$Data::Dumper::Terse = 0;
	@targetheader = @header if !@targetheader; # if no specific targetheader defined use header instead
	# read all files with same format
	for my $filename (@filenames) {
		if (! -e $redoSubDir.$filename) {
			$logger->error("no XML file ($redoSubDir$filename) found to process".longmess()) unless ($File->{optional});
			$logger->warn("file $redoSubDir$filename not found".longmess());
			return 0;
		}
		my $xmldata = XML::LibXML->load_xml(location => $redoSubDir.$filename, no_blanks => 1);
		my $xpc = XML::LibXML::XPathContext->new($xmldata);
		if (ref($File->{format_namespaces}) eq 'HASH') {
			$xpc->registerNs($_, $File->{format_namespaces}{$_}) for keys (%{$File->{format_namespaces}});
		}
		$logger->error("no format_xpathRecordLevel passed".longmess()) unless ($File->{format_xpathRecordLevel});
		$logger->error("no format_fieldXpath hash passed".longmess()) unless ($File->{format_fieldXpath} && ref($File->{format_fieldXpath}) eq 'HASH');
		$logger->trace("format_xpathRecordLevel: ".$File->{format_xpathRecordLevel}) if $logger->is_trace;
		$logger->trace("format_fieldXpath: ".Dumper($File->{format_fieldXpath})) if $logger->is_trace;
		my @records = $xpc->findnodes($File->{format_xpathRecordLevel});
		$logger->warn("no records found".longmess()) if @records == 0;
		$logger->trace("total document content: ".$xpc->getContextNode->toClarkML()) if $logger->is_trace;
		# iterate through all rows of file
		my $lineno = 0;
		foreach my $record (@records) {
			my @line;
			# get @line from stored values
			if (ref($record) eq "XML::LibXML::Element") {
				$logger->trace("node content: ".$record->toClarkML()) if $logger->is_trace;
				my @headerColumns = keys (%{$File->{format_fieldXpath}});
				for (my $i = 0; $i < @headerColumns; $i++) {
					$logger->trace("field:".$header[$i].",\$File->{format_fieldXpath}{".$header[$i]."}:".$File->{format_fieldXpath}{$header[$i]}) if $logger->is_trace;
					if ($File->{format_fieldXpath}{$header[$i]} =~ /^\//) {
						# absolute paths -> leave context node and find in the root doc (no context node argument)
						$logger->trace("absolute fieldXpath:".$File->{format_fieldXpath}{$header[$i]}) if $logger->is_trace;
						$line[$i] = $xpc->findvalue($File->{format_fieldXpath}{$header[$i]});
					} else {
						# relative paths -> context node is current record node
						$logger->trace("relative fieldXpath:".$File->{format_fieldXpath}{$header[$i]}) if $logger->is_trace;
						$line[$i] = $xpc->findvalue($File->{format_fieldXpath}{$header[$i]}, $record);
					}
				}
			}
			$lineno++;
			readRow($data,\@line,\@header,\@targetheader,$xpc,$lineProcessing,$fieldProcessing,$thousandsep,$decimalsep,$lineno);
		}
		if (!$data and !$File->{emptyOK}) {
			$logger->error("empty file: $filename, no data returned".longmess());
			return 0;
		}
	}
	return 1;
}

# to be able to access these variables in fieldCode and/or lineCode anonymous subs, they have to be package global. Access in anon sub is then done with %EAI::File::line, @EAI::File::header or $EAI::File::i (this column loop var is only meaningful in fieldCode)
our (%line,%templine,$i,@line,@header,@targetheader,$skipLineAssignment,$lineno,$rawline);

# read row into final line hash (including special "hook" code)
sub readRow ($$$$$$$$$$) {
	my ($data,$line,$header,$targetheader,$lineProcessing,$fieldProcessing,$thousandsep,$decimalsep);
	($data,$line,$header,$targetheader,$rawline,$lineProcessing,$fieldProcessing,$thousandsep,$decimalsep,$lineno) = @_;
	@line = @$line;
	@header = @$header;
	@targetheader = @$targetheader;
	my $logger = get_logger();
	$skipLineAssignment = 0; # can be set in fieldCode, to avoid further assignment to data.
	%line=(); %templine=();

	$logger->trace("line:@{$line},header:@{$header},targetheader:@{$targetheader},rawline:$rawline,lineProcessing:$lineProcessing,thousandsep:$thousandsep,decimalsep:$decimalsep,lineno:$lineno") if $logger->is_trace;
	# iterate through fields of current row
	for ($i = 0; $i < @line; $i++) {
		# first trim leading and trailing spaces
		$line[$i] =~ s/^ *//;
		$line[$i] =~ s/ *$//;
		# remove thousand separators for numerals based on configured thousand/decimal separator and change decimal separator to \d+\.?\d*
		$line[$i] =~ s/$thousandsep//g if $line[$i] =~ /^-?\d{1,3}($thousandsep\d{3})+($decimalsep\d*)?$/;
		if ($decimalsep ne "\\.") {
			$line[$i] =~ s/$decimalsep/\./ if $line[$i] =~ /^-?\d+$decimalsep\d+$/ or $line[$i] =~ /^-*\d*$decimalsep?\d+E*[-+]*\d*$/;
		}
		
		# only process as targetheader, if they are not the same as the original header (allows special access to original header via $templine/$previoustempline)
		if ($header[$i] ne $targetheader[$i]) {
			# prevent autovivification of hash entries, if $i is potentially > @header or > @targetheader
			$line{$targetheader[$i]} = $line[$i] if $targetheader[$i];
			$templine{$header[$i]} = $line[$i] if $header[$i];
		} else {
			$line{$header[$i]} = $line[$i] if $header[$i];
		}
		# field specific processing set, augments processing for a single specific field specified by targetheader...
		if ($fieldProcessing->{$targetheader[$i]}) {
			$logger->trace('specific fieldProcessing: $targetheader['.$i.']:'.$targetheader[$i].',$line{'.$targetheader[$i].']:'.$line{$targetheader[$i]}.',fieldProcessing{',$targetheader[$i],'}:'.$fieldProcessing->{$targetheader[$i]}) if $logger->is_trace;
			if (ref($fieldProcessing->{$targetheader[$i]}) eq "CODE") {
				eval {$fieldProcessing->{$targetheader[$i]}->()};
			} else {
				eval $fieldProcessing->{$targetheader[$i]};
			}
			$logger->error("eval of ".(ref($fieldProcessing->{$targetheader[$i]}) eq "CODE" ? "defined sub" : "'".$fieldProcessing->{$targetheader[$i]}."'")." returned error:$@".longmess()) if ($@);
		} elsif ($fieldProcessing->{""}) { # special case: if empty key is defined with processing code, do for all fields
			$logger->trace('general fieldProcessing: $targetheader['.$i.']:'.$targetheader[$i].',$line{'.$targetheader[$i].']:'.$line{$targetheader[$i]}.',fieldProcessing{',$targetheader[$i],'}:'.$fieldProcessing->{$targetheader[$i]}) if $logger->is_trace;
			if (ref($fieldProcessing->{""}) eq "CODE") {
				eval {$fieldProcessing->{""}->()};
			} else {
				eval $fieldProcessing->{""};
			}
			$logger->error("eval of ".(ref($fieldProcessing->{""}) eq "CODE" ? "defined sub" : "'".$fieldProcessing->{""}."'")." returned error:$@".longmess()) if ($@);
		}
	}
	# additional row processing defined
	if ($lineProcessing) {
		if (ref($lineProcessing) eq "CODE") {
			eval {$lineProcessing->()};
		} else {
			eval $lineProcessing;
		}
		$logger->error("eval of ".(ref($lineProcessing) eq "CODE" ? "defined sub" : "'".$lineProcessing."'")." returned error:$@".longmess()) if ($@);
	}
	$logger->trace("\$skipLineAssignment: $skipLineAssignment, \%line:\n".Dumper(\%line)) if $logger->is_trace;
	# add reference to created line (don't do push @{$data}, \%line here as then subsequent lines will overwrite all before!)
	push @{$data}, {%line} if keys %line > 0 and !$skipLineAssignment;
}

our $value;
# write text file
sub writeText ($$;$) {
	my ($File,$data,$countPercent) = @_;
	my $logger = get_logger();
	my $filename = $File->{filename};
	my $writemode = ($File->{append} ? ">>" : ">");
	$logger->debug("sepHead: ".Data::Dumper::qquote($File->{format_sepHead}).", sep: ".Data::Dumper::qquote($File->{format_sep}));
	if (ref($data) ne 'ARRAY') {
		$logger->error("passed data in \$data is not a ref to array (you have to initialize it as an array):".Dumper($data).longmess());
		return 0;
	}
	# in case we need to print out csv/quoted values
	my $sv;
	if ($File->{format_quotedcsv}) {
		$sv = Text::CSV->new ({
			binary    => 1,
			auto_diag => 1,
			sep_char  => $File->{format_sep},
			eol => ($File->{format_eol} ? $File->{format_eol} : $/),
		});
	}
	my @columnnames; my @paddings;
	if (ref($File->{columns}) eq 'HASH') {
		@columnnames = map {$File->{columns}{$_}} sort keys %{$File->{columns}};
	} elsif (ref($File->{columns}) eq 'ARRAY') {
		@columnnames = @{$File->{columns}};
	} else {
		$logger->error("no field information given (columns should be ref to array or ref to hash, you have to initialize it as that)".longmess());
		return 0;
	}
	if (ref($File->{format_padding}) eq 'ARRAY') {
		@paddings = @{$File->{format_padding}};
	} elsif (ref($File->{format_padding}) eq 'HASH') {
		@paddings = map {$File->{format_padding}{$_}} sort keys %{$File->{format_padding}};
	} else {
		if ($File->{format_fix}) {
			$logger->error("no padding information given for fixed length format (padding => ref to array or hash)".longmess());
			return 0;
		}
	}
	$logger->debug("fields: @columnnames");
	$logger->debug("paddings: @paddings");
	my $headerRow;
	my $col = 0; # iterate through @paddings in parallel.
	my $firstcol = 1;
	for my $colname (@columnnames) {
		if (!$File->{columnskip}{$colname}) {
			if ($File->{format_quotedcsv}) {
				push @$headerRow, $colname;
			} else {
				# first column has no separator before. if there is a special separator for heading, then use it, else the standard one
				$headerRow = $headerRow.($firstcol ? "" : ($File->{format_sepHead} ? $File->{format_sepHead} : $File->{format_sep})).$colname if (!$File->{format_fix});
				$headerRow = $headerRow.sprintf("%-*s%s", $paddings[$col],$colname) if ($File->{format_fix});
				$firstcol = 0;
			}
		}
		$col++;
	}
	# open file for writing
	$logger->info("writing to $filename ($writemode)");
	open (FHOUT, $writemode.$File->{format_encoding},$filename) or do {
		$logger->error("couldn't open $filename for writing (writemode $writemode): $!".longmess());
		return 0;
	};
	# write header
	print FHOUT $File->{format_beforeHeader} if $File->{format_beforeHeader};
	unless ($File->{format_suppressHeader}) {
		if ($File->{format_quotedcsv}) {
			if (!$sv->print(\*FHOUT, $headerRow)) {
				$logger->error("error writing quoted csv header row: ".$sv->error_diag().longmess());
				return 0;
			}
		} else {
			print FHOUT $headerRow."\n";
		}
	}
	# write data
	$logger->trace("passed data:\n".Dumper($data)) if $logger->is_trace;
	my $lines = scalar(@{$data});
	for (my $i=0; $i<$lines; $i++) {
		# data row
		my $row = $data->[$i];
		my $lineRow;
		# chain all data in a row
		my $col = 0; $firstcol = 1;
		for my $colname (@columnnames) {
			if (!$File->{columnskip}{$colname}) {
				if (ref($row) ne "HASH") {
					$logger->error("row passed in (\$data) is no ref to hash! should be \$VAR1 = {'key' => 'value', ...}:\n".Dumper($row).longmess());
					return 0;
				}
				$value = $row->{$colname};
				$logger->trace("\$value for \$colname $colname: $value") if $logger->is_trace;
				if ($File->{addtlProcessingTrigger} && $File->{addtlProcessing}) {
					my $doAddtlProcessing = eval $File->{addtlProcessingTrigger};
					if ($@) {
						$logger->error("error in eval addtlProcessingTrigger: ".$File->{addtlProcessingTrigger}.":".$@.longmess());
						return 0;
					}
					if ($doAddtlProcessing) {
						if (ref( $File->{addtlProcessing}) eq "CODE") {
							eval {$File->{addtlProcessing}->()};
						} else {
							eval $File->{addtlProcessing};
						}
						if ($@) {
							$logger->error("error in eval addtlProcessing: ".$File->{addtlProcessing}.":".$@.longmess());
							return 0;
						}
						$logger->trace("\$value after addtlProcessing: $value") if $logger->is_trace;
					}
				}
				if ($File->{format_quotedcsv}) {
					push @$lineRow, $value;
				} else {
					# last column ($columnnames[@columnnames-1]) should not have a separator afterwards
					$lineRow = $lineRow.($firstcol ? "" : $File->{format_sep}).sprintf("%s", $value) if (!$File->{format_fix});
					# additional padding for fixed length format
					$lineRow = $lineRow.sprintf("%-*s%s", $paddings[$col],$value) if ($File->{format_fix});
					$firstcol = 0;
				}
			}
			$col++;
		}
		if ($File->{format_quotedcsv}) {
			if (!$sv->print(\*FHOUT, $lineRow)) {
				$logger->error("error writing quoted csv row: ".$sv->error_diag().longmess());
				return 0;
			}
			$logger->trace("row: @$lineRow") if $logger->is_trace;
		} else {
			print FHOUT $lineRow."\n";
			$logger->trace("row: ".$lineRow) if $logger->is_trace;
		}
		print "EAI::File::writeText written $i of $lines\r" if $countPercent and ($i % (int($lines * ($countPercent / 100)) == 0 ? 1 : int($lines * ($countPercent / 100))) == 0);
	}
	close FHOUT;
	return 1;
}

# write Excel file
sub writeExcel ($$;$) {
	my ($File,$data,$countPercent) = @_;
	my $logger = get_logger();
	
	if (ref($data) ne 'ARRAY') {
		$logger->error("passed data in \$data is not a ref to array:".Dumper($data).longmess());
		return 0;
	}
	my @columnnames;
	if (ref($File->{columns}) eq 'HASH') {
		@columnnames = map {$File->{columns}{$_}} sort keys %{$File->{columns}};
	} elsif (ref($File->{columns}) eq 'ARRAY') {
		@columnnames = @{$File->{columns}}; 
	} else {
		$logger->error("no field information given (columns should be ref to array or ref to hash, you have to initialize it as that)".longmess());
		return 0;
	}
	my ($workbook,$worksheet);
	if ($File->{format_xlformat} =~ /^xls$/i) {
		$logger->debug("writing to xls format file ".$File->{filename});
		$workbook = Spreadsheet::WriteExcel->new($File->{filename}) or do {
			$logger->error("xls file creation error: $!".longmess());
			return 0;
		};
	} elsif ($File->{format_xlformat} =~ /^xlsx$/i) {
		$logger->debug("writing to xlsx format file ".$File->{filename});
		$workbook = Excel::Writer::XLSX->new($File->{filename}) or do {
			$logger->error("xlsx file creation error: $!".longmess());
			return 0;
		};
	} else {
		$logger->error("unrecognised excel format passed in \$File->{format_xlformat}:".$File->{format_xlformat}." (allowed: xls and xlsx)".longmess());
		return 0;
	}
	# Add a worksheet
	$worksheet = $workbook->add_worksheet();
	$logger->debug("fields: @columnnames");
	my @headerRow;
	for my $colname (@columnnames) {
		if (!$File->{columnskip}{$colname}) {
			push @headerRow, $colname;
		}
	}
	# write header
	unless ($File->{format_suppressHeader}) {
		for my $col (0 .. @headerRow) {
			$worksheet->write(0,$col,$headerRow[$col]);
		}
	}
	# write data
	$logger->trace("passed data:\n".Dumper($data)) if $logger->is_trace;
	my $lines = scalar(@{$data});
	for (my $i=0; $i<$lines; $i++) {
		# data row
		my $row = $data->[$i];
		my @lineRow;
		# chain all data in a row
		for my $colname (@columnnames) {
			if (!$File->{columnskip}{$colname}) {
				$logger->error("row passed in (\$data) is no ref to hash! should be \$VAR1 = {'key' => 'value', ...}:\n".Dumper($row).longmess()) if (ref($row) ne "HASH");
				my $value = $row->{$colname};
				$logger->trace("\$value for \$colname $colname: $value") if $logger->is_trace;
				if ($File->{addtlProcessingTrigger} && $File->{addtlProcessing}) {
					eval $File->{addtlProcessingTrigger} if (eval $File->{addtlProcessingTrigger});
					$logger->error("error in eval addtlProcessing: ".$File->{addtlProcessingTrigger}.":".$@.longmess()) if ($@);
				}
				push @lineRow, $value;
			}
		}
		for my $col (0 .. @lineRow) {
			$worksheet->write($i+1,$col,$lineRow[$col]);
		}
		print "EAI::File::writeExcel written $i of $lines\r" if $countPercent and ($i % (int($lines * ($countPercent / 100)) == 0 ? 1 : int($lines * ($countPercent / 100))) == 0);
		$logger->trace("row: @lineRow") if $logger->is_trace();
	}
	$workbook->close();
	return 1;
}
1;
__END__

=head1 NAME

EAI::File - read/parse Files from the filesystem or write to the filesystem

=head1 SYNOPSIS

 readText ($File, $data, $filenames, $redoSubDir, $countPercent)
 readExcel ($File, $data, $filenames, $redoSubDir)
 readXML ($File, $data, $filenames, $redoSubDir)
 writeText ($File, $data)
 writeExcel ($File, $data)

=head1 DESCRIPTION

EAI::File contains all file parsing API-calls. This is for reading plain text data (also as quoted csv), reading excel data (old 2003 and new 2007+ format), reading xml data, writing plain text data and excel files.

=head2 API

=over

=item readText ($$$;$$)

reads the defined text file with specified parameters into array of hashes (DB ready structure)

 $File      .. hash ref for File specific configuration
 $data      .. hash ref for returned data (hashkey "data" -> above mentioned array of hashes)
 $filenames .. array of file names, if explicit (given in case of mget and unpacked zip archives).
 $redoSubDir .. (optional) redo subdirectory, where file can be taken alternatively to homedir.
 $countPercent .. (optional) percentage of progress where indicator should be output (e.g. 10 for all 10% of progress). set to 0 to disable progress indicator

returns 0 on error, 1 if OK

=item readExcel ($$$;$$)

reads the defined excel file with specified parameters into array of hashes (DB ready structure)

 $File      .. hash ref for File specific configuration
 $data      .. hash ref for returned data (hashkey "data" -> above mentioned array of hashes)
 $filenames .. array of file names, if explicit (given in case of mget and unpacked zip archives).
 $redoSubDir .. (optional) redo subdirectory, where file can be taken alternatively to homedir.
 $countPercent .. (optional) percentage of progress where indicator should be output (e.g. 10 for all 10% of progress). set to 0 to disable progress indicator

returns 0 on error, 1 if OK

=item readXML ($$$;$)

reads the defined XML file with specified parameters into array of hashes (DB ready structure)

 $File      .. hash ref for File specific configuration
 $data      .. hash ref for returned data (hashkey "data" -> above mentioned array of hashes)
 $filenames .. array of filenamea, if explicit (given in case of mget and unpacked zip archives).
 $redoSubDir .. (optional) redo subdirectory, where file can be taken alternatively to homedir.

returns 0 on error, 1 if OK

For all read<*> functions custom "hooks" can be defined with L<fieldCode|/fieldCode> and L<lineCode|/lineCode> to modify and enhance the standard mapping defined in format_header. To access the final line data the hash %EAI::File::line can be used (specific fields with $EAI::File::line{<target header column>}). if a field is being replaced using a different name from targetheader, the data with the original header name is placed in %EAI::File::templine. You can also access data from the previous line with %EAI::File::previousline and the previous temp line with %EAI::File::previoustempline.

=item writeText ($$;$)

writes a text file using specified parameters from array of hashes (DB structure) 

 $File      .. hash ref for File specific configuration
 $data      .. hash ref for returned data (hashkey "data" -> above mentioned array of hashes)
 $countPercent .. (optional) percentage of progress where indicator should be output (e.g. 10 for all 10% of progress). set to 0 to disable progress indicator

returns 0 on error, 1 if OK

=item writeExcel ($$;$)

writes an excel file using specified parameters from array of hashes (DB structure) 

 $File      .. hash ref for File specific configuration
 $data      .. hash ref for returned data (hashkey "data" -> above mentioned array of hashes)
 $countPercent .. (optional) percentage of progress where indicator should be output (e.g. 10 for all 10% of progress). set to 0 to disable progress indicator

returns 0 on error, 1 if OK

=back

=head1 COPYRIGHT

Copyright (c) 2025 Roland Kapl

All rights reserved.  This program is free software; you can
redistribute it and/or modify it under the same terms as Perl itself.

The full text of the license can be found in the LICENSE file included
with this module.

=cut