Yet Another CPAN Grep

Pg-Explain/lib/Pg/Explain.pm

package Pg::Explain;

# UTF8 boilerplace, per http://stackoverflow.com/questions/6162484/why-does-modern-perl-avoid-utf-8-by-default/
use v5.18;
use strict;
use warnings;
use warnings qw( FATAL utf8 );
use utf8;
use open qw( :std :utf8 );
use Unicode::Normalize qw( NFC );
use Unicode::Collate;
use Encode qw( decode );

if ( grep /\P{ASCII}/ => @ARGV ) {
    @ARGV = map { decode( 'UTF-8', $_ ) } @ARGV;
}

# UTF8 boilerplace, per http://stackoverflow.com/questions/6162484/why-does-modern-perl-avoid-utf-8-by-default/

use Carp;
use Clone qw( clone );
use autodie;
use List::Util qw( sum );
use Pg::Explain::StringAnonymizer;
use Pg::Explain::FromText;
use Pg::Explain::FromYAML;
use Pg::Explain::FromJSON;
use Pg::Explain::FromXML;

=head1 NAME

Pg::Explain - Object approach at reading explain analyze output

=head1 VERSION

Version 2.9

=cut

our $VERSION = '2.9';

=head1 SYNOPSIS

Quick summary of what the module does.

Perhaps a little code snippet.

    use Pg::Explain;

    my $explain = Pg::Explain->new('source_file' => 'some_file.out');
    ...

    my $explain = Pg::Explain->new(
        'source' => 'Seq Scan on tenk1  (cost=0.00..333.00 rows=10000 width=148)'
    );
    ...


=head1 FUNCTIONS

=head2 source_format

What is the detected format of source plan. One of: TEXT, JSON, YAML, OR XML.

=head2 planning_time

How much time PostgreSQL spent planning the query. In milliseconds.

=head2 total_buffers

All buffers used by query - for planning and execution. Mathematically: sum of planning_buffers and top_level->buffers.

=head2 planning_buffers

How much buffers PostgreSQL used for planning. Either undef or object of Pg::Explain::Buffers class.

=head2 execution_time

How much time PostgreSQL spent executing the query. In milliseconds.

=head2 total_runtime

How much time PostgreSQL spent working on this query. This was part of EXPLAIN OUTPUT only for PostgreSQL 9.3 or older.

=head2 trigger_times

Information about triggers that were called during execution of this query. Array of hashes, where each hash can contains:

=over

=item * name - name of the trigger

=item * calls - how many times it was called

=item * time - total time spent in all executions of this trigger

=back

=head2 jit

Contains information about JIT timings, as object of Pg::Explain::JIT class.

If there was no JIT info, it will return undef.

=head2 query

What query this explain is for. This is available only for auto-explain plans. If not available, it will be undef.

=head2 settings

If explain contains information about specific settings that were changed in Pg, this hashref will contain it.

If there are none - if will be undef.

=cut

sub source_format    { my $self = shift; $self->{ 'source_format' }    = $_[ 0 ] if 0 < scalar @_; return $self->{ 'source_format' }; }
sub planning_time    { my $self = shift; $self->{ 'planning_time' }    = $_[ 0 ] if 0 < scalar @_; return $self->{ 'planning_time' }; }
sub planning_buffers { my $self = shift; $self->{ 'planning_buffers' } = $_[ 0 ] if 0 < scalar @_; return $self->{ 'planning_buffers' }; }
sub execution_time   { my $self = shift; $self->{ 'execution_time' }   = $_[ 0 ] if 0 < scalar @_; return $self->{ 'execution_time' }; }
sub total_runtime    { my $self = shift; $self->{ 'total_runtime' }    = $_[ 0 ] if 0 < scalar @_; return $self->{ 'total_runtime' }; }
sub trigger_times    { my $self = shift; $self->{ 'trigger_times' }    = $_[ 0 ] if 0 < scalar @_; return $self->{ 'trigger_times' }; }
sub jit              { my $self = shift; $self->{ 'jit' }              = $_[ 0 ] if 0 < scalar @_; return $self->{ 'jit' }; }
sub query            { my $self = shift; $self->{ 'query' }            = $_[ 0 ] if 0 < scalar @_; return $self->{ 'query' }; }
sub settings         { my $self = shift; $self->{ 'settings' }         = $_[ 0 ] if 0 < scalar @_; return $self->{ 'settings' }; }

sub total_buffers {
    my $self = shift;
    if ( $self->top_node->buffers ) {
        return $self->top_node->buffers + $self->planning_buffers if $self->planning_buffers;
        return $self->top_node->buffers;
    }
    return $self->planning_buffers if $self->planning_buffers;
    return;
}

=head2 add_trigger_time

Adds new information about trigger time.

It will be available at $node->trigger_times (returns arrayref)

=cut

sub add_trigger_time {
    my $self = shift;
    if ( $self->trigger_times ) {
        push @{ $self->trigger_times }, @_;
    }
    else {
        $self->trigger_times( [ @_ ] );
    }
    return;
}

=head2 runtime

How long did the query run. Tries to get the value from various sources (total_runtime, execution_time, or top_node->actual_time_last).

=cut

sub runtime {
    my $self = shift;

    return $self->total_runtime // $self->execution_time // $self->top_node->actual_time_last;
}

=head2 node

Returns node with given id from current explain.

If there is second argument present, and it's Pg::Explain::Node object, it sets internal cache for this id and this node.

=cut

sub node {
    my $self = shift;
    my $id   = shift;
    return unless defined $id;
    my $node = shift;
    $self->{ 'node_by_id' }->{ $id } = $node if defined $node;
    return $self->{ 'node_by_id' }->{ $id };
}

=head2 source

Returns original source (text version) of explain.

=cut

sub source {
    return shift->{ 'source' };
}

=head2 source_filtered

Returns filtered source explain.

Currently there are only two filters:

=over

=item * remove quotes added by pgAdmin3

=item * remove + character at the end of line, added by default psql config.

=back

=cut

sub source_filtered {
    my $self = shift;

    my $filtered = '';

    # use default variable, to avoid having to type variable name in all regexps below
    for ( split /\r?\n/, $self->source ) {

        # Remove separator lines from various types of borders
        next if /^\+-+\+\z/;
        next if /^[-─═]+\z/;
        next if /^(?:├|╟|╠|╞)[─═]+(?:┤|╢|╣|╡)\z/;

        # Remove more horizontal lines
        next if /^\+-+\+\z/;
        next if /^└─+┘\z/;
        next if /^╚═+╝\z/;
        next if /^┌─+┐\z/;
        next if /^╔═+╗\z/;

        # Remove frames around, handles |, ║, │
        s/^(\||║|│)(.*)\1\z/$2/;

        # Remove quotes around lines, both ' and "
        s/^(["'])(.*)\1\z/$2/;

        # Remove "+" line continuations
        s/\s*\+\z//;

        # Remove "↵" line continuations
        s/\s*↵\z//;

        # Remove "query plan" header
        next if /^\s*QUERY PLAN\s*\z/;

        # Remove rowcount
        next if /^\(\d+ rows?\)\z/;

        # Accumulate filtered source
        $filtered .= $_ . "\n";
    }

    return $filtered;
}

=head2 new

Object constructor.

Takes one of (only one!) (source, source_file) parameters, and either parses it from given source, or first reads given file.

=cut

sub new {
    my $class = shift;
    my $self  = bless {}, $class;
    my %args;
    if ( 0 == scalar @_ ) {
        croak( 'One of (source, source_file) parameters has to be provided)' );
    }
    if ( 1 == scalar @_ ) {
        if ( 'HASH' eq ref $_[ 0 ] ) {
            %args = @{ $_[ 0 ] };
        }
        else {
            croak( 'One of (source, source_file) parameters has to be provided)' );
        }
    }
    elsif ( 1 == ( scalar( @_ ) % 2 ) ) {
        croak( 'One of (source, source_file) parameters has to be provided)' );
    }
    else {
        %args = @_;
    }

    if ( $args{ 'source_file' } ) {
        croak( 'Only one of (source, source_file) parameters has to be provided)' ) if $args{ 'source' };
        $self->{ 'source_file' } = $args{ 'source_file' };
        $self->_read_source_from_file();
    }
    elsif ( $args{ 'source' } ) {
        if ( Encode::is_utf8( $args{ 'source' } ) ) {
            $self->{ 'source' } = $args{ 'source' };
        }
        else {
            $self->{ 'source' } = decode( 'UTF-8', $args{ 'source' } );
        }
    }
    else {
        croak( 'One of (source, source_file) parameters has to be provided)' );
    }

    # Initialize jit to undef
    $self->{ 'jit' } = undef;

    # Initialize node_by_id hash to empty
    $self->{ 'node_by_id' } = {};

    return $self;
}

=head2 top_node

This method returns the top node of parsed plan.

For example - in this plan:

                           QUERY PLAN
 --------------------------------------------------------------
  Limit  (cost=0.00..0.01 rows=1 width=4)
    ->  Seq Scan on test  (cost=0.00..14.00 rows=1000 width=4)

top_node is Pg::Explain::Node element with type set to 'Limit'.

Generally every output of plans should start with ->top_node(), and descend
recursively in it, using subplans(), initplans() and sub_nodes() methods.

=cut

sub top_node {
    my $self = shift;
    $self->parse_source() unless $self->{ 'top_node' };
    return $self->{ 'top_node' };
}

=head2 parse_source

Internally (from ->BUILD()) called function which checks which parser to use
(text, json, xml, yaml), runs appropriate function, and stores top level
node in $self->top_node.

=cut

sub parse_source {
    my $self = shift;

    my $source = $self->source_filtered;

    my $parser;

    if ( $source =~ m{^\s*<explain xmlns="http://www.postgresql.org/2009/explain">}m ) {

        # Format used by both explain command and autoexplain module
        $self->{ 'source_format' } = 'XML';
        $parser = Pg::Explain::FromXML->new();
    }
    elsif ( $source =~ m{ ^ \s* \[ \s* \{ \s* "Plan" \s* : \s* \{ }xms ) {

        # Format used by explain command
        $self->{ 'source_format' } = 'JSON';
        $parser = Pg::Explain::FromJSON->new();
    }
    elsif ( $source =~ m{ ^ \s* \{ \s* "Query \s+ Text" \s* : \s* ".*", \s* "Plan" \s* : \s* \{ .* \} \s* \z }xms ) {

        # Format used by autoexplain module
        $self->{ 'source_format' } = 'JSON';
        $parser = Pg::Explain::FromJSON->new();
    }
    elsif ( $source =~ m{ ^ \s* - \s+ Plan: \s* \n }xms ) {

        # Format used by explain command
        $self->{ 'source_format' } = 'YAML';
        $parser = Pg::Explain::FromYAML->new();
    }
    elsif ( $source =~ m{ ^ \s* Query \s+ Text: \s+ ".*" \s+ Plan: \s* \n }xms ) {

        # Format used by autoexplain module
        $self->{ 'source_format' } = 'YAML';
        $parser = Pg::Explain::FromYAML->new();
    }
    else {
        # Format used by both explain command and autoexplain module
        $self->{ 'source_format' } = 'TEXT';
        $parser = Pg::Explain::FromText->new();
    }

    $parser->explain( $self );

    $self->{ 'top_node' } = $parser->parse_source( $source );

    $self->check_for_parallelism();

    $self->check_for_exclusive_time_fixes();

    return;
}

=head2 check_for_exclusive_time_fixes

Certain types of nodes (CTE Scans, and InitPlans) can cause issues with "naive" calculations of node exclusive time.

To fix that whole tree will be scanned, and, if neccessary, node->exclusive_fix will be modified.

=cut

sub check_for_exclusive_time_fixes {
    my $self = shift;
    $self->check_for_exclusive_time_fixes_cte();
    $self->check_for_exclusive_time_fixes_init();
}

=head2 check_for_exclusive_time_fixes_cte

Modifies node->exclusive_fix according to times that were used by CTEs.

=cut

sub check_for_exclusive_time_fixes_cte {
    my $self = shift;

    # Safeguard against endless loop in some edge cases.
    return unless defined $self->{ 'top_node' };

    # There is no point in checking if the plan is not analyzed.
    return unless $self->top_node->is_analyzed;

    # Find nodes that have any ctes in them
    my @nodes_with_cte = grep { $_->ctes && 0 < scalar keys %{ $_->ctes } } ( $self->top_node, $self->top_node->all_recursive_subnodes );

    # For each node with cte in it...
    for my $node ( @nodes_with_cte ) {

        # Find all nodes that are 'CTE Scan' - from given node, and all of its subnodes (recursively)
        my @cte_scans = grep { $_->type eq 'CTE Scan' } ( $node, $node->all_recursive_subnodes );
        next if 0 == scalar @cte_scans;

        # Iterate over defined ctes
        while ( my ( $cte_name, $cte_node ) = each %{ $node->ctes } ) {

            # Find all CTE Scans that were scanning current CTE
            my @matching_cte_scans = grep { $_->scan_on->{ 'cte_name' } eq $cte_name } @cte_scans;
            next if 0 == scalar @matching_cte_scans;

            # How much time did Pg spend in given CTE itself
            my $cte_total_time = $cte_node->total_inclusive_time;

            # How much time did all the CTE Scans used
            my $total_time_of_scans = sum( map { $_->total_inclusive_time // 0 } @matching_cte_scans );

            # Don't fail on divide by 0, and don't warn on undef
            next unless $total_time_of_scans;
            next unless $cte_total_time;

            # Subtract exclusive time proportionally.
            for my $scan ( grep { $_->total_inclusive_time } @matching_cte_scans ) {
                $scan->exclusive_fix( $scan->exclusive_fix - ( $scan->total_inclusive_time / $total_time_of_scans ) * $cte_total_time );
            }
        }
    }
    return;
}

=head2 check_for_exclusive_time_fixes_init

Modifies node->exclusive_fix according to times that were used by InitScans.

=cut

sub check_for_exclusive_time_fixes_init {
    my $self = shift;

    # Safeguard against endless loop in some edge cases.
    return unless defined $self->{ 'top_node' };

    # There is no point in checking if the plan is not analyzed.
    return unless $self->top_node->is_analyzed;

    # Find nodes that have any init plans in them
    my @nodes_with_init = grep { $_->initplans && 0 < scalar @{ $_->initplans } } ( $self->top_node, $self->top_node->all_recursive_subnodes );

    # Check them all, one by one, to build "init-plan-visibility" info
    for my $parent ( @nodes_with_init ) {

        # Nodes that see what initplan returned even if they don't refer to returned $*
        my @all_implicits      = map { $_->id } ( $parent, $parent->all_recursive_subnodes );
        my %skip_self_implicit = ();

        # Scan all initplans
        for my $idx ( 0 .. $#{ $parent->initplans } ) {
            my $initnode = $parent->initplans->[ $idx ];

            # There is no point in adjusting things for no-time.
            next unless $initnode->total_inclusive_time;

            # Place to store implicit and explicit nodes
            my @implicitnodes = ();
            my @explicitnodes = ();

            my $explicit_re;

            # If there is metainfo, we can build regexp to find nodes explicitly using this init
            if ( $parent->initplans_metainfo->[ $idx ] ) {

                # List of $* variables that this initplan returns
                my $returns_string  = $parent->initplans_metainfo->[ $idx ]->{ 'returns' };
                my @returns_numbers = ();
                for my $element ( split /,/, $returns_string ) {
                    push @returns_numbers, $element if $element =~ s/\A\$(\d+)\z/$1/;
                }
                my $returns = join( '|', @returns_numbers );

                # Regular expression to check in extra-info for nodes.
                $explicit_re = qr{\$(?:${returns})(?!\d)};
            }

            # Add current node, and it's kids to skip list
            for my $skip_node ( $initnode, $initnode->all_recursive_subnodes ) {
                $skip_self_implicit{ $skip_node->id } = 1;
            }

            # Iterate over all nodes that could have used data from this initplan
            for my $user_id ( grep { !$skip_self_implicit{ $_ } } @all_implicits ) {

                my $user = $self->node( $user_id );

                # Add node to implicit ones,always
                push @implicitnodes, $user;

                # If there is explicit_re, try to find what is using this int explicitly
                next unless $explicit_re;
                next unless $user->extra_info;
                my $full_extra_info = join( "\n", @{ $user->extra_info } );
                push @explicitnodes, $user if $full_extra_info =~ $explicit_re;
            }

            # Total times
            my $implicittime = sum( map { $_->total_exclusive_time // 0 } @implicitnodes ) // 0;
            my $explicittime = sum( map { $_->total_exclusive_time // 0 } @explicitnodes ) // 0;

            # Where to adjusct exclusive time
            my @adjust_these = ();
            my $ratio;
            if (   ( 0 < scalar @explicitnodes )
                && ( $explicittime > $initnode->total_inclusive_time ) )
            {
                @adjust_these = @explicitnodes;
                $ratio        = $initnode->total_inclusive_time / $explicittime;
            }
            elsif ( $implicittime > $initnode->total_inclusive_time ) {
                @adjust_these = @implicitnodes;
                $ratio        = $initnode->total_inclusive_time / $implicittime;
            }

            # Actually adjust exclusive times
            for my $node ( @adjust_these ) {
                next unless $node->total_exclusive_time;
                my $adjust = $ratio * $node->total_exclusive_time;
                $node->exclusive_fix( $node->exclusive_fix - $adjust );
            }
        }
    }

    return;
}

=head2 check_for_parallelism

Handles parallelism by setting "force_loops" if plan is analyzed and there are gather nodes.

Generally, for each 

=cut

sub check_for_parallelism {
    my $self = shift;

    # Safeguard against endless loop in some edge cases.
    return unless defined $self->{ 'top_node' };

    # There is no point in checking if the plan is not analyzed.
    return unless $self->top_node->is_analyzed;

    # @nodes will contain list of nodes to check if they are Gather
    my @nodes = ( [ 1, $self->top_node ] );

    while ( my $node_info = shift @nodes ) {

        my $workers = $node_info->[ 0 ];
        my $node    = $node_info->[ 1 ];

        # Set workers.
        $node->workers( $workers );

        # These sub-nodes don't get workers.
        push @nodes, map { [ $workers, $_ ] } @{ $node->initplans }   if $node->initplans;
        push @nodes, map { [ $workers, $_ ] } @{ $node->subplans }    if $node->subplans;
        push @nodes, map { [ $workers, $_ ] } values %{ $node->ctes } if $node->ctes;

        # If there are workers launched, set it as new workers value for recursive set.
        $workers = 1 + $node->workers_launched if defined $node->workers_launched;

        # These things get new workers
        push @nodes, map { [ $workers, $_ ] } @{ $node->sub_nodes } if $node->sub_nodes;
    }
    return;
}

=head2 _read_source_from_file

Helper function to read source from file.

=cut

sub _read_source_from_file {
    my $self = shift;

    open my $fh, '<', $self->{ 'source_file' };
    local $/ = undef;
    my $content = <$fh>;
    close $fh;

    delete $self->{ 'source_file' };
    $self->{ 'source' } = $content;

    return;
}

=head2 as_text

Returns parsed plan back as plain text format (regenerated from in-memory structure).

This is mostly useful for (future at the moment) anonymizations.

=cut

sub as_text {
    my $self = shift;

    my $textual = $self->top_node->as_text();

    if ( $self->planning_buffers ) {
        $textual .= "Planning:\n";
        my $buf_info = $self->planning_buffers->as_text;
        $buf_info =~ s/^/  /gm;
        $textual .= $buf_info . "\n";

    }
    if ( $self->planning_time ) {
        $textual .= "Planning time: " . $self->planning_time . " ms\n";
    }
    if ( $self->trigger_times ) {
        for my $t ( @{ $self->trigger_times } ) {
            $textual .= sprintf( "Trigger %s: time=%.3f calls=%d\n", $t->{ 'name' }, $t->{ 'time' }, $t->{ 'calls' } );
        }
    }
    if ( $self->jit ) {
        $textual .= $self->jit->as_text();
    }
    if ( $self->execution_time ) {
        $textual .= "Execution time: " . $self->execution_time . " ms\n";
    }
    if ( $self->total_runtime ) {
        $textual .= "Total runtime: " . $self->total_runtime . " ms\n";
    }

    return $textual;
}

=head2 get_struct

Function which returns simple, not blessed, hashref with all information about the explain.

This can be used for debug purposes, or as a base to print information to user.

Output looks like this:

 {
     'top_node'               => {...}
     'planning_time'          => '12.44',
     'planning_buffers'       => {...},
     'execution_time'         => '12.44',
     'total_runtime'          => '12.44',
     'trigger_times'          => [
        { 'name' => ..., 'time' => ..., 'calls' => ... },
        ...
     ],
 }

=cut

sub get_struct {
    my $self  = shift;
    my $reply = {};
    $reply->{ 'top_node' }         = $self->top_node->get_struct;
    $reply->{ 'planning_time' }    = $self->planning_time                if $self->planning_time;
    $reply->{ 'planning_buffers' } = $self->planning_buffers->get_struct if $self->planning_buffers;
    $reply->{ 'execution_time' }   = $self->execution_time               if $self->execution_time;
    $reply->{ 'total_runtime' }    = $self->total_runtime                if $self->total_runtime;
    $reply->{ 'trigger_times' }    = clone( $self->trigger_times )       if $self->trigger_times;
    $reply->{ 'query' }            = $self->query                        if $self->query;
    $reply->{ 'settings' }         = $self->settings                     if $self->settings;

    if ( $self->jit ) {
        $reply->{ 'jit' }                  = {};
        $reply->{ 'jit' }->{ 'functions' } = $self->jit->functions;
        $reply->{ 'jit' }->{ 'options' }   = clone( $self->jit->options );
        $reply->{ 'jit' }->{ 'timings' }   = clone( $self->jit->timings );
    }
    return $reply;
}

=head2 anonymize

Used to remove all individual values from the explain, while still retaining
all values that are needed to see what's wrong.

If there are any arguments, these are treated as strings, anonymized using
anonymizer used for plan, and are returned in the same order.

This is mainly useful to anonymize queries.

=cut

sub anonymize {
    my $self       = shift;
    my @extra_args = @_;

    my $anonymizer = $self->{ 'anonymizer' };
    if ( !$anonymizer ) {
        $anonymizer = Pg::Explain::StringAnonymizer->new();
        $self->top_node->anonymize_gathering( $anonymizer );
        $anonymizer->finalize();
        $self->top_node->anonymize_substitute( $anonymizer );
        $self->{ 'anonymizer' } = $anonymizer;
    }

    return if 0 == scalar @extra_args;

    return $anonymizer->anonymize_text( $extra_args[ 0 ] ) if 1 == scalar @extra_args;

    return map { $anonymizer->anonymize_text( $_ ) } @extra_args;
}

=head1 AUTHOR

hubert depesz lubaczewski, C<< <depesz at depesz.com> >>

=head1 BUGS

Please report any bugs or feature requests to C<depesz at depesz.com>.

=head1 SUPPORT

You can find documentation for this module with the perldoc command.

    perldoc Pg::Explain

=head1 COPYRIGHT & LICENSE

Copyright 2008-2023 hubert depesz lubaczewski, all rights reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.


=cut

1;    # End of Pg::Explain