Group
Extension

Lucy/modules/analysis/snowstem/devel/update_snowstem.pl

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

use strict;
use warnings;
use File::Spec::Functions qw( catfile catdir no_upwards );
use File::Copy qw( copy );
use Cwd qw( getcwd );
use JSON::XS;

# Require exactly two arguments: the Snowball SVN checkout to build from and
# the Lucy snowstem directory that receives the generated files.
if ( @ARGV != 2 ) {
    die "Usage: perl update_snowstem.pl SNOWBALL_SVN_CO LUCY_SNOWSTEM_DIR";
}

# Validate both directories up front so that a bad destination is reported
# before the (slow) checkout update and build are run.
my ( $snow_co_dir, $dest_dir ) = @ARGV;
die("Not a directory: '$snow_co_dir'") unless -d $snow_co_dir;
die("Not a directory: '$dest_dir'")    unless -d $dest_dir;

# Pin the checkout to a fixed revision.  The whole wait status is compared
# against zero rather than only the exit code in its high byte, so that a
# child killed by a signal (reported in the low bits) also counts as failure.
my $retval = system( "svn", "update", "-r", "541", $snow_co_dir );
die "svn update failed" if $retval != 0;

# Build the libstemmer_c distribution from inside the checkout, then return to
# the original working directory so that relative paths keep resolving.
my $oldpwd = getcwd();
my $snow_build_dir = catdir( $snow_co_dir, 'snowball' );
chdir($snow_build_dir) or die $!;
$retval = system( "make", "dist_libstemmer_c" );
die "'make dist_libstemmer_c' failed" if $retval != 0;
chdir($oldpwd) or die $!;

# Copy only UTF-8 Stemmer files.  Keep directory structure intact so that
# compilation succeeds.
copy_dir_contents( 'src_c', qr/UTF/ );
copy_dir_contents('include');
copy_dir_contents('runtime');

# The dot is escaped so that only names really ending in "utf8.c" or "utf8.h"
# are selected.
copy_dir_contents( 'libstemmer', qr/utf8\.[ch]$/ );

# Add include guard to libstemmer.h.
#
# The header is slurped in full, then rewritten in place with the original
# content wrapped between the H_LIBSTEMMER guard lines.
my $libstemmer_h_path
    = catfile( $dest_dir, qw( source include libstemmer.h ) );
open( my $libstemmer_h_fh, '<', $libstemmer_h_path )
    or die "Can't open '$libstemmer_h_path': $!";
my $libstemmer_h_content = do { local $/; <$libstemmer_h_fh> };
close $libstemmer_h_fh or die $!;
open( $libstemmer_h_fh, '>', $libstemmer_h_path )
    or die "Can't open '$libstemmer_h_path': $!";
print {$libstemmer_h_fh} <<END_STUFF or die "Can't write to '$libstemmer_h_path': $!";
#ifndef H_LIBSTEMMER
#define H_LIBSTEMMER

$libstemmer_h_content

#endif /* H_LIBSTEMMER */

END_STUFF

# Buffered write errors only surface at close time, so check it explicitly.
close $libstemmer_h_fh or die "Can't close '$libstemmer_h_path': $!";

# Write tests.json file.  Only include 10 sample tests for each language to
# save space -- we assume that Snowball is thoroughly exercising its tests
# elsewhere.
#
# Keys are the codes used in tests.json; values are the names of the
# per-language directories under the checkout's "data" directory.
my %languages = (
    en => 'english',
    da => 'danish',
    de => 'german',
    es => 'spanish',
    fi => 'finnish',
    fr => 'french',
    it => 'italian',
    nl => 'dutch',
    hu => 'hungarian',
    no => 'norwegian',
    pt => 'portuguese',
    ro => 'romanian',
    ru => 'russian',
    sv => 'swedish',
    tr => 'turkish',
);
my %tests;
for my $iso ( sort keys %languages ) {
    my $language   = $languages{$iso};
    my $words_path = catfile( $snow_co_dir, 'data', $language, 'voc.txt' );
    my $stems_path = catfile( $snow_co_dir, 'data', $language, 'output.txt' );
    open( my $words_fh, '<:encoding(UTF-8)', $words_path )
        or die "Can't open '$words_path': $!";
    open( my $stems_fh, '<:encoding(UTF-8)', $stems_path )
        or die "Can't open '$stems_path': $!";

    # Line N of the words file is paired with line N of the stems file.
    my @all_words = <$words_fh>;
    my @all_stems = <$stems_fh>;
    close $words_fh or die "Can't close '$words_path': $!";
    close $stems_fh or die "Can't close '$stems_path': $!";

    my $total = scalar @all_words;
    die "No words found in '$words_path'" unless $total;

    # Take evenly spaced samples.  A vocabulary with fewer than 10 entries is
    # used in full instead of sampling the first entry over and over (which
    # is what an interval of zero would do).
    my $sample_count = $total < 10 ? $total : 10;
    my $interval     = int( $total / $sample_count );

    my @some_words;
    my @some_stems;
    for my $i ( 0 .. $sample_count - 1 ) {
        my $index   = $i * $interval;
        my $line_no = $index + 1;
        my $word    = $all_words[$index];
        my $stem    = $all_stems[$index];
        die "'$stems_path' has no line $line_no to match '$words_path'"
            unless defined $stem;
        chomp($word);
        chomp($stem);
        die "Empty word or stem at line $line_no of '$words_path'"
            unless length($word) && length($stem);
        push @some_words, $word;
        push @some_stems, $stem;
    }
    $tests{$iso}{words} = \@some_words;
    $tests{$iso}{stems} = \@some_stems;
}

# The encoder returns a character string (the utf8 option is not enabled); the
# :encoding(UTF-8) layer on the output handle performs the byte encoding.
my $json_encoder    = JSON::XS->new->pretty(1)->canonical(1);
my $json            = $json_encoder->encode( \%tests );
my $tests_json_path = catfile( $dest_dir, 'source', 'test', 'tests.json' );
open( my $json_fh, '>:encoding(UTF-8)', $tests_json_path )
    or die "Can't open '$tests_json_path': $!";
print {$json_fh} $json or die "Can't write to '$tests_json_path': $!";
close $json_fh or die "Can't close '$tests_json_path': $!";

# Write separate README file describing tests.json's contents, since JSON is a
# commentless format.
my $readme_path = catfile( $dest_dir, 'source', 'test', 'README' );
open( my $readme_fh, '>:encoding(UTF-8)', $readme_path )
    or die "Can't open '$readme_path': $!";
print {$readme_fh} <<'END_STUFF' or die "Can't write to '$readme_path': $!";
The file 'tests.json' and this file were autogenerated by update_snowstem.pl.
'tests.json' contains materials from the Snowball project.  See the LICENSE
and NOTICE files for more information.
END_STUFF

# Buffered write errors only surface at close time, so check it explicitly.
close $readme_fh or die "Can't close '$readme_path': $!";

# Copy the entries of one subdirectory of the Snowball build directory into
# the subdirectory of the same name under "$dest_dir/source".
#
# Arguments:
#   $dir_name - name of the subdirectory, resolved against both
#               $snow_build_dir and "$dest_dir/source".
#   $pattern  - optional regex; when given, only entries whose names match
#               it are copied.
#
# Entries whose names contain ".svn" are always skipped, as are "." and "..".
# Dies if the source directory can't be opened, if the destination directory
# does not already exist, or if any single copy fails.
sub copy_dir_contents {
    my ( $dir_name, $pattern ) = @_;

    my $source_dir = catdir( $snow_build_dir, $dir_name );
    my $target_dir = catdir( $dest_dir, 'source', $dir_name );

    opendir( my $dir_handle, $source_dir )
        or die "Can't opendir '$source_dir': $!";
    -d $target_dir
        or die "Not a directory: '$target_dir'";

    # Decide what to copy first, then perform the copies.
    my @wanted = grep { $_ !~ /\.svn/ }
        grep { !$pattern || $_ =~ $pattern }
        no_upwards( readdir $dir_handle );

    for my $entry (@wanted) {
        my $source_path = catfile( $source_dir, $entry );
        my $target_path = catfile( $target_dir, $entry );
        copy( $source_path, $target_path )
            or die "Can't copy '$source_path' to '$target_path': $!";
    }

    closedir $dir_handle or die $!;
}



Powered by Groonga
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.