Algorithm-LDA/lib/Algorithm/LDA.pm
# Algorithm::LDA
#
# Perl implementation of an example module
#
# Copyright (c) 2016
#
# Bridget T McInnes, Virginia Commonwealth University
# bmcinnes at vcu.edu
#
# Nicholas Jordan, Virginia Commonwealth University
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to
#
# The Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
=head1 NAME
Algorithm::LDA
=head1 SYNOPSIS
use Algorithm::LDA;
my $lda = new Algorithm::LDA("Data", 5, 100, 100, 0, 10, 0.1, 10, "stoplist.txt");
=head1 DESCRIPTION
Algorithm::LDA is an implementation of Latent Dirichlet Allocation in Algorithm
=cut
package Algorithm::LDA;
use strict;
use 5.006;
use strict;
use warnings FATAL => 'all';
use constant pi => 4*atan2(1, 1);
use constant e => exp(1);
use parent qw/Class::Accessor::Fast/;
use List::Util qw(shuffle sum max);
use List::MoreUtils qw(uniq first_index);
use JSON::XS;
use vars qw($VERSION);
$VERSION = '0.03';
#Used for accessing $self->documents
__PACKAGE__->mk_accessors(qw/documents/);
# $documents - Data directory (TXT files)
# $stop - Stopword list (regex)
# $K - Number of Topics
# $k - $K-1 (for convenience)
# %vocabulary - hashmap containing words and IDs
# @words - array containing all words
# @documents - array of arrays of words in each document
# Doc1 = word1, word2, word3
# Doc2 = word4, word5, word6
# %map - hashmap used for getting word frequencies
# $V - vocabulary size
# $v - $V-1 (for convenience)
# @alpha - array of alpha values (parameter of topic distribution)
# @theta - array of theta values (topic distribution)
# @beta - array of beta values (parameter of word distribution)
# @phi - array of phi values (word distribution)
# $totalDocs - Total Documents (Only used for computing completeness when loading)
# $maxIterations - Maximum Iterations
# $updateCorpus - 1 = Force update documents, 0 = allow loading from JSON
# $threshold - Minimum number of documents a word must appear in
# $numWords - Number of words per topic
# $alpha - Default alpha value
# $documentNum - Number of documents
my $data;
my $docs;
my $stop;
my $K;
my $k;
my %vocabulary;
my @words;
my @documents;
my %map = ();
my $V;
my $v;
my @alpha;
my @theta;
my @beta;
my @phi;
my $totalDocs;
my $maxIterations;
my $updateCorpus;
my $threshold;
my $numWords;
my $alpha;
my $documentNum = 0;
my $self;
sub new
{
my $class = shift;
$self = {
_data => shift,
_numTopics => shift,
_maxIterations => shift,
_totalDocs => shift,
_updateCorpus => shift,
_wordThreshold => shift,
_alpha => shift,
_numWords => shift,
_stop => shift,
docs => [],
document_topic_map => {},
topic_word_map => {},
document_map => {},
topic_map => {},
word_map => {},
};
$docs = $self->{_data};
$data = $self->{_data};
$K = $self->{_numTopics};
$k = $K - 1;
$maxIterations = $self->{_maxIterations};
$totalDocs = $self->{_totalDocs};
$updateCorpus = $self->{_updateCorpus};
$threshold = $self->{_wordThreshold};
$alpha = $self->{_alpha};
$numWords = $self->{_numWords};
$stop = $self->{_stop};
@{$self->{documents}} = ();
bless $self, $class;
init();
return $self;
}
=head3 add
description:
Used to add to array of documents ($self->documents)
input:
%args <- hash containing data
output:
1
example:
while (my $line = <$fh2>) {
my $obj = decode_json($line);
add(%$obj);
}
=cut
#Used to add to array of documents ($self->documents)
#Adds a word with document ID and random topic
sub add
{
my (%args) = @_;
return unless (valid($args{data}));
my $document_id = @{$self->documents};
my @data_list = map {
{ document => $document_id, topic => int(rand($K)), word => $_ }
} @{$args{data}};
for my $data (@data_list)
{
$self->increaseMap($document_id, $data->{topic}, $data->{word});
}
push(@{$self->documents}, \@data_list);
return 1;
}
=head3 init
description:
Initializes alpha, initializes beta, loads documents, starts main loop
input:
None
output:
1
example:
init();
=cut
#Initialization Method
sub init
{
#Load Documents
load();
#Initialize @alpha to default value
$alpha[$_] = $alpha for(0..$k);
#Randomly initialize beta distribution
beta();
#Start Main loop
for my $iter (1..$maxIterations)
{
#Calculate and print percentage completed
my $a = $iter * 100 / $maxIterations;
print "Iteration: $iter | $a% Completed...\n";
#Shuffle Documents
@{$self->documents} = shuffle(@{$self->documents});
#Loop through each word in each document and sample its topic
for my $document (@{$self->documents})
{
print STDERR "Processing Document $document\n";
for my $data (@$document)
{
$self->decreaseMap($data->{document}, $data->{topic}, $data->{word});
$data->{topic} = $self->sample_topic($data->{document}, $data->{word});
$self->increaseMap($data->{document}, $data->{topic}, $data->{word});
}
}
#print results for this iteration
printResults($iter);
}
return 1;
}
=head3 printResults
description:
Prints words in each topic, topics in each document, phi values,
and theta values to text files in the 'Results/$data' directory
input:
None
output:
None
example:
printResults();
=cut
#Creates four files in "Results/$data"
# Documents.$data.txt - topic distribution for each document
# Topics.$data.txt - word distribution for each topic
# phi.$data.txt - Phi values per topic
# theta.$data.txt Theta values per document
sub printResults
{
print STDERR "Printing Results\n";
my $iter = shift;
if(! (-e "Results")) {
system "mkdir Results";
}
if(! (-e "Results/$iter") ) {
system "mkdir Results/$iter";
}
my $file = "Results/" . $iter . "/Topics." . $iter . ".txt";
open(my $fh, '>', $file) or die "Could not open file '$file' $!";
my $file2 = "Results/" . $iter . "/Documents." . $iter . ".txt";
open(my $fh2, '>', $file2) or die "Could not open file '$file2' $!";
for my $topic (0 .. $k)
{
my $words_on_topic = wordsPerTopic(topic => $topic);
splice(@$words_on_topic, $numWords);
print $fh join("\n", "Topic[$topic]:\n", map { "$_->{word}\t$_->{prob}"; } @$words_on_topic)."\n\n\n";
}
for my $doc (0 .. $#documents) {
my $topics_on_document= topicsPerDocument(document => $doc);
splice(@$topics_on_document, $numWords);
print $fh2 join("\n", "Document[$doc]:\n", map { "$_->{topic}\t$_->{prob}"; } @$topics_on_document)."\n\n\n";
}
close($fh);
close($fh2);
my $file3 = "Results/" . $iter . "/phi." . $iter . ".txt";
open(my $fh3, '>', $file3) or die "Could not open file '$file3' $!";
my $file4 = "Results/" . $iter . "/theta." . $iter . ".txt";
open(my $fh4, '>', $file4) or die "Could not open file '$file4' $!";
for my $i (0..$k)
{
print $fh3 "$i : " . join(", ", @{$phi[$i]}) . "\n";
}
for my $i (0..scalar @{$self->documents} - 1)
{
print $fh4 "$i : " . join(", ", @{$theta[$i]}) . "\n";
}
close($fh3);
close($fh4);
}
=head3 load
description:
Loads documents from text files (in "data/$data") or JSON file (in "Documents")
input:
None
output:
None
example:
load();
=cut
#Loads document data from files or JSON
sub load
{
#open data directory
opendir(DH, "$docs");
my @files = grep { $_ ne '.' and $_ ne '..' } readdir DH;
closedir(DH);
#array holding string of words in each document
my @documents1 = ();
#stopword regex
my $regex = "";
if(defined $stop) {
my $rstop = stop();
$regex = qr/($rstop)/;
}
#Load Files from TXTs
print "Loading Documents from TXTs...\n";
foreach my $filename (@files)
{
print "Loading Document $documentNum ($filename): Corpus ";
print (($documentNum + 1) * 100 / $totalDocs);
print "% completed...\n";
open(FILE, "$docs/$filename")
|| die "Could not open file '$docs/$filename' $!";
#Load file into single string, remove stopwords, and split by whitespace into array
my $document = do { local $/; lc(<FILE>)};
$document =~ s/($regex)//g;
my @temp = split(/\s+/, $document);
#remove all special characters and add to @words and @documents1
for my $i (0..scalar @temp - 1)
{
$temp[$i] = removeSpecialChars($temp[$i]);
}
push(@words, @temp);
$documents1[$documentNum] = join(" ", @temp);
$documentNum++;
}
chomp @words;
@words = grep {$_ if $_ } @words;
%vocabulary = map { $words[$_]=>$_ } (0..$#words);
$V = scalar keys %vocabulary;
$v = $V-1;
#Loop through @documents1, remove special characters and populate @documents
for my $text (@documents1)
{
my @ws = split(/\s+/, $text);
chomp @ws;
@ws = map { removeSpecialChars($_) } @ws;
@ws = grep { exists $vocabulary{$_} } @ws;
@ws = uniq(@ws);
push @documents, \@ws;
}
print "Vocabulary (Uncleaned): $V\n";
#Get word frequencies
for my $d (0..$#documents)
{
for my $wrd (@{$documents[$d]})
{
next unless exists $vocabulary{$wrd};
$map{$wrd}=0 unless exists $map{$wrd};
$map{$wrd}++;
}
}
#Remove words that appear in more than half of the corpus, and less than $threshold documents
#Also remove words of less than three letters
my $D = @documents;
for my $wd (0..$#words)
{
my $times = $map{$words[$wd]};
my $test = ($times > 0.5*$D || $times<=$threshold || length($words[$wd]) <=3);
if($test)
{
$words[$wd]=0;
}
}
#Repopulate %vocabulary with cleaned words
@words = grep { $_ } (@words);
@words = uniq(@words);
%vocabulary = map { $words[$_] => $_ } (0..$#words);
$V = scalar keys %vocabulary;
$v = $V-1;
print "Vocabulary (Cleaned): $V\n";
#Convert words to hashmap (for use of "exists") and remove unclean
# words from documents array
my %h;
@h{@words} = ();
for my $i (0..$#documents)
{
@{$documents[$i]} = grep{exists $h{$_}} @{$documents[$i]};
}
open(my $fh, '>', "JSON") or die "Could not open file 'JSON' $!";
foreach my $i (@documents)
{
print $fh "{\"data\":[\"" . join("\", \"", @{$i})."\"]}\n";
}
close $fh;
open(my $fh2, '<', "JSON") or die "Could not open file 'JSON' $!";
while (my $line = <$fh2>) {
my $obj = decode_json($line);
add(%$obj);
}
close $fh2;
}
=head3 wordsPerTopic
description:
Creates an array of words in each topic
input:
%args -> hash containing topic
output:
@words -> Array containing words and probabilities (phi value) for $args{topic}
example:
my $words_on_topic = wordsPerTopic(topic => $topic);
=cut
sub wordsPerTopic
{
my (%args) = @_;
return unless (defined $args{topic});
my @words = sort { $b->{prob} <=> $a->{prob} } map {
{ word => $_, prob => $self->computePhi($args{topic}, $_) }
} keys %{$self->{word_map}};
return \@words;
}
=head3 topicsPerDocument
description:
Creates an array of topics in each document
input:
%args -> hash containing document
output:
@topics -> Array containing topics and probabilities (theta value) for $args{document}
example:
my $topics_on_document= topicsPerDocument(document => $doc);
=cut
sub topicsPerDocument
{
my (%args) = @_;
return unless (defined $args{document});
my @topics = sort { $b->{prob} <=> $a->{prob} } map {
{ topic => $_, prob => $self->computeTheta($args{document}, $_) }
} keys %{$self->{topic_map}};
return \@topics;
}
=head3 sample_topic
description:
Uses Gibbs Sampling to determine a topic given a document and word
input:
$document -> ID of document word is in
$word -> word that is to be evaluated
output:
$topic -> topic ID
$k -> last topic if topic can't be found
example:
my $topics_on_document= topicsPerDocument(document => $doc);
=cut
sub sample_topic
{
my ($self, $document, $word) = @_;
my @dists;
my $dist = 0.0;
for my $topic (0 .. $k)
{
$dist += ($self->computePhi($topic, $word) * $self->computeTheta($document, $topic));
$phi[$topic][first_index { $_ eq $word } @words] = $self->computePhi($topic, $word);
$theta[$document][$topic] = $self->computeTheta($document, $topic);
push(@dists, $dist);
}
my $sampled_dist = rand($dist);
for my $topic (0 .. $k)
{
return $topic if ($sampled_dist < $dists[$topic]);
}
return ($k);
}
=head3 computePhi
description:
Computes the expected phi value for a word given a topic ID
input:
$topic -> ID of topic (iteration 0..$k)
$word -> word that is to be evaluated
output:
Phi value
example:
$dist += ($self->computePhi($topic, $word) * $self->computeTheta($document, $topic));
=cut
sub computePhi
{
my ($self, $topic, $word) = @_;
$self->{topic_word_map}{$topic}{$word} //= 0.0;
$self->{topic_map}{$topic} //= 0.0;
#print $vocabulary{$word} . " | ";
#print first_index { $_ eq $word } @words;
#print "\n";
return ($self->{topic_word_map}{$topic}{$word} + $beta[$topic][first_index { $_ eq $word } @words]) /
($self->{topic_map}{$topic} + $V * $beta[$topic][first_index { $_ eq $word } @words]);
}
=head3 computeTheta
description:
Computes the expected theta value for a topic given a document ID
input:
$document -> ID of document
$topic -> ID of topic (iteration 0..$k)
output:
Theta value
example:
$dist += ($self->computePhi($topic, $word) * $self->computeTheta($document, $topic));
=cut
sub computeTheta
{
my ($self, $document, $topic) = @_;
$self->{document_topic_map}{$document}{$topic} //= 0.0;
$self->{document_map}{$document} //= 0.0;
return ($self->{document_topic_map}{$document}{$topic} + $alpha[$topic]) /
($self->{document_map}{$document} + $K * $alpha[$topic]);
}
=head3 increaseMap
description:
Increases the values of all of the hashmaps
input:
$document -> ID of document
$topic -> ID of topic
$word -> word in document $document
output:
None
example:
$self->increaseMap($data->{document}, $data->{topic}, $data->{word});
=cut
sub increaseMap
{
my ($self, $document, $topic, $word) = @_;
$self->{document_topic_map}{$document}{$topic}++;
$self->{topic_word_map}{$topic}{$word}++;
$self->{document_map}{$document}++;
$self->{topic_map}{$topic}++;
$self->{word_map}{$word}++;
}
=head3 decreaseMap
description:
Decreases the values of all of the hashmaps
input:
$document -> ID of document
$topic -> ID of topic
$word -> word in document $document
output:
None
example:
$self->decreaseMap($data->{document}, $data->{topic}, $data->{word});
=cut
sub decreaseMap
{
my ($self, $document, $topic, $word) = @_;
$self->{document_topic_map}{$document}{$topic}--;
$self->{topic_word_map}{$topic}{$word}--;
$self->{document_map}{$document}--;
$self->{topic_map}{$topic}--;
$self->{word_map}{$word}--;
}
=head3 valid
description:
Returns whether or not $data is a valid array (able to be added to the dataset)
input:
$data -> data to be evaluated
output:
Boolean/Integer -> true/1 - $data is an array | false/0 - $data is not an array;
example:
return unless (valid($args{data}));
=cut
sub valid
{
my ($data) = @_;
return unless ($data);
return (ref($data) eq 'ARRAY') ? 1 : 0;
}
=head3 removeSpecialChars
description:
Removes special characters from a word (non-ascii/non-letter characters)
input:
$word -> word to be cleaned
output:
$newWord -> $word without non-ascii/non-letter characters
example:
@ws = map { removeSpecialChars($_) } @ws;
=cut
sub removeSpecialChars
{
my($word) = @_;
$word =~ s/([^\w\d])+?//g;
$word =~ s/[^[:ascii:]]//g;
my $newWord = lc($word);
return $newWord;
}
=head3 beta
description:
Randomly initializes beta values
input:
None
output:
None
example:
beta();
=cut
sub beta
{
my $e_value = 1.0 / $K;
for my $i (0..$k)
{
for my $n (0..$v)
{
$beta[$i][$n] = $e_value;
}
}
for(1..1000000)
{
my $d = rand()* $e_value;
my $i = int(rand($K));
my $n1 = int(rand($V));
my $n2 = int(rand($V));
$beta[$i][$n1]+=$d;
$beta[$i][$n2]-=$d;
if($beta[$i][$n2] <= 0 || $beta[$i][$n1] >=1)
{
$beta[$i][$n1]-=$d;
$beta[$i][$n2]+=$d;
}
}
}
=head3 stop
description:
Stopword subroutine. Generates a regex to remove words in a stopword list
input:
None
output:
$stop_regex -> regex containing stopwords
example:
my $stop = stop();
my $regex = qr/($stop)/;
=cut
#STOPWORD SUBROUTINE
sub stop
{
my $stop_regex = "";
my $stop_mode = "AND";
open ( STP, $stop ) ||
die ("Couldn't open the stoplist file $stop\n");
while ( <STP> )
{
chomp;
if(/\@stop.mode\s*=\s*(\w+)\s*$/)
{
$stop_mode=$1;
if(!($stop_mode=~/^(AND|and|OR|or)$/))
{
print STDERR "Requested Stop Mode $1 is not supported.\n";
exit;
}
next;
}
# accepting Perl Regexs from Stopfile
s/^\s+//;
s/\s+$//;
#handling a blank lines
if(/^\s*$/) { next; }
#check if a valid Perl Regex
if(!(/^\//))
{
print STDERR "Stop token regular expression <$_> should start with '/'\n";
exit;
}
if(!(/\/$/))
{
print STDERR "Stop token regular expression <$_> should end with '/'\n";
exit;
}
#remove the / s from beginning and end
s/^\///;
s/\/$//;
#form a single big regex
$stop_regex.="(".$_.")|";
}
if(length($stop_regex)<=0)
{
print STDERR "No valid Perl Regular Experssion found in Stop file $stop";
exit;
}
chop $stop_regex;
# making AND a default stop mode
if(!defined $stop_mode)
{
$stop_mode="AND";
}
close STP;
return $stop_regex;
}
1;
__END__
=head1 REFERENCING
If you have a reference paper for this module put it here in bibtex form
=head1 CONTACT US
If you have any trouble installing and using <module name>
please contact us via :
Bridget T. McInnes: btmcinnes at vcu.edu
=head1 SEE ALSO
Additional modules associated with the package
=head1 AUTHORS
Nick Jordan, Virginia Commonwealth University
Bridget McInnes, Virginia Commonwealth University
=head1 COPYRIGHT AND LICENSE
Copyright 2016 by Bridget McInnes, Nicholas Jordan
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any later
version.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to
The Free Software Foundation, Inc.,
59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA.
=cut