#!/usr/bin/perl
#
# Print a "term,frequency" CSV line for every candidate term found in a
# corpus file.
#
# Usage: <script> <corpus> [<dbfile> <pkgid>]
#
#   corpus  file handed to Text::WordGrams for unigram counting
#   dbfile  optional SQLite database containing an `idtable` table
#   pkgid   optional package id; the `idname` values stored for it in
#           `idtable` are excluded from the output
#
# A token is kept only if it is not an English stop word, is not one of the
# package's identifiers, occurs more than twice, is made only of word
# characters and hyphens (and not solely of '-'/'_'), and is at least three
# characters long.  Surviving tokens are lower-cased and their counts merged.

use warnings;
use strict;

use Text::WordGrams;
use Lingua::StopWords qw( getStopWords );
use DBI;
use DBD::SQLite;
use Conclave::Utils::ISplitter;
use Data::Dumper;

my $corpus = shift;
die "Usage: $0 <corpus> [<dbfile> <pkgid>]\n" unless $corpus;
my $dbfile = shift;
my $pkgid  = shift;

# Identifiers that belong to the package; only filled in when both the
# database file and the package id were supplied on the command line.
my %is_program_id;
if ($dbfile and $pkgid) {
    # RaiseError turns connect/query failures into a descriptive die instead
    # of a later "Can't call method ... on an undefined value".
    my $dbh = DBI->connect("dbi:SQLite:dbname=$dbfile", '', '',
        { RaiseError => 1 });
    my $idnames = $dbh->selectcol_arrayref(
        'SELECT idname FROM idtable WHERE pkgid=?', undef, $pkgid);
    $is_program_id{$_}++ foreach @$idnames;
    $dbh->disconnect;
}

# compute word frequency
my $data      = word_grams_from_files({ size => 1 }, $corpus);
my $stopwords = getStopWords('en');

# NOTE(review): the splitter is only referenced by the disabled
# identifier-splitting code below; construction is kept as in the original.
my $splitter = Conclave::Utils::ISplitter->new('programming');

# Filter the raw tokens and merge case variants.  The frequency threshold is
# applied to each case-sensitive token *before* the counts are merged under
# the lower-cased key.
my %freq_of;
TOKEN:
foreach my $token (keys %$data) {
    my $count = $data->{$token};

    # remove stop words
    next TOKEN if $stopwords->{$token} or $stopwords->{ lc $token };

    # remove program ids
    next TOKEN if exists $is_program_id{$token};

    # remove rare tokens
    next TOKEN if $count <= 2;

    # remove things that are not plain words
    next TOKEN unless $token =~ m/^[\w\-]+$/;
    next TOKEN if $token =~ m/^[\-_]+$/;

    # remove length < 3
    next TOKEN if length($token) < 3;

    #my @l = $splitter->split($token);
    #foreach (@l) {
    #    my $t = $_->{t};
    #    $freq_of{lc $t} += $count if (length($t) >= 3);
    #}
    $freq_of{ lc $token } += $count;
}

# print terms (sorted so the output is reproducible between runs)
print "$_,$freq_of{$_}\n" foreach sort keys %freq_of;