#!/usr/bin/perl
#
# Guess which taxonomy category (and sub-category) best describes a plain-text
# document, by counting how often the words of each category label occur in it.
#
# Usage: script.pl [INPUT_FILE]
#   INPUT_FILE defaults to /tmp/full_plain_text.txt.
#
# Output: the best top-level category on one line, followed by the best
# sub-category (prefixed with " +-- ") when one matched. Nothing is printed
# when no taxonomy word occurs in the text.

use strict;
use warnings;

use Data::Dumper;
use File::Slurp qw/read_file/;

# Two-level taxonomy: top-level category => list of its sub-categories.
#
# The sub-category lists are array refs on purpose. Writing them as
# { 'a', 'b', ... } makes Perl treat the list as key/value pairs, so every
# second entry ends up as a hash *value* and is never seen by keys().
my $tax = {
    'audio video' => [
        'sound audio',
        'video',
    ],
    'business enterprise' => [
        'scheduling',
        'office suites',
        'e-commerce shopping',
        'desktop publishing',
        'report generators',
        'knowledge management',
        'enterprise',
        'financial',
        'todo lists',
        'modelling',
        'project management',
        'time tracking',
        'insurance',
    ],
    'communications' => [
        'chat',
        'rss feed readers',
        'bbs',
        'conferencing',
        'email',
        'fax',
        'fido',
        'ham radio',
        'usenet news',
        'internet phone',
        'synchronization',
        'streaming',
        'telephony',
        'file sharing',
    ],
    'development' => [
        'software development',
        'database',
        'text editors',
        'data formats',
    ],
    'home education' => [
        'religion philosophy',
        'education',
        'printing',
        'social sciences',
    ],
    'games' => [
        'hobbies',
        'side-scrolling arcade',
        'flight simulator',
        'sports',
        'mmorpg',
        'puzzle',
        'real time tactical',
        'real time strategy',
        'first person shooter',
        'turn base strategy',
        'role-playing',
        'card games',
        'multi user dungeon',
        'console based',
        'development framework',
        'multiplayer',
        'simulation',
        'board games',
    ],
    'graphics' => [
        'capture',
        'conversion',
        'editors',
        '3d modeling',
        '3d rendering',
        'fractals procedural generation',
        'viewers',
        'image galleries',
        'presentation',
        'handwriting recognition',
        'animation',
    ],
    'science engineering' => [
        'chemistry',
        'information analysis',
        'interface engine',
        'protocol translator',
        'physics',
        'artificial intelligence',
        'astronomy',
        'visualization',
        'mapping',
        'medical',
        'mechanical civil engineering',
        'human machine interfaces',
        'medical physics',
        'molecular mechanics',
        'quantum computing',
        'earth',
        'ecosystem',
        'test measurement',
        'molecular',
        'building automation',
        'simulators',
        'robotics',
        'scada',
        'mathematics',
        'linguistics',
        'electronic design automation',
        'bio-informatics',
    ],
    'security utilities' => [
        'archiving',
        'file management',
        'power ups',
        'terminals',
        'security',
        'log rotation',
        'file transfer protocol',
        'capture',
        'log analysis',
    ],
    'system administration' => [
        'software distribution',
        'benchmark',
        'boot',
        'clustering',
        'file system',
        'embedded system',
        'operation system kernel',
        'cron scheduling',
        'installation',
        'logging',
        'networking',
        'power ups',
        'home automation',
        'os distribution',
        'system shell',
        'distributed computing',
        'emulators',
        'hardware',
        'search',
        'storage',
    ],
};

# Text to classify. The path may be given as the first command-line argument;
# without one the historical fixed location is used.
my $input_file = @ARGV ? $ARGV[0] : '/tmp/full_plain_text.txt';
my $c = read_file($input_file);

# First level: score every top-level category by the words of its own label
# plus the words of all its sub-category labels. Words that occur in several
# labels are deliberately counted once per label, as before.
# Keys are sorted so that ties are resolved the same way on every run
# (hash order is randomised per process).
my $first;
my $last = 0;
foreach my $um (sort keys %$tax) {
    my @words = map { split ' ', $_ } $um, @{ $tax->{$um} };
    my $curr  = findAll(@words);
    if ($curr > $last) {
        $last  = $curr;
        $first = $um;
    }
}

# Second level: within the winning category, score each sub-category by the
# words of its own label only. Skipped when nothing matched at the first
# level, so that $tax is never indexed with an undefined key.
my $second;
if (defined $first) {
    $last = 0;
    foreach my $dois (@{ $tax->{$first} }) {
        my $curr = findAll(split ' ', $dois);
        if ($curr > $last) {
            $last   = $curr;
            $second = $dois;
        }
    }
}

printf("%s\n", myformat($first))        if defined $first;
printf(" +-- %s\n", myformat($second)) if defined $second;

# findAll(@words)
#
# Returns the total number of case-insensitive whole-word occurrences of the
# given words in the file-level text $c. An optional trailing "s" on a word is
# accepted, so "editor" also counts "editors". Entries without any word
# character are ignored.
sub findAll {
    my @words = @_;
    my $found = 0;
    foreach my $word (@words) {
        next unless $word =~ m/\w/;
        # \Q...\E keeps the taxonomy word literal even if it ever contains
        # regex metacharacters.
        my $re = qr/\b\Q$word\Es?\b/i;
        $found++ while $c =~ /$re/g;
    }
    return $found;
}

# myformat($str)
#
# Returns $str with every whitespace-separated word capitalised and the words
# re-joined by single spaces, e.g. "audio video" -> "Audio Video".
sub myformat {
    my $str = shift;
    return join ' ', map { ucfirst } split ' ', $str;
}