View on GitHub

iwa2ki.github.io

n-gram

use strict;
use warnings;

# Count every word n-gram in the input and print the totals.
#
# Input is read line by line from the files named on the command line, or
# from STDIN when none are given.  For each line, every contiguous run of
# words of every length (1 up to the number of words on that line) is
# counted; n-grams never span a line boundary.  Output is one
# "ngram<TAB>count" line per distinct n-gram, most frequent first.

# Turn one raw input line into a list of lower-cased words.
# Runs of whitespace (tabs included) are collapsed to a single space first,
# so tab-separated words stay separate instead of being fused when the
# non-word characters are deleted.  Every character other than ASCII
# letters, digits, '_', '-' and space is then removed (so "don't" becomes
# "dont").  Returns an empty list for a blank line.
sub tokenize {
  my ($line) = @_;
  $line =~ s/\s+/ /g;
  $line =~ s/[^A-Za-z0-9_\- ]//g;
  $line =~ s/^ +| +$//g;    # trim, so split yields no leading empty field
  return split / +/, lc $line;
}

# Increment, in the hash referenced by $count_of, the counter of every
# contiguous n-gram that can be built from @words.  Words of an n-gram are
# joined with a single space to form the hash key.
sub count_ngrams {
  my ($count_of, @words) = @_;
  for my $length (1 .. @words) {
    # Last valid start index is the one whose n-gram ends on the last word.
    for my $start (0 .. @words - $length) {
      my $ngram = join ' ', @words[$start .. $start + $length - 1];
      $count_of->{$ngram}++;
    }
  }
  return;
}

# Return the n-grams of %$count_of ordered by descending count.  Ties are
# broken alphabetically so the output is identical from run to run instead
# of following Perl's randomised hash order.
sub sorted_ngrams {
  my ($count_of) = @_;
  return sort { $count_of->{$b} <=> $count_of->{$a} or $a cmp $b }
    keys %$count_of;
}

my %ngrams;
while (my $line = <>) {
  count_ngrams(\%ngrams, tokenize($line));
}
for my $ngram (sorted_ngrams(\%ngrams)) {
  print $ngram, "\t", $ngrams{$ngram}, "\n";
}