nlp17. Frequency Distribution of Trigrams in Python NLTK

The counts of different n-grams can be found by using nltk.probability.FreqDist. An object of this class behaves like a Counter: it maps each n-gram to the number of times it occurs.

We use the same text as before, but another random sentence has been added.


# nlp17.py
from __future__ import print_function
from nltk.probability import FreqDist
from nltk.collocations import ngrams
from nltk.tokenize import PunktWordTokenizer
from nltk.corpus import stopwords
# Sample text. The last sentence repeats "NLTK leading platform" so that one
# trigram is counted more than once.
text = """
NLTK is a leading platform for building Python
programs to work with human language data. It's
considered the best by 2 out of 30 computational
linguists. NLTK leading platform of world.
"""

# A set makes each stopword membership test O(1) instead of scanning a list.
S = set(stopwords.words('english'))

# NOTE(review): PunktWordTokenizer appears to be unavailable in NLTK 3.x --
# confirm against the installed version. Swapping in another tokenizer will
# change the tokens (the sample output below keeps trailing periods, e.g.
# 'data.'), so the expected output would need regenerating.
tok = PunktWordTokenizer().tokenize

# Single pass: drop stopwords (case-sensitive comparison) and any token of
# two characters or fewer.
words = [w for w in tok(text) if w not in S and len(w) > 2]

# Trigrams over the filtered words, materialised as a list because they are
# both printed and counted.
A = list(ngrams(words, 3))

for a in A:
    print(a)
print()

# Count how many times each distinct trigram occurs, then print trigram and
# count side by side.
fdist = FreqDist(A)
for k, v in fdist.items():
    print(k, v)

# ('NLTK', 'leading', 'platform')
# ('leading', 'platform', 'building')
# ('platform', 'building', 'Python')
# ('building', 'Python', 'programs')
# ('Python', 'programs', 'work')
# ('programs', 'work', 'human')
# ('work', 'human', 'language')
# ('human', 'language', 'data.')
# ('language', 'data.', 'considered')
# ('data.', 'considered', 'best')
# ('considered', 'best', 'computational')
# ('best', 'computational', 'linguists.')
# ('computational', 'linguists.', 'NLTK')
# ('linguists.', 'NLTK', 'leading')
# ('NLTK', 'leading', 'platform')
# ('leading', 'platform', 'world.')
#
# ('platform', 'building', 'Python') 1
# ('computational', 'linguists.', 'NLTK') 1
# ('considered', 'best', 'computational') 1
# ('linguists.', 'NLTK', 'leading') 1
# ('best', 'computational', 'linguists.') 1
# ('leading', 'platform', 'building') 1
# ('building', 'Python', 'programs') 1
# ('data.', 'considered', 'best') 1
# ('Python', 'programs', 'work') 1
# ('NLTK', 'leading', 'platform') 2
# ('leading', 'platform', 'world.') 1
# ('language', 'data.', 'considered') 1
# ('programs', 'work', 'human') 1
# ('work', 'human', 'language') 1
# ('human', 'language', 'data.') 1

Leave a Reply

Your email address will not be published. Required fields are marked *