Date: September 27, 2020

nlp17. Frequency Distribution of Trigrams in Python NLTK

Spread the love

The counts of different n-grams can be found by using nltk.probability.FreqDist. FreqDist is a subclass of collections.Counter, so the object it creates maps each n-gram to the number of times it occurs.

We use the same text as before, but another random sentence has been added.


# nlp17.py
from __future__ import print_function
from nltk.probability import FreqDist
from nltk.collocations import ngrams
from nltk.tokenize import PunktWordTokenizer
from nltk.corpus import stopwords
# Sample text: the earlier paragraph plus one extra sentence, so that the
# trigram ('NLTK', 'leading', 'platform') occurs twice.
text = """
NLTK is a leading platform for building Python
programs to work with human language data. It's
considered the best by 2 out of 30 computational
linguists. NLTK leading platform of world.
"""

# English stopword list; a set copy gives constant-time membership tests
# in the filter below.
S = stopwords.words('english')
stop_set = set(S)

# NOTE(review): PunktWordTokenizer may not be exposed by newer NLTK
# releases -- confirm against the installed version before running.
tok = PunktWordTokenizer().tokenize

# Keep only tokens that are not stopwords and are longer than two characters.
words = [w for w in tok(text) if w not in stop_set and len(w) > 2]

# Build the trigrams as a list: the sequence is traversed once for printing
# and again by FreqDist, so a one-shot iterator would not be enough.
A = list(ngrams(words, 3))

# Print every trigram in order of appearance, then one blank separator line.
for a in A:
    print(a)
print()

# Count how often each distinct trigram occurs and print "trigram count".
fdist = FreqDist(A)
for k, v in fdist.items():
    print(k, v)

# ('NLTK', 'leading', 'platform')
# ('leading', 'platform', 'building')
# ('platform', 'building', 'Python')
# ('building', 'Python', 'programs')
# ('Python', 'programs', 'work')
# ('programs', 'work', 'human')
# ('work', 'human', 'language')
# ('human', 'language', 'data.')
# ('language', 'data.', 'considered')
# ('data.', 'considered', 'best')
# ('considered', 'best', 'computational')
# ('best', 'computational', 'linguists.')
# ('computational', 'linguists.', 'NLTK')
# ('linguists.', 'NLTK', 'leading')
# ('NLTK', 'leading', 'platform')
# ('leading', 'platform', 'world.')
#
# ('platform', 'building', 'Python') 1
# ('computational', 'linguists.', 'NLTK') 1
# ('considered', 'best', 'computational') 1
# ('linguists.', 'NLTK', 'leading') 1
# ('best', 'computational', 'linguists.') 1
# ('leading', 'platform', 'building') 1
# ('building', 'Python', 'programs') 1
# ('data.', 'considered', 'best') 1
# ('Python', 'programs', 'work') 1
# ('NLTK', 'leading', 'platform') 2
# ('leading', 'platform', 'world.') 1
# ('language', 'data.', 'considered') 1
# ('programs', 'work', 'human') 1
# ('work', 'human', 'language') 1
# ('human', 'language', 'data.') 1
Share

Leave a Reply

Your email address will not be published. Required fields are marked *