# Strip every ASCII punctuation character and digit from the text.
import string
unwanted_chars = string.punctuation + string.digits
# Mapping each unwanted character to None makes str.translate delete it.
translator = str.maketrans(dict.fromkeys(unwanted_chars))
text = text.translate(translator)
# Notebook-style preview of the first 100 cleaned characters.
text[:100]
# Fetch NLTK's stop-word corpus and load the English list.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopword_list = stopwords.words('english')
print(stopword_list[:10])
# A set gives constant-time membership tests in the filter below.
en_stopwords = set(stopword_list)
# NOTE(review): punctuation was already deleted from `text`, so contractions
# now read like "dont"; they presumably no longer match apostrophe-bearing
# stop words -- confirm whether that matters for this analysis.

# Lower-case the text and split it on whitespace into tokens.
words = text.lower().split()
words[:10]
# Keep only tokens longer than three characters that are not stop words.
words = list(filter(lambda w: len(w) > 3 and w not in en_stopwords, words))
words[:10]
# Turn each pair produced by nltk.bigrams into one space-separated string.
bigrams = list(map(' '.join, nltk.bigrams(words)))
bigrams[:3]
# Frequency distributions over the single words and over the bigram strings.
ug_fdist, bg_fdist = nltk.FreqDist(words), nltk.FreqDist(bigrams)
# Notebook-style previews of the 20 most common entries of each distribution.
ug_fdist.most_common(20)
bg_fdist.most_common(20)
# Plot the unigram and bigram frequency distributions.
# `plt` is imported here but not referenced directly in the lines shown.
import matplotlib.pyplot as plt
# The argument 20 is passed straight to FreqDist.plot -- presumably it limits
# the chart to the 20 most frequent samples; confirm against the NLTK docs.
# `ax` keeps the return value of the unigram plot; it is not used afterwards
# in the visible lines.
ax = ug_fdist.plot(20)
bg_fdist.plot(20)
No comments:
Post a Comment