Wednesday, June 30, 2021

【PYTHON】Remove punctuation, numbers, and the most common words

 #### Remove punctuation and numbers.


import string


# Deletion table for str.translate(): every ASCII punctuation character and
# every digit is mapped to None, which removes it from the string.
translator = str.maketrans(dict.fromkeys(string.punctuation + string.digits))

# Strip the unwanted characters from the whole document in one pass.
text = text.translate(translator)

# Preview the first 100 characters of the cleaned text (notebook display).
text[:100]


import nltk

# Fetch the NLTK stopword corpus so that stopwords.words() can read it.
nltk.download('stopwords')


from nltk.corpus import stopwords


en_stopwords = stopwords.words('english')


print(en_stopwords[:10])


# Use a set for fast membership tests in the filter below.
#
# `text` has already been passed through `translator` (built earlier in this
# script), which removed all punctuation. A contraction such as "don't"
# therefore appears in the text as "dont" and would never match the stopword
# "don't". Add the punctuation-free spelling of every stopword, produced with
# the same `translator`, so those tokens are filtered out as well.
# NOTE(review): depending on the NLTK version, stripped contractions can
# collide with ordinary words (e.g. "she'll" -> "shell"); the stripped text can
# no longer tell the two apart, so such tokens are treated as stopwords.
en_stopwords = set(en_stopwords) | {w.translate(translator) for w in en_stopwords}


# Lowercase the cleaned text and split it on whitespace into tokens.
words = text.lower().split()


words[:10]


# Keep only tokens that are not stopwords and are longer than three characters.
words = [w for w in words if w not in en_stopwords and len(w) > 3]


words[:10]


# Join each pair of adjacent tokens into one space-separated bigram string.
# The comprehension already builds a list, so no extra list() wrapper is needed.
bigrams = [' '.join(bg) for bg in nltk.bigrams(words)]

bigrams[:3]


# Frequency distributions over the single words (unigrams) and the bigrams.
ug_fdist = nltk.FreqDist(words)

bg_fdist = nltk.FreqDist(bigrams)


# The 20 most frequent unigrams with their counts (notebook display).
ug_fdist.most_common(20)


# The 20 most frequent bigrams with their counts (notebook display).
bg_fdist.most_common(20)


import matplotlib.pyplot as plt


# Plot the unigram frequency distribution, limited to 20 samples.
# NOTE(review): `ax` is not used again in the visible code; presumably it is
# kept so the figure can be customised afterwards — confirm.
ax = ug_fdist.plot(20)


# Plot the bigram frequency distribution, again limited to 20 samples.
bg_fdist.plot(20)

No comments:

Post a Comment

End of Summer Sale ☀️😎

20% OFF Inside! 🤯