from glob import glob
word_files = glob('*.docx')
word_files
!pip install textract
import textract
text = textract.process(word_files[0])
print(type(text))
print(text[:10])
text = text.decode('utf-8')
print(type(text))
text[:200]
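# A minimal defensive variant (not in the original): errors='replace'
# substitutes any undecodable bytes instead of raising UnicodeDecodeError.
safe_text = textract.process(word_files[0]).decode('utf-8', errors='replace')
safe_text[:200]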
from bs4 import UnicodeDammit
with open(word_files[0], 'rb') as f:
    blob = f.read()

suggestion = UnicodeDammit(blob)
print(suggestion.original_encoding)
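# Hedged sketch: the detected encoding can be applied to the raw bytes.
# Since .docx is really a zip container, this mainly helps with plain-text
# files; it is illustrative here.
if suggestion.original_encoding is not None:
    print(blob.decode(suggestion.original_encoding, errors='replace')[:50])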
import string
translator = str.maketrans('', '', string.punctuation + string.digits)
text = text.translate(translator)
text[:100]
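# Quick demonstration of the translation table on a made-up sample string:
# punctuation and digits are deleted, letters and spaces pass through.
sample = 'Data, science: 2nd edition (2021)!'
sample.translate(translator)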
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
en_stopwords = stopwords.words('english')
print(en_stopwords[:10])
en_stopwords = set(en_stopwords)
words = text.lower().split()
words[:10]
words = [w for w in words if w not in en_stopwords and len(w) > 3]
words[:10]
bigrams = [' '.join(bg) for bg in nltk.bigrams(words)]
bigrams[:3]
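# The same pattern generalizes to any n with nltk.ngrams (a sketch; trigrams
# are not part of the original analysis).
trigrams = [' '.join(tg) for tg in nltk.ngrams(words, 3)]
trigrams[:3]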
ug_fdist = nltk.FreqDist(words)
bg_fdist = nltk.FreqDist(bigrams)
ug_fdist.most_common(20)
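# For comparison, the most frequent bigrams as well.
bg_fdist.most_common(20)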
import matplotlib.pyplot as plt
ax = ug_fdist.plot(20)
bg_fdist.plot(20)
from wordcloud import WordCloud
plt.figure(figsize=(5.5, 5.5))
wordcloud = WordCloud(collocations=False, height=300, width=300, scale=3).generate(' '.join(words))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.savefig('B17030_06_02.png', dpi=300)
plt.show()
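# WordCloud can also write the image directly, without matplotlib (a hedged
# alternative; the output filename here is made up).
wordcloud.to_file('wordcloud_direct.png')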
import os
import string
from glob import glob
import textract
import nltk
en_stopwords = set(nltk.corpus.stopwords.words('english'))
def create_fdist_visualizations_doc(path):
    """
    Takes a path to a folder with .docx files, reads and cleans text,
    then plots unigram and bigram frequency distributions.
    """
    word_docs = glob(os.path.join(path, '*.docx'))
    text = ' '.join([textract.process(w).decode('utf-8') for w in word_docs])
    # remove punctuation, numbers, stopwords
    translator = str.maketrans('', '', string.punctuation + string.digits)
    text = text.translate(translator)
    words = text.lower().split()
    words = [w for w in words if w not in en_stopwords and len(w) > 3]
    unigram_fd = nltk.FreqDist(words)
    bigrams = [' '.join(bg) for bg in nltk.bigrams(words)]
    bigram_fd = nltk.FreqDist(bigrams)
    unigram_fd.plot(20)
    bigram_fd.plot(20)
create_fdist_visualizations_doc(r'')
!apt-get install python-dev libxml2-dev libxslt1-dev antiword unrtf poppler-utils pstotext tesseract-ocr \
    flac ffmpeg lame libmad0 libsox-fmt-mp3 sox libjpeg-dev swig
doc_text = textract.process(r'gfsa03-04rpt.doc')
doc_text = doc_text.decode('utf-8')
doc_text[:100]
with open(r'gfsa03-04rpt.doc', 'rb') as f:
    blob = f.read()

suggestion = UnicodeDammit(blob)
print(suggestion.original_encoding)
!pip install python-docx
import docx
doc = docx.Document(word_files[0])
text = ' '.join([p.text for p in doc.paragraphs])
text[:100]
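# python-docx also exposes structured parts such as tables, which plain-text
# extraction flattens; a minimal sketch, assuming the document contains tables.
for table in doc.tables:
    for row in table.rows:
        print([cell.text for cell in row.cells])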
pdf_files = glob('*.pdf')
pdf_files
from bs4 import UnicodeDammit
# Again, we can check the encoding of the file if we want.
with open(pdf_files[0], 'rb') as f:
    blob = f.read()

suggestion = UnicodeDammit(blob)
print(suggestion.original_encoding)
!pip install PyMuPDF
import fitz  # PyMuPDF is imported under the name fitz
with fitz.open(pdf_files[0]) as doc:
    text = ""
    for page in doc:
        # get_text() is the current method name (getText in older PyMuPDF)
        text += page.get_text()
print(text[:50])
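# Keeping pages separate can help with cleaning or page-level analysis
# (a sketch using the same PyMuPDF calls as above).
with fitz.open(pdf_files[0]) as doc:
    pages = [page.get_text() for page in doc]
print(len(pages), 'pages extracted')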
lines = text.split('\n')
cleaned_lines = []
for l in lines:
    if len(l) == 0:
        continue
    # rejoin words that were hyphenated across line breaks
    if l[-1] == '-':
        cleaned_lines.append(l[:-1])
    else:
        cleaned_lines.append(l + ' ')
cleaned = ''.join(cleaned_lines)
cleaned[:200]
import string
def extract(w):
    """Return all text from a PDF file using PyMuPDF."""
    with fitz.open(w) as doc:
        text = ""
        for page in doc:
            text += page.get_text()
    return text
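# Quick sanity check of the helper on the first PDF.
extract(pdf_files[0])[:100]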
def create_fdist_visualizations(path, extension='pdf'):
    """
    Takes a path to a folder with document files (.doc, .docx, or .pdf),
    reads and cleans the text, then plots unigram and bigram frequency
    distributions.
    """
    docs = glob(os.path.join(path, f'*.{extension}'))
    if extension in ['doc', 'docx']:
        text = ' '.join(textract.process(w).decode('utf-8') for w in docs)
    elif extension == 'pdf':
        text = ' '.join(extract(w) for w in docs)
    # rejoin words that were hyphenated across line breaks
    lines = text.split('\n')
    cleaned_lines = []
    for l in lines:
        if len(l) == 0:
            continue
        if l[-1] == '-':
            cleaned_lines.append(l[:-1])
        else:
            cleaned_lines.append(l + ' ')
    text = ''.join(cleaned_lines)
    # remove punctuation, numbers, stopwords
    translator = str.maketrans('', '', string.punctuation + string.digits)
    text = text.translate(translator)
    words = text.lower().split()
    words = [w for w in words if w not in en_stopwords and len(w) > 3]
    unigram_fd = nltk.FreqDist(words)
    bigrams = [' '.join(bg) for bg in nltk.bigrams(words)]
    bigram_fd = nltk.FreqDist(bigrams)
    unigram_fd.plot(20)
    bigram_fd.plot(20)
create_fdist_visualizations('')
!pip install tika
from tika import parser
parsed = parser.from_file(pdf_files[0])
# returns a dictionary with the extracted text under 'content'
print(parsed.keys())
print(parsed['content'].strip()[:200])
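# The same dictionary carries file metadata; available keys vary by file and
# Tika version, so .get avoids a KeyError.
print(parsed['metadata'].get('Content-Type'))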