from glob import glob
word_files = glob('*.docx')
word_files
!pip install textract
import textract
text = textract.process(word_files[0])
print(type(text))
print(text[:10])
text = text.decode('utf-8')
print(type(text))
text[:200]
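# A minimal defensive variant (not in the original): errors='replace'
# substitutes any undecodable bytes instead of raising UnicodeDecodeError.
safe_text = textract.process(word_files[0]).decode('utf-8', errors='replace')
safe_text[:200]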
from bs4 import UnicodeDammit
with open(word_files[0], 'rb') as f:
    blob = f.read()

suggestion = UnicodeDammit(blob)
print(suggestion.original_encoding)
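# Hedged sketch: the detected encoding can be applied to the raw bytes.
# Since .docx is really a zip container, this mainly helps with plain-text
# files; it is illustrative here.
if suggestion.original_encoding is not None:
    print(blob.decode(suggestion.original_encoding, errors='replace')[:50])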
import string
translator = str.maketrans('', '', string.punctuation + string.digits)
text = text.translate(translator)
text[:100]
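# Quick demonstration of the translation table on a made-up sample string:
# punctuation and digits are deleted, letters and spaces pass through.
sample = 'Data, science: 2nd edition (2021)!'
sample.translate(translator)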
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
en_stopwords = stopwords.words('english')
print(en_stopwords[:10])
en_stopwords = set(en_stopwords)
words = text.lower().split()
words[:10]
words = [w for w in words if w not in en_stopwords and len(w) > 3]
words[:10]
bigrams = [' '.join(bg) for bg in nltk.bigrams(words)]
bigrams[:3]
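# The same pattern generalizes to any n with nltk.ngrams (a sketch; trigrams
# are not part of the original analysis).
trigrams = [' '.join(tg) for tg in nltk.ngrams(words, 3)]
trigrams[:3]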
ug_fdist = nltk.FreqDist(words)
bg_fdist = nltk.FreqDist(bigrams)
ug_fdist.most_common(20)
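# For comparison, the most frequent bigrams as well.
bg_fdist.most_common(20)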
import matplotlib.pyplot as plt
ax = ug_fdist.plot(20)
bg_fdist.plot(20)
from wordcloud import WordCloud
plt.figure(figsize=(5.5, 5.5))
wordcloud = WordCloud(collocations=False, height=300, width=300, scale=3).generate(' '.join(words))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.savefig('B17030_06_02.png', dpi=300)
plt.show()
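# WordCloud can also write the image directly, without matplotlib (a hedged
# alternative; the output filename here is made up).
wordcloud.to_file('wordcloud_direct.png')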
import os
import string
from glob import glob
import textract
import nltk
en_stopwords = set(nltk.corpus.stopwords.words('english'))
def create_fdist_visualizations_doc(path):
    """
    Takes a path to a folder with .docx files, reads and cleans text,
    then plots unigram and bigram frequency distributions.
    """
    word_docs = glob(os.path.join(path, '*.docx'))
    text = ' '.join([textract.process(w).decode('utf-8') for w in word_docs])
    # remove punctuation, numbers, stopwords
    translator = str.maketrans('', '', string.punctuation + string.digits)
    text = text.translate(translator)
    words = text.lower().split()
    words = [w for w in words if w not in en_stopwords and len(w) > 3]
    unigram_fd = nltk.FreqDist(words)
    bigrams = [' '.join(bg) for bg in nltk.bigrams(words)]
    bigram_fd = nltk.FreqDist(bigrams)
    unigram_fd.plot(20)
    bigram_fd.plot(20)
create_fdist_visualizations_doc(r'')
!apt-get install python-dev libxml2-dev libxslt1-dev antiword unrtf poppler-utils pstotext tesseract-ocr \
    flac ffmpeg lame libmad0 libsox-fmt-mp3 sox libjpeg-dev swig
doc_text = textract.process(r'gfsa03-04rpt.doc')
doc_text = doc_text.decode('utf-8')
doc_text[:100]
with open(r'gfsa03-04rpt.doc', 'rb') as f:
    blob = f.read()

suggestion = UnicodeDammit(blob)
print(suggestion.original_encoding)
!pip install python-docx
import docx
doc = docx.Document(word_files[0])
text = ' '.join([p.text for p in doc.paragraphs])
text[:100]
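# python-docx also exposes structured parts such as tables, which plain-text
# extraction flattens; a minimal sketch, assuming the document contains tables.
for table in doc.tables:
    for row in table.rows:
        print([cell.text for cell in row.cells])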
pdf_files = glob('*.pdf')
pdf_files
from bs4 import UnicodeDammit
# Again, we can check the encoding of the file if we want.
with open(pdf_files[0], 'rb') as f:
    blob = f.read()

suggestion = UnicodeDammit(blob)
print(suggestion.original_encoding)
!pip install PyMuPDF
import fitz  # PyMuPDF is imported under the name fitz
with fitz.open(pdf_files[0]) as doc:
    text = ""
    for page in doc:
        # get_text() is the current method name (getText in older PyMuPDF)
        text += page.get_text()
print(text[:50])
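# Keeping pages separate can help with cleaning or page-level analysis
# (a sketch using the same PyMuPDF calls as above).
with fitz.open(pdf_files[0]) as doc:
    pages = [page.get_text() for page in doc]
print(len(pages), 'pages extracted')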
lines = text.split('\n')
cleaned_lines = []
for l in lines:
    if len(l) == 0:
        continue
    # rejoin words that were hyphenated across line breaks
    if l[-1] == '-':
        cleaned_lines.append(l[:-1])
    else:
        cleaned_lines.append(l + ' ')
cleaned = ''.join(cleaned_lines)
cleaned[:200]
import string
def extract(w):
    """Return all text from a PDF file using PyMuPDF."""
    with fitz.open(w) as doc:
        text = ""
        for page in doc:
            text += page.get_text()
    return text
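# Quick sanity check of the helper on the first PDF.
extract(pdf_files[0])[:100]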
def create_fdist_visualizations(path, extension='pdf'):
    """
    Takes a path to a folder with document files (.doc, .docx, or .pdf),
    reads and cleans the text, then plots unigram and bigram frequency
    distributions.
    """
    docs = glob(os.path.join(path, f'*.{extension}'))
    if extension in ['doc', 'docx']:
        text = ' '.join(textract.process(w).decode('utf-8') for w in docs)
    elif extension == 'pdf':
        text = ' '.join(extract(w) for w in docs)
    # rejoin words that were hyphenated across line breaks
    lines = text.split('\n')
    cleaned_lines = []
    for l in lines:
        if len(l) == 0:
            continue
        if l[-1] == '-':
            cleaned_lines.append(l[:-1])
        else:
            cleaned_lines.append(l + ' ')
    text = ''.join(cleaned_lines)
    # remove punctuation, numbers, stopwords
    translator = str.maketrans('', '', string.punctuation + string.digits)
    text = text.translate(translator)
    words = text.lower().split()
    words = [w for w in words if w not in en_stopwords and len(w) > 3]
    unigram_fd = nltk.FreqDist(words)
    bigrams = [' '.join(bg) for bg in nltk.bigrams(words)]
    bigram_fd = nltk.FreqDist(bigrams)
    unigram_fd.plot(20)
    bigram_fd.plot(20)
create_fdist_visualizations('')
!pip install tika
from tika import parser
parsed = parser.from_file(pdf_files[0])
# returns a dictionary with the extracted text under 'content'
print(parsed.keys())
print(parsed['content'].strip()[:200])
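# The same dictionary carries file metadata; available keys vary by file and
# Tika version, so .get avoids a KeyError.
print(parsed['metadata'].get('Content-Type'))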