Friday, July 2, 2021

【PYTHON】Reading PDF and Word files

from glob import glob

word_files = glob('*.docx')

word_files


pip install textract


import textract


text = textract.process(word_files[0])

print(type(text))

print(text[:10])

text = text.decode('utf-8')

print(type(text))

text[:200]


from bs4 import UnicodeDammit

with open(word_files[0], 'rb') as f:
  blob = f.read()
  suggestion = UnicodeDammit(blob)
  print(suggestion.original_encoding)


import string


translator = str.maketrans('', '', string.punctuation + string.digits)

text = text.translate(translator)

text[:100]
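To see what this translation table actually does, here is a tiny self-contained sketch (the sample sentence is made up):

import string

translator = str.maketrans('', '', string.punctuation + string.digits)
# punctuation and digits are deleted outright; whitespace is untouched,
# so a later .split() still works as expected
print("Hello, world! It's 2021 -- isn't it?".translate(translator))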


import nltk

nltk.download('stopwords')


from nltk.corpus import stopwords


en_stopwords = stopwords.words('english')


print(en_stopwords[:10])


en_stopwords = set(en_stopwords)


words = text.lower().split()


words[:10]


words = [w for w in words if w not in en_stopwords and len(w) > 3]


words[:10]


bigrams = [' '.join(bg) for bg in nltk.bigrams(words)]

bigrams[:3]
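nltk.bigrams simply pairs each token with the one that follows it; a quick toy illustration:

# bigrams of any sequence are just its consecutive pairs
print(list(nltk.bigrams(['natural', 'language', 'processing'])))
# [('natural', 'language'), ('language', 'processing')]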


ug_fdist = nltk.FreqDist(words)

bg_fdist = nltk.FreqDist(bigrams)


ug_fdist.most_common(20)


import matplotlib.pyplot as plt


ax = ug_fdist.plot(20)


bg_fdist.plot(20)


from wordcloud import WordCloud

plt.figure(figsize=(5.5, 5.5))

wordcloud = WordCloud(collocations=False, height=300, width=300, scale=3).generate(' '.join(words))

plt.imshow(wordcloud, interpolation='bilinear')

plt.axis("off")

plt.savefig('B17030_06_02.png', dpi=300)

plt.show()
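WordCloud can also be driven from counts instead of raw text. Here is a minimal sketch reusing the bigram frequency distribution from above; generate_from_frequencies accepts a phrase-to-count mapping, and FreqDist is dict-like:

# build a bigram cloud from the counts we already computed
bigram_cloud = WordCloud(height=300, width=300, scale=3).generate_from_frequencies(dict(bg_fdist))
plt.figure(figsize=(5.5, 5.5))
plt.imshow(bigram_cloud, interpolation='bilinear')
plt.axis("off")
plt.show()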


import os

from glob import glob

import textract

import nltk

import string


en_stopwords = set(nltk.corpus.stopwords.words('english'))



def create_fdist_visualizations_doc(path):
  """
  Takes a path to a folder with .docx files, reads and cleans text,
  then plots unigram and bigram frequency distributions.
  """
  word_docs = glob(os.path.join(path, '*.docx'))
  text = ' '.join([textract.process(w).decode('utf-8') for w in word_docs])

  # remove punctuation, numbers, stopwords
  translator = str.maketrans('', '', string.punctuation + string.digits)
  text = text.translate(translator)
  words = text.lower().split()
  words = [w for w in words if w not in en_stopwords and len(w) > 3]

  unigram_fd = nltk.FreqDist(words)
  bigrams = [' '.join(bg) for bg in nltk.bigrams(words)]
  bigram_fd = nltk.FreqDist(bigrams)

  unigram_fd.plot(20)
  bigram_fd.plot(20)


create_fdist_visualizations_doc(r'')


!apt-get install python-dev libxml2-dev libxslt1-dev antiword unrtf poppler-utils pstotext tesseract-ocr \
flac ffmpeg lame libmad0 libsox-fmt-mp3 sox libjpeg-dev swig

doc_text = textract.process(r'gfsa03-04rpt.doc')

doc_text = doc_text.decode('utf-8')

doc_text[:100]


with open(r'gfsa03-04rpt.doc', 'rb') as f:
  blob = f.read()
  suggestion = UnicodeDammit(blob)


print(suggestion.original_encoding)


pip install python-docx


import docx

doc = docx.Document(word_files[0])


text = ' '.join([p.text for p in doc.paragraphs])


text[:200]
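The paragraph loop above skips any tables in the document. python-docx exposes those too, via doc.tables; a short sketch collecting their cell text as well:

# each table has rows, and each row has cells with a .text attribute
table_text = ' '.join(cell.text
                      for table in doc.tables
                      for row in table.rows
                      for cell in row.cells)

table_text[:100]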


pdf_files = glob('*.pdf')

pdf_files


from bs4 import UnicodeDammit

# Again, we can check the encoding of the file if we want.

with open(pdf_files[0], 'rb') as f:
  blob = f.read()
  suggestion = UnicodeDammit(blob)


print(suggestion.original_encoding)


pip install PyMuPDF


import fitz

with fitz.open(pdf_files[0]) as doc:
  text = ""
  for page in doc:
    # get_text() is the current PyMuPDF name; getText() is a deprecated alias
    text += page.get_text()


print(text[:50])


lines = text.split('\n')
cleaned_lines = []
for l in lines:
  if len(l) == 0:
    continue
  if l[-1] == '-':
    # the line ends mid-word: drop the hyphen and join with the next line
    cleaned_lines.append(l[:-1])
  else:
    # otherwise the newline stood for a space between words
    cleaned_lines.append(l + ' ')

cleaned = ''.join(cleaned_lines)

cleaned[:200]
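To make the hyphen-repair logic concrete, here is what it does to an invented three-line snippet:

raw = 'the local commu-\nnity gathered\nat noon'
fixed = []
for l in raw.split('\n'):
  if len(l) == 0:
    continue
  if l[-1] == '-':
    fixed.append(l[:-1])   # drop the hyphen; the word continues on the next line
  else:
    fixed.append(l + ' ')  # the newline stood for a space
print(''.join(fixed))  # 'the local community gathered at noon '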





import string


def extract(w):
  """Extract all text from a PDF with PyMuPDF."""
  with fitz.open(w) as doc:
    text = ""
    for page in doc:
      text += page.get_text()
  return text


def create_fdist_visualizations(path, extension='pdf'):
  """
  Takes a path to a folder with files of the given extension ('pdf',
  'doc', or 'docx'), reads and cleans the text, then plots unigram and
  bigram frequency distributions.
  """
  docs = glob(os.path.join(path, f'*.{extension}'))
  if extension in ['doc', 'docx']:
    text = ' '.join(textract.process(w).decode('utf-8') for w in docs)
  elif extension == 'pdf':
    text = ' '.join(extract(w) for w in docs)
    # repair words hyphenated across line breaks, as above
    lines = text.split('\n')
    cleaned_lines = []
    for l in lines:
      if len(l) == 0:
        continue
      if l[-1] == '-':
        cleaned_lines.append(l[:-1])
      else:
        cleaned_lines.append(l + ' ')
    text = ''.join(cleaned_lines)

  # remove punctuation, numbers, stopwords
  translator = str.maketrans('', '', string.punctuation + string.digits)
  text = text.translate(translator)
  words = text.lower().split()
  words = [w for w in words if w not in en_stopwords and len(w) > 3]

  unigram_fd = nltk.FreqDist(words)
  bigrams = [' '.join(bg) for bg in nltk.bigrams(words)]
  bigram_fd = nltk.FreqDist(bigrams)

  unigram_fd.plot(20)
  bigram_fd.plot(20)




create_fdist_visualizations('')


pip install tika



from tika import parser

text = parser.from_file(pdf_files[0])

# returns a dictionary

print(text.keys())

print(text['content'].strip()[:200])
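Besides 'content', the parsed dictionary carries 'metadata' and 'status' entries; a quick sketch (the metadata key names vary by file):

print(text['status'])                        # status code from the tika server, e.g. 200
print(text['metadata'].get('Content-Type'))  # e.g. 'application/pdf'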
