How to extract text from pdf in Python 3.7
Using tika worked for me!
from tika import parser
# Parse the PDF with Tika; the result is a dict whose 'content' entry holds
# the extracted text.
rawText = parser.from_file('January2019.pdf')
# 'content' is None when Tika finds no extractable text (for example a
# scanned, image-only PDF), so fall back to an empty string before splitting
# instead of crashing with AttributeError.
rawList = (rawText['content'] or '').splitlines()
This made it really easy to separate each line of the bank statement into a list.
Extract text from pdf file using javascript
Here is a nice example of how to use pdf.js for extracting the text:
http://git.macropus.org/2011/11/pdftotext/example/
Of course, you will have to remove a lot of code for your purpose, but it should do the job.
How to extract text from multiple pdf in a location with specific line and store in Excel?
Tika is one of the Python packages that you can use to extract the data from your PDF files.
In the example below I'm using Tika
and regular expressions to extract these five data elements:
- bid no
- end date
- item category
- organisation name
- total quantity
import re as regex
from tika import parser
def second_group(match, default='N/A'):
    """Return capture group 2 of a regex match, or *default* if there is no match.

    Every pattern used below captures the field label in group 1 and the
    field value in group 2. ``regex.search`` returns ``None`` when a field is
    absent from the document; returning *default* in that case avoids the
    ``AttributeError`` that ``None.group(2)`` would raise.
    """
    return match.group(2) if match else default


def main():
    """Print the five bid fields extracted from one PDF."""
    parse_entire_pdf = parser.from_file('2022251527199.pdf', xmlContent=True)
    # The extracted text is stored under the 'content' key, so look it up
    # directly rather than looping over every key of the result. A missing
    # or empty value is treated as an empty string.
    values = parse_entire_pdf.get('content') or ''

    bid_number = regex.search(r'(Bid Number:)\W(GEM\W\d{4}\W[A-Z]\W\d+)', values)
    print(second_group(bid_number))
    # Output: GEM/2022/B/1916455

    bid_end_date = regex.search(r'(Bid End Date\WTime)\W(\d{2}-\d{2}-\d{4}\W\d{2}:\d{2}:\d{2})', values)
    print(second_group(bid_end_date))
    # Output: 21-02-2022 15:00:00

    org_name = regex.search(r'(Organisation Name)\W(.*)', values)
    print(second_group(org_name))
    # Output: State Election Commission (sec), Gujarat

    item_category = regex.search(r'(Item Category)\W(.*)', values)
    print(second_group(item_category))
    # Output: Desktop Computers (Q2) , Computer Printers (Q2)

    total_quantity = regex.search(r'(Total Quantity)\W(\d+)', values)
    print(second_group(total_quantity))
    # Output: 18


if __name__ == '__main__':
    main()
Here is one way to write out the extracted data to a CSV file:
import csv
import re as regex
from tika import parser
# One compiled pattern per CSV column, in column order. In every pattern
# capture group 1 is the field label and capture group 2 is the field value.
FIELD_PATTERNS = {
    'bid_number': regex.compile(r'(Bid Number:)\W(GEM\W\d{4}\W[A-Z]\W\d+)'),
    'bid_end_date': regex.compile(r'(Bid End Date\WTime)\W(\d{2}-\d{2}-\d{4}\W\d{2}:\d{2}:\d{2})'),
    'org_name': regex.compile(r'(Organisation Name)\W(.*)'),
    'item_category': regex.compile(r'(Item Category)\W(.*)'),
    'total_quantity': regex.compile(r'(Total Quantity)\W(\d+)'),
}


def extract_fields(text, missing='N/A'):
    """Return one CSV row (a list of strings) extracted from *text*.

    The row has one entry per key of ``FIELD_PATTERNS``, in the same order.
    A field whose pattern does not match yields *missing* instead of raising
    ``AttributeError`` on a ``None`` match, so a document that lacks, say,
    an organisation name still produces a complete row.
    """
    row = []
    for pattern in FIELD_PATTERNS.values():
        match = pattern.search(text)
        row.append(match.group(2) if match else missing)
    return row


def main():
    """Extract the bid fields from each document and write them to out.csv."""
    document_elements = []
    # processing 2 documents
    documents = ['202225114747453.pdf', '2022251527199.pdf']
    for doc in documents:
        parse_entire_pdf = parser.from_file(doc, xmlContent=True)
        # The extracted text is stored under the 'content' key; a missing or
        # empty value is treated as an empty string.
        document_elements.append(extract_fields(parse_entire_pdf.get('content') or ''))

    with open("out.csv", "w", newline="") as f:
        headerList = list(FIELD_PATTERNS)
        writer = csv.writer(f)
        writer.writerow(headerList)
        writer.writerows(document_elements)


if __name__ == '__main__':
    main()
Here is the additional code that you asked for in the comments.
import os
import re as regex
from tika import parser
# One compiled pattern per CSV column, in column order. In every pattern
# capture group 1 is the field label and capture group 2 is the field value.
FIELD_PATTERNS = {
    'bid_number': regex.compile(r'(Bid Number:)\W(GEM\W\d{4}\W[A-Z]\W\d+)'),
    'bid_end_date': regex.compile(r'(Bid End Date\WTime)\W(\d{2}-\d{2}-\d{4}\W\d{2}:\d{2}:\d{2})'),
    'org_name': regex.compile(r'(Organisation Name)\W(.*)'),
    'item_category': regex.compile(r'(Item Category)\W(.*)'),
    'total_quantity': regex.compile(r'(Total Quantity)\W(\d+)'),
}


def extract_fields(text, missing='N/A'):
    """Return one CSV row (a list of strings) extracted from *text*.

    The row has one entry per key of ``FIELD_PATTERNS``, in the same order.
    A field whose pattern does not match yields *missing* instead of raising
    ``AttributeError`` on a ``None`` match, so a PDF without an organisation
    name still produces a complete row.
    """
    row = []
    for pattern in FIELD_PATTERNS.values():
        match = pattern.search(text)
        row.append(match.group(2) if match else missing)
    return row


def find_pdfs(root):
    """Yield the full path of every PDF file found under *root*, recursively.

    The extension test is case-insensitive, so ``REPORT.PDF`` is picked up
    as well as ``report.pdf``.
    """
    for dirpath, _dirnames, filenames in os.walk(root):
        for filename in filenames:
            if filename.lower().endswith(".pdf"):
                yield os.path.join(dirpath, filename)


def main():
    """Extract the bid fields from every PDF under pdf_files/ into out.csv."""
    # Imported here because the import lines of this snippet bring in only
    # os, re and tika, while csv.writer is needed below.
    import csv

    pdf_directory = "pdf_files"
    pdf_directory_abspath = os.path.abspath(pdf_directory)

    document_elements = []
    for pdf_path in find_pdfs(pdf_directory_abspath):
        parse_entire_pdf = parser.from_file(pdf_path, xmlContent=True)
        # The extracted text is stored under the 'content' key; a missing or
        # empty value is treated as an empty string.
        document_elements.append(extract_fields(parse_entire_pdf.get('content') or ''))

    with open("out.csv", "w", newline="") as f:
        headerList = list(FIELD_PATTERNS)
        writer = csv.writer(f)
        writer.writerow(headerList)
        writer.writerows(document_elements)


if __name__ == '__main__':
    main()
SPECIAL NOTE: I noted that some PDFs don't have an org_name, so you will have to figure out how to handle these with either a N/A, None, or Null
How to extract text from this compressed PDF/A?
If you want to decompress the streams in a PDF file, I can recommend using qpdf, but on this file
qpdf --decrypt --stream-data=uncompress document.pdf out.pdf
doesn't help either.
I am not sure, though, why your efforts with xpdf and tesseract did not work out. Using ImageMagick's convert to create PNG files in a temporary directory, and then running tesseract on them, you can do:
import os
from pathlib import Path
from tempfile import TemporaryDirectory
import subprocess
# Resolution in dots per inch, handed to `convert -density` when rasterising
# the PDF and to `tesseract --dpi` when recognising each page image.
DPI = 600
def call(*args):
    """Run an external command and return everything it printed, as text.

    Each positional argument is converted with ``str()``, so numbers and
    ``Path`` objects can be passed directly. Standard error is merged into
    standard output, and the combined bytes are decoded as UTF-8.

    Raises:
        subprocess.CalledProcessError: if the command exits with a non-zero
            status.
    """
    argv = list(map(str, args))
    raw_output = subprocess.check_output(argv, stderr=subprocess.STDOUT)
    return raw_output.decode('utf-8')
def ocr(docpath, lang):
    """OCR every page of a PDF and return the recognised text, one string per page.

    The PDF is rasterised to PNG images with ImageMagick's ``convert`` at
    ``DPI`` dots per inch, and each image is then passed to ``tesseract``.
    All intermediate files live in a temporary directory that is removed
    before the function returns.

    Args:
        docpath: Path to the PDF file; a leading ``~`` is expanded.
        lang: Language code passed to tesseract's ``-l`` option.

    Returns:
        A list of strings in page order, one per page image.

    Raises:
        Whatever ``call()`` raises when ``convert`` or ``tesseract`` fails.
    """
    pdf_path = Path(docpath).expanduser().resolve()
    pages = []
    with TemporaryDirectory() as tmpdir:
        # Work with absolute paths inside the temporary directory rather
        # than os.chdir() into it: the process-wide working directory is
        # never changed, so nothing needs restoring if a command fails, and
        # the directory is not "in use" when it gets cleaned up.
        workdir = Path(tmpdir)
        call('convert', '-density', DPI, pdf_path, workdir / 'out.png')

        single_page = workdir / 'out.png'
        if single_page.exists():
            # For a one-page PDF convert writes plain out.png instead of
            # out-0.png, which the numbered scan below would miss.
            images = [single_page]
        else:
            images = []
            index = 0
            while True:
                # names have no leading zeros on the digits, would be
                # difficult to sort glob() output, so just count them
                png = workdir / f'out-{index}.png'
                if not png.exists():
                    break
                images.append(png)
                index += 1

        # tesseract appends ".txt" to the output base name it is given.
        out_base = workdir / 'out'
        out_txt = workdir / 'out.txt'
        for png in images:
            call('tesseract', '--dpi', DPI, png, out_base, '-l', lang)
            # Decode explicitly as UTF-8 so accented characters survive on
            # platforms whose default encoding is something else.
            pages.append(out_txt.read_text(encoding='utf-8'))
    return pages
# OCR the sample document, passing 'por' as the tesseract language code.
pages = ocr('~/Downloads/document.pdf', 'por')
# Show only a three-line excerpt (lines 21-23, counting from 0) of the
# second page.
second_page_lines = pages[1].splitlines()
excerpt = second_page_lines[21:24]
print('\n'.join(excerpt))
which gives:
DA NÃO REALIZAÇÃO DE AUDIÊNCIA DE AUTOCOMPOSIÇÃO NO CASO EM CONCRETO
Com vista a obter maior celeridade processual, assim como da impossibilidade de conciliação entre
If you are on Windows, make sure your PDF file is not open in a different process (like a PDF viewer), as Windows doesn't seem to like that.
The final print is limited to a few lines, as the full output is quite large.
The conversion and OCR take a while, so you might want to add a print of the command in call() to get some sense of progress.
Related Topics
How to Use PHPexcel to Read Data and Insert into Database
How to Set the Default Value of a Timestamp Column to the Current Timestamp With Laravel Migrations
How to Install Composer PHP Packages Without Composer
How to Redirect Domain According to Country Ip Address
Download Files in Laravel Using Response::Download
Generating Cryptographically Secure Tokens
How to Validate Google Recaptcha V3 on Server Side
Call to Undefined Function Oci_Connect()
PHP 5 Strpos() Difference Between Returning 0 and False
Return Index of Highest Value in an Array
How to Check If a Word Is Contained in Another String Using PHP