How to Extract Text and Text Coordinates from a PDF File

How to extract text and text coordinates from a PDF file?

Full disclosure, I am one of the maintainers of pdfminer.six. It is a community-maintained version of pdfminer for python 3.

Nowadays, pdfminer.six has multiple API's to extract text and information from a PDF. For programmatically extracting information I would advice to use extract_pages(). This allows you to inspect all of the elements on a page, ordered in a meaningful hierarchy created by the layout algorithm.

The following example is a pythonic way of showing all the elements in the hierachy. It uses the simple1.pdf from the samples directory of pdfminer.six.

from pathlib import Path
from typing import Iterable, Any

from pdfminer.high_level import extract_pages

def show_ltitem_hierarchy(o: Any, depth=0):
    """Show location and text of LTItem and all its descendants"""
    if depth == 0:
        print('element                        x1  y1  x2  y2   text')
        print('------------------------------ --- --- --- ---- -----')

    print(
        f'{get_indented_name(o, depth):<30.30s} '
        f'{get_optional_bbox(o)} '
        f'{get_optional_text(o)}'
    )

    if isinstance(o, Iterable):
        for i in o:
            show_ltitem_hierarchy(i, depth=depth + 1)

def get_indented_name(o: Any, depth: int) -> str:
    """Indented name of LTItem"""
    return '  ' * depth + o.__class__.__name__

def get_optional_bbox(o: Any) -> str:
    """Bounding box of LTItem if available, otherwise empty string"""
    if hasattr(o, 'bbox'):
        return ''.join(f'{i:<4.0f}' for i in o.bbox)
    return ''

def get_optional_text(o: Any) -> str:
    """Text of LTItem if available, otherwise empty string"""
    if hasattr(o, 'get_text'):
        return o.get_text().strip()
    return ''

path = Path('~/Downloads/simple1.pdf').expanduser()

pages = extract_pages(path)
show_ltitem_hierarchy(pages)

The output shows the different elements in the hierarchy. The bounding box for each. And the text that this element contains.

element                        x1  y1  x2  y2   text
------------------------------ --- --- --- ---- -----
generator                       
  LTPage                       0   0   612 792  
    LTTextBoxHorizontal        100 695 161 719  Hello
      LTTextLineHorizontal     100 695 161 719  Hello
        LTChar                 100 695 117 719  H
        LTChar                 117 695 131 719  e
        LTChar                 131 695 136 719  l
        LTChar                 136 695 141 719  l
        LTChar                 141 695 155 719  o
        LTChar                 155 695 161 719  
        LTAnno                  
    LTTextBoxHorizontal        261 695 324 719  World
      LTTextLineHorizontal     261 695 324 719  World
        LTChar                 261 695 284 719  W
        LTChar                 284 695 297 719  o
        LTChar                 297 695 305 719  r
        LTChar                 305 695 311 719  l
        LTChar                 311 695 324 719  d
        LTAnno                  
    LTTextBoxHorizontal        100 595 161 619  Hello
      LTTextLineHorizontal     100 595 161 619  Hello
        LTChar                 100 595 117 619  H
        LTChar                 117 595 131 619  e
        LTChar                 131 595 136 619  l
        LTChar                 136 595 141 619  l
        LTChar                 141 595 155 619  o
        LTChar                 155 595 161 619  
        LTAnno                  
    LTTextBoxHorizontal        261 595 324 619  World
      LTTextLineHorizontal     261 595 324 619  World
        LTChar                 261 595 284 619  W
        LTChar                 284 595 297 619  o
        LTChar                 297 595 305 619  r
        LTChar                 305 595 311 619  l
        LTChar                 311 595 324 619  d
        LTAnno                  
    LTTextBoxHorizontal        100 495 211 519  H e l l o
      LTTextLineHorizontal     100 495 211 519  H e l l o
        LTChar                 100 495 117 519  H
        LTAnno                  
        LTChar                 127 495 141 519  e
        LTAnno                  
        LTChar                 151 495 156 519  l
        LTAnno                  
        LTChar                 166 495 171 519  l
        LTAnno                  
        LTChar                 181 495 195 519  o
        LTAnno                  
        LTChar                 205 495 211 519  
        LTAnno                  
    LTTextBoxHorizontal        321 495 424 519  W o r l d
      LTTextLineHorizontal     321 495 424 519  W o r l d
        LTChar                 321 495 344 519  W
        LTAnno                  
        LTChar                 354 495 367 519  o
        LTAnno                  
        LTChar                 377 495 385 519  r
        LTAnno                  
        LTChar                 395 495 401 519  l
        LTAnno                  
        LTChar                 411 495 424 519  d
        LTAnno                  
    LTTextBoxHorizontal        100 395 211 419  H e l l o
      LTTextLineHorizontal     100 395 211 419  H e l l o
        LTChar                 100 395 117 419  H
        LTAnno                  
        LTChar                 127 395 141 419  e
        LTAnno                  
        LTChar                 151 395 156 419  l
        LTAnno                  
        LTChar                 166 395 171 419  l
        LTAnno                  
        LTChar                 181 395 195 419  o
        LTAnno                  
        LTChar                 205 395 211 419  
        LTAnno                  
    LTTextBoxHorizontal        321 395 424 419  W o r l d
      LTTextLineHorizontal     321 395 424 419  W o r l d
        LTChar                 321 395 344 419  W
        LTAnno                  
        LTChar                 354 395 367 419  o
        LTAnno                  
        LTChar                 377 395 385 419  r
        LTAnno                  
        LTChar                 395 395 401 419  l
        LTAnno                  
        LTChar                 410 395 424 419  d
        LTAnno

(Similar answer
here,
here and
here
, I'll try to keep them in sync.)

How can I extract text fragments from PDF with their coordinates in Python?

I've used PyMuPDF to extract page content as a list of single words with bbox information.

import fitz

doc = fitz.open("PDF-export-example.pdf")

for page in doc:
    wlist = page.getTextWords()  # make the word list
    print(wlist)

Output:

[
    (72.0250015258789, 72.119873046875, 114.96617889404297, 106.299560546875, 'Test', 0, 0, 0),
    (120.26901245117188, 72.119873046875, 231.72618103027344, 106.299560546875, 'document', 0, 0, 1),
    (72.0250015258789, 106.21942138671875, 101.52294921875, 120.18524169921875, 'Lorem', 1, 0, 0),
    (103.98699951171875, 106.21942138671875, 132.00445556640625, 120.18524169921875, 'ipsum', 1, 0, 1),
    (134.45799255371094, 106.21942138671875, 159.06637573242188, 120.18524169921875, 'dolor', 1, 0, 2),
    (161.40098571777344, 106.21942138671875, 171.95208740234375, 120.18524169921875, 'sit', 1, 0, 3),
    ...
]

`page.getTextWords()`

method separates a page’s text into “words” using spaces and line
breaks as delimiters. Therefore the words in this lists contain no
spaces or line breaks.
Return type: list

An item of this list looks like this:

(x0, y0, x1, y1, "word", block_no, line_no, word_no)

Where the first 4 items are the float coordinates of the words’s bbox. The last three integers provide some more information on the word’s whereabouts.

A Note on the Name fitz
The standard Python import statement for PyMuPDF library is import fitz. This has a historical reason:

The original rendering library for MuPDF was called Libart.

After Artifex Software acquired the MuPDF project, the development focus shifted on writing a new modern graphics library called Fitz. Fitz was originally intended as an R&D project to replace the aging Ghostscript graphics library, but has instead become the rendering engine powering MuPDF.

How to extract text from PDF and get translation between coordinates in page and string position?

How to reverse PDF Word positions is not easy so lets use a basic example
Sample Image

%PDF-1.4
1 0 obj <</Type /Catalog /Pages 2 0 R>>
endobj
2 0 obj <</Type /Pages /Kids [3 0 R] /Count 1>>
endobj
3 0 obj<</Type /Page /Parent 2 0 R /Resources 4 0 R /MediaBox [0 0 500 800] /Contents 6 0 R>>
endobj
4 0 obj<</Font <</F1 5 0 R>>>>
endobj
5 0 obj<</Type /Font /Subtype /Type1 /BaseFont /Helvetica>>
endobj
6 0 obj
<</Length 44>>
stream
BT /F1 24 Tf 175 720 Td (Hello World!)Tj ET
endstream
endobj
xref
0 7
0000000000 65535 f
0000000009 00000 n
0000000056 00000 n
0000000111 00000 n
0000000212 00000 n
0000000250 00000 n
0000000317 00000 n
trailer <</Size 7/Root 1 0 R>>
startxref
406
%%EOF

So here it is very clear the Helvetica string is at x=175 y=720 (i.e. near the top of a default page) but the page is 800 units high not a more natural 842 pt so first problem is what do you mean by co-ordinates and what projection/transformation is at play?

So we can easily say the x value for World! will usually be positive from the left edge, but the origin could be top right of that media thus World! would be negative both in x and y.

For everyday PDF pages we work with default origin for text as bottom left corner of crop box or media box, unless we see stated otherwise. Likewise for images their origin is normally calculated from Top Left corner, but that may be outside their crop boundary. Libraries will help give you relative values, which with luck are simplified to one common origin and scale, with little conflict due to transformation.

In this case we could expect for bottom left of World! x = 236 & y = 80 (800-720).

However a simple HTML conversion may use top:94px;left:263px for the two words together.

<body bgcolor="#A0A0A0" vlink="blue" link="blue">
<div id="page1-div" style="position:relative;width:750px;height:1200px;">
<img width="750" height="1200" src="hello001.png" alt="background image"/>
<p style="position:absolute;top:94px;left:263px;white-space:nowrap" class="ft00">Hello World!</p>
</div>
</body>

Sample Image

If you need a precise position for text then a Printer Trace of components gives the most accurate printout answer

mutool trace hello.pdf

<fill_text colorspace="DeviceGray" color="0" transform="1 0 0 -1 0 800">
    <span font="Helvetica" wmode="0" bidi="0" trm="24 0 0 24">
        <g unicode="H" glyph="H" x="175" y="720" adv=".722"/>
        <g unicode="e" glyph="e" x="192.328" y="720" adv=".556"/>
        <g unicode="l" glyph="l" x="205.672" y="720" adv=".222"/>
        <g unicode="l" glyph="l" x="211" y="720" adv=".222"/>
        <g unicode="o" glyph="o" x="216.328" y="720" adv=".556"/>
        <g unicode=" " glyph="space" x="229.672" y="720" adv=".278"/>
        <g unicode="W" glyph="W" x="236.344" y="720" adv=".944"/>
        <g unicode="o" glyph="o" x="259" y="720" adv=".556"/>
        <g unicode="r" glyph="r" x="272.344" y="720" adv=".333"/>
        <g unicode="l" glyph="l" x="280.336" y="720" adv=".222"/>
        <g unicode="d" glyph="d" x="285.664" y="720" adv=".556"/>
        <g unicode="!" glyph="exclam" x="299.008" y="720" adv=".278"/>
    </span>
</fill_text>

So W of World starts at x="236.344" y="720"

We can also calculate the Width of that Wor[l]d either by adding each advance

Total Advance for World = 2.611 units which transformed by 24 = 62.664 wide = 22.10647 mm

or we can do it by simpler subtraction since we know ! is at 299.008 so -236.344 also = 62.664

Find text position in PDF file

PyMuPDF can find text by coordinates. You can use this in conjunction with the PyPDF2 highlighting method to accomplish what you're describing. Or you can just use PyMuPDF to highlight the text.

Here is sample code for finding text and highlighting with PyMuPDF:

import fitz

### READ IN PDF
doc = fitz.open("input.pdf")

for page in doc:
    ### SEARCH
    text = "Sample text"
    text_instances = page.search_for(text)

    ### HIGHLIGHT
    for inst in text_instances:
        highlight = page.add_highlight_annot(inst)
        highlight.update()

### OUTPUT
doc.save("output.pdf", garbage=4, deflate=True, clean=True)

Python PDF Mining Get Position of Text on Every Line

Found it: The solution is to recurse even when there is a TextBox, until a textline is found. The class below should provide the x and y coordinates of every line of text on a pdf when the parsepdf method is called.

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
import pdfminer

class pdfPositionHandling:

    def parse_obj(self, lt_objs):

        # loop over the object list
        for obj in lt_objs:

            if isinstance(obj, pdfminer.layout.LTTextLine):
                print "%6d, %6d, %s" % (obj.bbox[0], obj.bbox[1], obj.get_text().replace('\n', '_'))

            # if it's a textbox, also recurse
            if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
                self.parse_obj(obj._objs)

            # if it's a container, recurse
            elif isinstance(obj, pdfminer.layout.LTFigure):
                self.parse_obj(obj._objs)

    def parsepdf(self, filename, startpage, endpage):

        # Open a PDF file.
        fp = open(filename, 'rb')

        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)

        # Create a PDF document object that stores the document structure.
        # Password for initialization as 2nd parameter
        document = PDFDocument(parser)

        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()

        # Create a PDF device object.
        device = PDFDevice(rsrcmgr)

        # BEGIN LAYOUT ANALYSIS
        # Set parameters for analysis.
        laparams = LAParams()

        # Create a PDF page aggregator object.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)

            # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        i = 0
        # loop over all pages in the document
        for page in PDFPage.create_pages(document):
            if i >= startpage and i <= endpage:
                # read the page into a layout object
                interpreter.process_page(page)
                layout = device.get_result()

                # extract text from this object
                self.parse_obj(layout._objs)
            i += 1

How to Extract Text and Text Coordinates from a PDF File