Recognize PDF Table Using R

Read PDF table into R where rows have varying numbers of lines

I would suggest you to use tabulizer. It is better to extract tables from pdf files. Here the code for the file you shared:

library(tabulizer)
lst <- extract_tables(file = '8-31-2020 Paragraph IV Update_0.pdf')
#Format
renames <- function(x)
{
colnames(x) <- x[1,]
x <- x[2:dim(x)[1],,drop=F]
return(as.data.frame(x))
}
#Apply
lst21 <- lapply(lst,renames)
#Bind all
df <- do.call(rbind,lst21)

Output (some rows):

head(df)

DRUG NAME DOSAGE FORM STRENGTH
1 Abacavir Sulfate Tablets 300 mg
2 Abacavir Oral Solution 20 mg/mL
3 Abacavir Sulfate, Dolutegravir\rand Lamivudine Tablets 600 mg/50 mg/300\rmg
4 Abacavir Sulfate and\rLamivudine Tablets 600 mg/300 mg
5 Abacavir Sulfate, Lamivudine\rand Zidovudine Tablets 300 mg/150 mg/300\rmg
6 Abiraterone Acetate Tablets 125 mg
RLD/NDA DATE OF\rSUBMISSION NUMBER OF\rANDAs\rSUBMITTED 180-DAY\rSTATUS
1 Ziagen\r20977 1/28/2009 1 Eligible
2 Ziagen\r20978 12/27/2012 1 Eligible
3 Triumeq\r205551 8/14/2017 5
4 Epzicom\r21652 9/27/2007 1 Eligible
5 Trizivir\r21205 3/22/2011 1 Eligible
6 Yonsa\r210308 7/23/2018 1
180-DAY\rDECISION\rPOSTING\rDATE DATE OF\rFIRST\rAPPLICANT\rAPPROVAL
1 2/11/2020 6/18/2012
2 2/11/2020 9/26/2016
3
4 2/11/2020 9/29/2016
5 2/11/2020 12/5/2013
6
DATE OF FIRST\rCOMMERCIAL\rMARKETING BY\rFTF EXPIRATION\rDATE OF LAST\rQUALIFYING\rPATENT
1 6/19/2012 5/14/2018
2 9/15/2017 5/14/2018
3 12/8/2029
4 9/29/2016 5/14/2018
5 12/17/2013 5/14/2018
6 3/17/2034

Extracting tables from pdf in R

This is a data prep and wrangling problem, and not a parsing issue in my experience, as the parsing algorithms of tabulizer don't offer much leeway apart from changing between methods, in this case. From what I can see when I try to extract your tables its not only the table of page No. 7 that is incorrectly parsed. Every page is parsed differently but all the data seem to be retained. I can see that your first table has 13 columns, second 17, 3rd 12, 4th 10 and the last three 11 columns. What i would propose to do instead is to parse each page individually and perform data cleaning according to your desired output on each of them and then bind them together. This is a lengthy process and very specific to each table parsed so i will only provide an example script:

library(dplyr)
library(tidyr)
library(tabulizer)
# I create a dummy list to iterate through all the pages and push a data.frame in
result <- list()
for (i in 15:21){
out <- as.data.frame(extract_tables("mydocument.pdf", page = i, method = 'stream'), stringsAsFactors = FALSE)
result[[i]] <- out
}
# Remove excess list items -
# there is probably a better way to do this from within the for loop
result <- result[-(1:14)]

## ------- DATA CLEANING OPERATIONS examples:
# Remove top 3x lines from the first page of table1 not part of data
result[[1]] <- result[[1]][-(1:3),]
# Perform data cleaning operations such as split/ merge columns according to your liking
# for instance if you want to split column X1 into 4 (as in your original post), you can do that by splitting by whitespace
result[[1]] <- separate(result[[1]], 1, into = c('X1.1','X1.2','X1.3', 'X1.4'),sep = ' ', remove = TRUE)

## ---- After data cleaning operations:
# Bind all dataframes (they should have equal number of columns by now into one and make sure the colnames match as well)
df <-bind_rows(result)
# Write your output csv file
write.csv(df, 'yourfilename.csv')

Also you might wanna take a look at the different parsing methods of tabulizer (I have set it at 'stream' here since this by my experience usually yields the best results, but maybe 'lattice' would work better for some of the tables).

How to read table-headers from a PDF-table with R Tidyverse?

To be honest I believe that when it comes to the use of tidyverse, many things are a matter of taste, sure there are best practices, and intended purposes, but the preferences of a developer plays a big role.

Here's for example the main things that I would change, not because they are better, just because I'm more comfortable this way:

col_names <- c("Country", "2010", "2011", "2012", "2013", "2014", "2015", "2016")

to_numeric <- function(x){as.numeric(str_replace(x, pattern = ",", replacement = ""))}
not_factor <- function(x){!is.factor(x)}

animated.plot.beer_production_2010_2016 <-
# remove first row and data from non-EU contries and totals
beer_production_2010_2016.df %>%
#~~~~~~~~~ here are the stuff I changed ~~~~~~~~~
# give the columns the names you want
`names<-`(col_names) %>%
slice(2:29) %>%
# set country as factor
dplyr::mutate(Country = as.factor(Country)) %>%
# change the rest to numerics
dplyr::mutate_if(not_factor, to_numeric) %>%
#~~~~~~~~~~~~~~~~~~~~~ end ~~~~~~~~~~~~~~~~~~~~~~~
# convert from wide to long
tidyr::gather(key = "year", value = "production", "2010":"2016") %>%
# keep the top 15 countries for each year. Add utility-columns with display labels for the plot.
group_by(year) %>%
mutate(rank = rank(-production),
Value_rel = production / production[rank == 1],
Value_lbl = paste0(" ", round(production, digits = 1), " x 1000 hl")) %>%
# group_by(Country) %>% # ~~~~~~~~~~~~~~~~~ are you sure this is necessary?
filter(rank <= 15) %>%
# ungroup() %>% # ~~~~~~~~~~~~~~~~~ are you sure this is necessary?
# create the plot
ggplot(aes(x = rank,
group = Country,
fill = Country,
color = Country)) +
geom_tile(aes(y = production / 2,
height = production,
width = 0.9), alpha = 0.8, color = NA) +
geom_text(aes(y = 0, label = paste(Country, " ")), vjust = 0.2, hjust = 1) +
geom_text(aes(y = production, label = Value_lbl, hjust = 0)) +
coord_flip(clip = "off", expand = FALSE) +
scale_x_reverse() +
guides(color = FALSE, fill = FALSE) +
theme_void() +
theme(legend.position = "none",
panel.grid.major.x = element_line( size = .1, color = "grey" ),
panel.grid.minor.x = element_line( size = .1, color = "grey" ),
plot.title = element_text(size = 25, hjust = 0.5, face = "bold", vjust = -1),
plot.subtitle = element_text(size = 18, hjust = 0.5, face = "italic"),
plot.margin = margin(2,2, 2, 4, "cm")) +
# animate the plot (with dynamic title that includes the year)
gganimate::transition_states(year, transition_length = 4, state_length = 1) +
gganimate::view_follow(fixed_x = TRUE) +
ggplot2::labs(title = 'European beer production per year : {closest_state}',
subtitle = "Top 15 Countries",
caption = "Data Source: The Brewers of Europe")

Note that if you pass to the function tabulizer::extract_tables the parameter output='data.frame', you would get the first row as a header, but you would still have to remove the total rows and the countries you don't want

Scraping a Table from a PDF File

You could use tabulizer::extract_tables(). strsplit "weird-spaced" columns and cbind the snippets.

link <- "my.pdf"

library(tabulizer)
ext <- el(extract_tables(link, encoding="UTF-8"))

res <- cbind(ext[, 1:5], do.call(rbind, strsplit(ext[, 6], " ")),
ext[, 7:12])[, -c(2, 8)]
# store information for dim. names
dim.nm <- list(res[1:20, 1],
c("insg", "uns", "lehrl", "arb",
"ang", "beam", "VB", "sonst",
"pens", "beam.ir"))
# I would divide in a lists here
res <- list(insg=res[1:20, -1],
mann=res[22:41, -1],
frau=res[43:62, -1])
# convert to numbers (using gsub() to get rid of separators)
res <- Map(function(x) apply(x, 2, function(i) as.numeric(gsub("\\D", "", i))), res)
res <- lapply(res, `dimnames<-`, dim.nm)
head(res$insg)
# insg uns lehrl arb ang beam VB sonst pens beam.ir
# 0 b. unter 2 564855 356279 13019 191169 128236 38 14135 9682 208190 386
# 2 b. unter 4 300245 205375 31056 96882 68185 18 7032 2202 94062 808
# 4 b. unter 6 279717 167463 10312 78783 69751 33 6886 1698 111252 1002
# 6 b. unter 8 247614 140412 22926 62535 47390 88 6554 919 105818 1384
# 8 b. unter 10 268805 144298 21682 66518 48945 140 6278 735 123181 1326
# 10 b. unter 12 393303 144576 18387 65387 51705 177 8500 420 245470 3257

Scraping PDF tables with empty Cells

Try using read.fwf as a fixed-width file.

data <- pdf_file[1]
data <- trimws(data)
data <- strsplit(data, "\r\n")
data <- data[[1]]
writeLines(data, 'temp.txt')
result <- read.fwf('temp.txt', c(11, 2, rep(8, 8)), skip = 1, strip.white = TRUE)
names(result) <- scan(text = readLines('temp.txt', n = 1), what = character())
result

# rows col1 col2 col3 col4 col5 col6 col7 col8 col9
#1 row1 1 1 1 1 1 1 1 1 1
#2 row2 2 2 2 2 2 2 2 NA 2
#3 row3 NA 3 NA 3 3 NA 3 3 3
#4 row4 4 4 4 4 4 NA 4 4 4
#5 row5 5 NA 5 5 5 5 5 5 5
#6 row6 6 NA 6 6 6 6 NA 6 6
#7 row7 7 7 7 7 7 7 7 7 7
#8 row8 8 8 8 NA 8 NA 8 NA 8
#9 row9 9 9 9 9 9 9 9 9 9
#10 row10 10 NA 10 10 10 10 10 10 10


Related Topics



Leave a reply



Submit