Extracting Data from HTML Table

Extracting data from HTML table

A Python solution using BeautifulSoup4 (Edit: with proper skipping. Edit3: Using class="details" to select the table):

from bs4 import BeautifulSoup

html = """
<table class="details" border="0" cellpadding="5" cellspacing="2" width="95%">
<tr valign="top">
<th>Tests</th>
<th>Failures</th>
<th>Success Rate</th>
<th>Average Time</th>
<th>Min Time</th>
<th>Max Time</th>
</tr>
<tr valign="top" class="Failure">
<td>103</td>
<td>24</td>
<td>76.70%</td>
<td>71 ms</td>
<td>0 ms</td>
<td>829 ms</td>
</tr>
</table>"""

# Name the parser explicitly so the result does not depend on which
# optional parsers happen to be installed.
soup = BeautifulSoup(html, "html.parser")
table = soup.find("table", attrs={"class": "details"})

# The first tr contains the field names.
headings = [th.get_text() for th in table.find("tr").find_all("th")]

# Pair each data row's cells with the headings.
# list(...) is required on Python 3, where zip returns a lazy,
# single-use iterator that would print as "<zip object ...>".
datasets = []
for row in table.find_all("tr")[1:]:
    dataset = list(zip(headings, (td.get_text() for td in row.find_all("td"))))
    datasets.append(dataset)

print(datasets)

The result looks like this:

[[(u'Tests', u'103'),
(u'Failures', u'24'),
(u'Success Rate', u'76.70%'),
(u'Average Time', u'71 ms'),
(u'Min Time', u'0 ms'),
(u'Max Time', u'829 ms')]]

Edit2: To produce the desired output, use something like this:

# Print each (heading, value) pair, left-padding the heading to 16
# characters so the values line up in a column.
for dataset in datasets:
    for field in dataset:
        print("{0:<16}: {1}".format(field[0], field[1]))

Result:

Tests           : 103
Failures        : 24
Success Rate    : 76.70%
Average Time    : 71 ms
Min Time        : 0 ms
Max Time        : 829 ms

Extract Table Information from HTML (As Text File)

Take HTML data from the file and export a separate csv.

import csv
from simplified_scrapy import SimplifiedDoc, req, utils

name = 'test.html'
html = utils.getFileContent(name)  # Get data from file
doc = SimplifiedDoc(html)

# Collect every row of every matching table into one flat list.
rows = []
tables = doc.selects('table.region-table')
for table in tables:
    trs = table.tbody.trs
    for tr in trs:
        rows.append([td.text for td in tr.tds])

# newline='' stops the csv module from emitting blank lines on Windows.
with open(name + '.csv', 'w', encoding='utf-8', newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerows(rows)

If you want to keep one file per table

doc = SimplifiedDoc(html)
tables = doc.selects('table.region-table')
# enumerate replaces the hand-maintained counter; start=1 keeps the
# original file names (test.html1.csv, test.html2.csv, ...).
for i, table in enumerate(tables, start=1):
    rows = []
    trs = table.tbody.trs
    for tr in trs:
        rows.append([td.text for td in tr.tds])
    # newline='' stops the csv module from emitting blank lines on Windows.
    with open(name + str(i) + '.csv', 'w', encoding='utf-8', newline='') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerows(rows)

Keep the original one for comparison.

import csv
from simplified_scrapy import SimplifiedDoc, req

html = ''''''  # Your HTML
doc = SimplifiedDoc(html)

rows = []
tables = doc.selects('table.region-table')
for table in tables:
    trs = table.tbody.trs
    for tr in trs:
        rows.append([td.text for td in tr.tds])

# If you have '>Region.*?</a>' in each row, you can get all the rows
# directly in the following way:
# trs = doc.getElementsByReg('>Region.*?</a>', tag='tr')
# for tr in trs:
#     rows.append([td.text for td in tr.tds])

# newline='' stops the csv module from emitting blank lines on Windows.
with open('test.csv', 'w', encoding='utf-8', newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerows(rows)

Result:

Region 1.1,NRPS-like,"21,469","62,957",phthoxazolin,NRP + Polyketide,4%
Region 1.2,NRPS,"74,163","124,963",nystatin,Polyketide,10%
Region 2.1,terpene,"3,800","23,263",ebelactone,Polyketide,5%
Region 2.2,NRPS-like,"55,320","97,088",indigoidine,Saccharide,17%
Region 2.3,NRPS,"144,740","193,599",streptobactin,NRP,70%
Region 2.4,siderophore,"347,862","362,833",ficellomycin,NRP,3%
Region 2.5,lassopeptide,"548,017","570,561",ikarugamycin,NRP + Polyketide:Iterative type I,12%
Region 2.6,NRPS,"628,834","683,050",himastatin,NRP,12%
Region 2.7,"NRPS,terpene","1,043,511","1,104,786",nargenicin,Polyketide,11%

Extracting data from a table using javascript

Surely you can do this via JavaScript, although this is a rather broad question. You have to get the table element you need to manipulate (say, if it has an id of someId, you can use var table = document.getElementById('someId');) and then either manipulate its table.innerHTML (probably a good starting point) or its children using the DOM API. For example, for this page ("Parameter Values" table) you may try the following in the browser console:

// first table with the "w3-table-all notranslate" class
var table = document.getElementsByClassName("w3-table-all notranslate")[0];

// NOTE(review): children[0] resolves to a tbody here presumably because
// browsers auto-insert <tbody> when the markup omits it — confirm for
// the page you target, as a <thead> or <caption> would shift indices.
table
.children[0] // will get the tbody element
.children[1] // will get the second row from the tbody element
.children[0] // will get the first (colomn) cell in the second row
.innerHTML; // will show you html contents of the cell in the console

// change the cell contents
table.children[0].children[1].children[0].innerHTML = "<b>I've changed this stuff!</b>";

// you may also want to remember rows/cells:
var row = table.children[0].children[1];
var cell = row.children[0];

And basically that's it.

extracting data from an html table in p rather than table

I'm using BeautifulSoup to parse the requested HTML, looking at each p and br tag; the final result is a dataframe. Later you can export it to an Excel file. I hope this helps you.

from bs4 import BeautifulSoup
import requests
import pandas as pd

result = requests.get('http://www.linfo.org/acronym_list.html')
c = result.content
soup = BeautifulSoup(c, "html.parser")
samples = soup.find_all("p")

rows_list = []

for row in samples:
    tagstrong = row.find_all("strong")
    # The <br> tags belong to the paragraph, not to any one <strong>,
    # so find them once per paragraph instead of once per <strong>.
    tagbr = row.find_all("br")
    for x in tagstrong:
        for y in tagbr:
            # y.next is the text node immediately after the <br>.
            new_row = {'letter': x.get_text(), 'content': y.next}
            rows_list.append(new_row)

df1 = pd.DataFrame(rows_list)
print(df1.head(10))

this is the result :

Sample Image

Extracting HTML table data from email to csv file, 1st column values to row headers, using Python

Try this:

import win32com.client
import pandas as pd
from bs4 import BeautifulSoup
from pprint import pprint

outlook = win32com.client.Dispatch("Outlook.Application")
mapi = outlook.GetNamespace("MAPI")

inbox = mapi.Folders['User@email.com'].Folders['Inbox'].Folders['Subfolder Name']
Mail_Messages = inbox.Items

# a list where contents of each e-mail - CC, receiv.time and subject will have been put
contents = []
column_names = ['Report Name', 'Team Name', 'Project Name', 'Unique ID Number',
                'Due Date', 'ReceivedTime', 'CC', 'Subject']

for mail in Mail_Messages:
    body = mail.HTMLBody
    html_body = BeautifulSoup(body, "lxml")
    html_tables = html_body.find_all('table')

    # uncomment following lines if you want to have column names defined
    # programatically rather than hardcoded
    # column_names = pd.read_html(str(html_tables), header=None)[0][0]
    # column_names = column_names.tolist()
    # column_names.append("CC")
    # column_names.append("Received Time")
    # column_names.append("Subject")

    # a list containing a single e-mail data - html table, CC, receivedTime
    # and subject; column [1] of the first parsed table holds the values
    row = pd.read_html(str(html_tables), header=None)[0][1]
    row = row.tolist()
    row.append(mail.CC)
    row.append(mail.ReceivedTime.strftime('%Y-%m-%d %H:%M:%S'))
    row.append(mail.Subject)

    # appending each full row to a list
    contents.append(row)

# and finally converting a list into dataframe
df = pd.DataFrame(contents, columns=column_names)

pprint(df)

Extract all data from a dynamic HTML table

After a lot of testing, here is the answer :

 try:
last_row = driver.find_element_by_xpath(".//tr/*[contains(@id, '--TilesTable-rows-row19-col1')]")
last_row_old = driver.find_element_by_xpath(".//tr/*[contains(@id, '--TilesTable-rows-row19-col1')]").text
last_row.click()
last_row.send_keys(Keys.PAGE_DOWN)
time.sleep(2)
last_row_new = driver.find_element_by_xpath(".//tr/*[contains(@id, '--TilesTable-rows-row19-col1')]").text

while (last_row_new == last_row_old) is False:
table = driver.find_element_by_xpath("//*[contains(@id, '--TilesTable-table')]/tbody")
td_list = table.find_elements_by_xpath(".//tr/*[contains(@id, '-col1')]")
for td in td_list:
tile_title = td.text
sh_tile = wb["Tuiles"]
sh_tile.append([catalog, tile_title])
last_row = driver.find_element_by_xpath(".//tr/*[contains(@id, '--TilesTable-rows-row19-col1')]")
last_row_old = driver.find_element_by_xpath(".//tr/*[contains(@id, '--TilesTable-rows-row19-col1')]").text
last_row.click()
last_row.send_keys(Keys.PAGE_DOWN)
time.sleep(0.5)
last_row_new = driver.find_element_by_xpath(".//tr/*[contains(@id, '--TilesTable-rows-row19-col1')]").text
except selenium.common.exceptions.NoSuchElementException:
pass

extracting data from an HTML table using BeautifulSoup

Try this..

import requests
import pandas as pd

url = 'https://eresearch.fidelity.com/eresearch/markets_sectors/si_performance.jhtml'

industry = {'tab': 'industry'}
sector = {'tab': 'sector'}

r = requests.post(url, data=industry)
#soup = BeautifulSoup(r.content, 'html.parser')
#sectors = soup.find("table", id="perfTableSort")

# pd.read_html parses every <table> in the response into a list of
# DataFrames; the performance table is the first one.
df_list = pd.read_html(r.text)
df = df_list[0]
df.head()

Now you can put data=industry or data=sector to get desired result..



Related Topics



Leave a reply



Submit