Extracting Data from HTML Table

Extracting data from HTML table

A Python solution using BeautifulSoup4 (Edit: with proper skipping. Edit3: Using class="details" to select the table):

from bs4 import BeautifulSoup

html = """
<table class="details" border="0" cellpadding="5" cellspacing="2" width="95%">
<tr valign="top">
<th>Tests</th>
<th>Failures</th>
<th>Success Rate</th>
<th>Average Time</th>
<th>Min Time</th>
<th>Max Time</th>
</tr>
<tr valign="top" class="Failure">
<td>103</td>
<td>24</td>
<td>76.70%</td>
<td>71 ms</td>
<td>0 ms</td>
<td>829 ms</td>
</tr>
</table>"""

# Name the parser explicitly so the result does not depend on which
# optional parsers happen to be installed.
soup = BeautifulSoup(html, "html.parser")
table = soup.find("table", attrs={"class": "details"})

# The first tr contains the field names.
headings = [th.get_text() for th in table.find("tr").find_all("th")]

# Pair each data row's cells with the headings.
# list(...) is required on Python 3, where zip returns a lazy,
# single-use iterator that would print as "<zip object ...>".
datasets = []
for row in table.find_all("tr")[1:]:
    dataset = list(zip(headings, (td.get_text() for td in row.find_all("td"))))
    datasets.append(dataset)

print(datasets)

The result looks like this:

[[(u'Tests', u'103'),
(u'Failures', u'24'),
(u'Success Rate', u'76.70%'),
(u'Average Time', u'71 ms'),
(u'Min Time', u'0 ms'),
(u'Max Time', u'829 ms')]]

Edit2: To produce the desired output, use something like this:

# Print each (heading, value) pair, left-padding the heading to 16
# characters so the values line up in a column.
for dataset in datasets:
    for field in dataset:
        print("{0:<16}: {1}".format(field[0], field[1]))

Result:

Tests           : 103
Failures        : 24
Success Rate    : 76.70%
Average Time    : 71 ms
Min Time        : 0 ms
Max Time        : 829 ms

Extract Table Information from HTML (As Text File)

Take HTML data from the file and export a separate csv.

import csv
from simplified_scrapy import SimplifiedDoc, req, utils

name = 'test.html'
html = utils.getFileContent(name)  # Get data from file
doc = SimplifiedDoc(html)

# Collect every row of every matching table into one flat list.
rows = []
tables = doc.selects('table.region-table')
for table in tables:
    trs = table.tbody.trs
    for tr in trs:
        rows.append([td.text for td in tr.tds])

# newline='' stops the csv module from emitting blank lines on Windows.
with open(name + '.csv', 'w', encoding='utf-8', newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerows(rows)

If you want to keep one file per table

doc = SimplifiedDoc(html)
tables = doc.selects('table.region-table')
# enumerate replaces the hand-maintained counter; start=1 keeps the
# original file names (test.html1.csv, test.html2.csv, ...).
for i, table in enumerate(tables, start=1):
    rows = []
    trs = table.tbody.trs
    for tr in trs:
        rows.append([td.text for td in tr.tds])
    # newline='' stops the csv module from emitting blank lines on Windows.
    with open(name + str(i) + '.csv', 'w', encoding='utf-8', newline='') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerows(rows)

Keep the original one for comparison.

import csv
from simplified_scrapy import SimplifiedDoc, req

html = ''''''  # Your HTML
doc = SimplifiedDoc(html)

rows = []
tables = doc.selects('table.region-table')
for table in tables:
    trs = table.tbody.trs
    for tr in trs:
        rows.append([td.text for td in tr.tds])

# If you have '>Region.*?</a>' in each row, you can get all the rows
# directly in the following way:
# trs = doc.getElementsByReg('>Region.*?</a>', tag='tr')
# for tr in trs:
#     rows.append([td.text for td in tr.tds])

# newline='' stops the csv module from emitting blank lines on Windows.
with open('test.csv', 'w', encoding='utf-8', newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerows(rows)

Result:

Region 1.1,NRPS-like,"21,469","62,957",phthoxazolin,NRP + Polyketide,4%
Region 1.2,NRPS,"74,163","124,963",nystatin,Polyketide,10%
Region 2.1,terpene,"3,800","23,263",ebelactone,Polyketide,5%
Region 2.2,NRPS-like,"55,320","97,088",indigoidine,Saccharide,17%
Region 2.3,NRPS,"144,740","193,599",streptobactin,NRP,70%
Region 2.4,siderophore,"347,862","362,833",ficellomycin,NRP,3%
Region 2.5,lassopeptide,"548,017","570,561",ikarugamycin,NRP + Polyketide:Iterative type I,12%
Region 2.6,NRPS,"628,834","683,050",himastatin,NRP,12%
Region 2.7,"NRPS,terpene","1,043,511","1,104,786",nargenicin,Polyketide,11%

Extracting data from a table using javascript

Surely you can do this via JavaScript, although this is a rather broad question. You have to get the table element you need to manipulate (say, if it has an id of someId, you can use var table = document.getElementById('someId');) and then either manipulate its table.innerHTML (probably a good starting point) or its children using the DOM API. For example, for this page ("Parameter Values" table) you may try the following in the browser console:

// first table with the "w3-table-all notranslate" class
var table = document.getElementsByClassName("w3-table-all notranslate")[0];

// NOTE(review): children[0] resolves to a tbody here presumably because
// browsers auto-insert <tbody> when the markup omits it — confirm for
// the page you target, as a <thead> or <caption> would shift indices.
table
.children[0] // will get the tbody element
.children[1] // will get the second row from the tbody element
.children[0] // will get the first (colomn) cell in the second row
.innerHTML; // will show you html contents of the cell in the console

// change the cell contents
table.children[0].children[1].children[0].innerHTML = "<b>I've changed this stuff!</b>";

// you may also want to remember rows/cells:
var row = table.children[0].children[1];
var cell = row.children[0];

And basically that's it.

extracting data from an html table in p rather than table

I'm using BeautifulSoup to parse the requested HTML, looking at each p and br tag; the final result is a dataframe. Later you can export it to an Excel file. I hope this helps you.

from bs4 import BeautifulSoup
import requests
import pandas as pd

result = requests.get('http://www.linfo.org/acronym_list.html')
c = result.content
soup = BeautifulSoup(c, "html.parser")
samples = soup.find_all("p")

rows_list = []

for row in samples:
    tagstrong = row.find_all("strong")
    # The <br> tags belong to the paragraph, not to any one <strong>,
    # so find them once per paragraph instead of once per <strong>.
    tagbr = row.find_all("br")
    for x in tagstrong:
        for y in tagbr:
            # y.next is the text node immediately after the <br>.
            new_row = {'letter': x.get_text(), 'content': y.next}
            rows_list.append(new_row)

df1 = pd.DataFrame(rows_list)
print(df1.head(10))

this is the result :

Sample Image

Extracting HTML table data from email to csv file, 1st column values to row headers, using Python

Try this:

import win32com.client
import pandas as pd
from bs4 import BeautifulSoup
from pprint import pprint

outlook = win32com.client.Dispatch("Outlook.Application")
mapi = outlook.GetNamespace("MAPI")

inbox = mapi.Folders['User@email.com'].Folders['Inbox'].Folders['Subfolder Name']
Mail_Messages = inbox.Items

# a list where contents of each e-mail - CC, receiv.time and subject will have been put
contents = []
column_names = ['Report Name', 'Team Name', 'Project Name', 'Unique ID Number',
                'Due Date', 'ReceivedTime', 'CC', 'Subject']

for mail in Mail_Messages:
    body = mail.HTMLBody
    html_body = BeautifulSoup(body, "lxml")
    html_tables = html_body.find_all('table')

    # uncomment following lines if you want to have column names defined
    # programatically rather than hardcoded
    # column_names = pd.read_html(str(html_tables), header=None)[0][0]
    # column_names = column_names.tolist()
    # column_names.append("CC")
    # column_names.append("Received Time")
    # column_names.append("Subject")

    # a list containing a single e-mail data - html table, CC, receivedTime
    # and subject; column [1] of the first parsed table holds the values
    row = pd.read_html(str(html_tables), header=None)[0][1]
    row = row.tolist()
    row.append(mail.CC)
    row.append(mail.ReceivedTime.strftime('%Y-%m-%d %H:%M:%S'))
    row.append(mail.Subject)

    # appending each full row to a list
    contents.append(row)

# and finally converting a list into dataframe
df = pd.DataFrame(contents, columns=column_names)

pprint(df)

Extract all data from a dynamic HTML table

After a lot of testing, here is the answer :

 try:
last_row = driver.find_element_by_xpath(".//tr/*[contains(@id, '--TilesTable-rows-row19-col1')]")
last_row_old = driver.find_element_by_xpath(".//tr/*[contains(@id, '--TilesTable-rows-row19-col1')]").text
last_row.click()
last_row.send_keys(Keys.PAGE_DOWN)
time.sleep(2)
last_row_new = driver.find_element_by_xpath(".//tr/*[contains(@id, '--TilesTable-rows-row19-col1')]").text

while (last_row_new == last_row_old) is False:
table = driver.find_element_by_xpath("//*[contains(@id, '--TilesTable-table')]/tbody")
td_list = table.find_elements_by_xpath(".//tr/*[contains(@id, '-col1')]")
for td in td_list:
tile_title = td.text
sh_tile = wb["Tuiles"]
sh_tile.append([catalog, tile_title])
last_row = driver.find_element_by_xpath(".//tr/*[contains(@id, '--TilesTable-rows-row19-col1')]")
last_row_old = driver.find_element_by_xpath(".//tr/*[contains(@id, '--TilesTable-rows-row19-col1')]").text
last_row.click()
last_row.send_keys(Keys.PAGE_DOWN)
time.sleep(0.5)
last_row_new = driver.find_element_by_xpath(".//tr/*[contains(@id, '--TilesTable-rows-row19-col1')]").text
except selenium.common.exceptions.NoSuchElementException:
pass

extracting data from an HTML table using BeautifulSoup

Try this..

import requests
import pandas as pd

url = 'https://eresearch.fidelity.com/eresearch/markets_sectors/si_performance.jhtml'

industry = {'tab': 'industry'}
sector = {'tab': 'sector'}

r = requests.post(url, data=industry)
#soup = BeautifulSoup(r.content, 'html.parser')
#sectors = soup.find("table", id="perfTableSort")

# pd.read_html parses every <table> in the response into a list of
# DataFrames; the performance table is the first one.
df_list = pd.read_html(r.text)
df = df_list[0]
df.head()

Now you can put data=industry or data=sector to get desired result..



Related Topics



Leave a reply



Submit