How to Convert an HTML Table to CSV

HTML Table to CSV

As commented, converting to XML imposes strict rules: when the HTML neglects to write a closing </tr> tag, loading it as XML will fail. The same goes for <img> tags, which have no closing </img> tag.
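To illustrate (a minimal sketch, not taken from your code): even a perfectly ordinary table snippet blows up when you force it through the strict XML parser.

# browsers render this fine, but the <tr> is never closed and <img> has no end tag
$html = '<table><tr><td><img src="logo.png"></td></table>'

try {
    $doc = [xml]$html    # the strict XML cast throws on the unclosed tags
}
catch {
    Write-Warning "Loading as XML failed: $($_.Exception.Message)"
}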

I do not have the full HTML you are loading, but perhaps try the function below instead:

function ConvertFrom_HtmlTable {
    # adapted from: https://www.leeholmes.com/blog/2015/01/05/extracting-tables-from-powershells-invoke-webrequest/
    [CmdletBinding(DefaultParameterSetName = 'ByIndex')]
    param(
        [Parameter(Mandatory = $true, Position = 0)]
        [Microsoft.PowerShell.Commands.HtmlWebResponseObject]$WebRequest,

        [Parameter(Mandatory = $false, Position = 1, ParameterSetName = 'ByIndex')]
        [int]$TableIndex = 0,

        [Parameter(Mandatory = $false, Position = 1, ParameterSetName = 'ById')]
        [string]$TableId,

        [Parameter(Mandatory = $false, Position = 1, ParameterSetName = 'ByName')]
        [string]$TableName,

        [Parameter(Mandatory = $false, Position = 1, ParameterSetName = 'ByClass')]
        [string]$TableClassName
    )

    # Extract the table out of the web request
    switch ($PSCmdlet.ParameterSetName) {
        'ById'    { $table = $WebRequest.ParsedHtml.getElementByID($TableId) }
        'ByIndex' { $table = @($WebRequest.ParsedHtml.getElementsByTagName('table'))[$TableIndex] }
        'ByName'  { $table = @($WebRequest.ParsedHtml.getElementsByName($TableName))[0] }
        'ByClass' { $table = @($WebRequest.ParsedHtml.getElementsByClassName($TableClassName))[0] }
    }
    if (!$table) {
        Write-Warning "Could not find the given table."
        return $null
    }

    # load the System.Web assembly to be able to decode HTML entities
    Add-Type -AssemblyName System.Web

    $headers = @()
    # Go through all of the rows in the table
    foreach ($row in $table.Rows) {
        $cells = @($row.Cells)
        # If there is a table header, remember its titles
        if ($cells[0].tagName -eq "TH") {
            $i = 0
            $headers = @($cells | ForEach-Object {
                $i++
                # decode HTML entities and double-up quotes that the value may contain
                $th = ([System.Web.HttpUtility]::HtmlDecode($_.InnerText) -replace '"', '""').Trim()
                # if the table header is empty, create it
                if ([string]::IsNullOrEmpty($th)) { "H$i" } else { $th }
            })
            # proceed with the next row
            continue
        }
        # if we haven't found any table headers, make up names "H1", "H2", etc.
        if (-not $headers) {
            $headers = @(1..($cells.Count + 2) | ForEach-Object { "H$_" })
        }

        # Now go through the cells in the row. For each, try to find the
        # title that represents that column and create a hashtable mapping those
        # titles to content
        $hash = [Ordered]@{}
        for ($i = 0; $i -lt $cells.Count; $i++) {
            # decode HTML entities and double-up quotes that the value may contain
            $value = ([System.Web.HttpUtility]::HtmlDecode($cells[$i].InnerText) -replace '"', '""').Trim()
            $th = $headers[$i]
            $hash[$th] = $value
        }
        # And finally cast that hashtable to a PSCustomObject
        [PSCustomObject]$hash
    }
}

Call it like this:

$request = Invoke-WebRequest $uri
$table = ConvertFrom_HtmlTable -WebRequest $request -TableClassName 'organization-admin__table table'

or use the -TableIndex parameter if you know it is the first (or nth) table in the HTML, since the table apparently has no id or name.
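For example, to grab the second table on a page (the index is zero-based; the value 1 here is just an assumption for illustration):

$table = ConvertFrom_HtmlTable -WebRequest $request -TableIndex 1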

If that succeeds, you can simply write it to CSV:

$table | Export-Csv -Path 'X:\path\to\theTable.csv' -NoTypeInformation

From your comment, it seems that you cannot use Invoke-WebRequest for some reason and have to parse using the IE COM object.

Try this version of the function instead:

function ConvertFrom_HtmlTable {
    [CmdletBinding(DefaultParameterSetName = 'ByIndex')]
    param(
        [Parameter(ValueFromPipeline = $true, Mandatory = $true, Position = 0)]
        [string]$Url,

        [Parameter(Mandatory = $false, Position = 1, ParameterSetName = 'ByIndex')]
        [int]$TableIndex = 0,

        [Parameter(Mandatory = $false, Position = 1, ParameterSetName = 'ById')]
        [string]$TableId,

        [Parameter(Mandatory = $false, Position = 1, ParameterSetName = 'ByName')]
        [string]$TableName,

        [Parameter(Mandatory = $false, Position = 1, ParameterSetName = 'ByClass')]
        [string]$TableClassName,

        [switch]$FirstRowHasHeaders
    )

    $ie = New-Object -ComObject 'InternetExplorer.Application'
    $ie.Visible = $false
    $ie.Silent = $true
    $ie.Navigate($Url)
    # wait for IE to fully load the document
    while ($ie.Busy) { Start-Sleep -Milliseconds 100 }

    $doc = $ie.Document

    switch ($PSCmdlet.ParameterSetName) {
        'ById'    { $table = $doc.IHTMLDocument3_getElementByID($TableId) }
        'ByIndex' { $table = @($doc.IHTMLDocument3_getElementsByTagName('table'))[$TableIndex] }
        'ByName'  { $table = @($doc.IHTMLDocument3_getElementsByName($TableName))[0] }
        'ByClass' { $table = @($doc.IHTMLDocument3_getElementsByClassName($TableClassName))[0] }
    }

    if ($table) {
        # Extract the table rows as a collection.
        $tbody = $table.childNodes | Where-Object { $_.tagName -eq "tbody" }
        if ($tbody) {
            $rows = $tbody.childNodes | Where-Object { $_.tagName -eq "tr" }
        }
        else {
            $rows = $table.childNodes | Where-Object { $_.tagName -eq "tr" }
        }

        # read or create table headers
        # assume the first row has headers either in <th> or <td> tags
        $firstRow = 1
        $headers = @($rows[0].childNodes | Where-Object { $_.tagName -eq "th" } | ForEach-Object { $_.innerHTML })

        if (!($headers)) {
            # there were no <th> tags found, so either use the first row as headers or create from scratch
            $values = @($rows[0].childNodes | Where-Object { $_.tagName -eq "td" } | ForEach-Object { $_.innerHTML })
            if ($FirstRowHasHeaders) {
                # the headers are considered to be the values from the first row
                $headers = $values
            }
            else {
                # the table has no headers, so dynamically create them
                $firstRow = 0
                $headers = for ($i = 1; $i -le $values.Count; $i++) { "Column_$i" }
            }
        }

        # create a List object to store the values found as PSObjects
        $result = [System.Collections.Generic.List[object]]::new()
        for ($i = $firstRow; $i -lt $rows.Count; $i++) {
            $values = @($rows[$i].childNodes | Where-Object { $_.tagName -eq "td" } | ForEach-Object { $_.innerHTML })
            $valuesCount = $values.Count
            while ($headers.Count -lt $valuesCount) {
                $colName = "Column_{0}" -f ($headers.Count + 1)
                $headers += $colName
                # we have just added a new header column. Make sure the first item also has this new column
                if ($result.Count) {
                    $result[0] | Add-Member -MemberType NoteProperty -Name $colName -Value $null
                }
            }
            # create a Hashtable to store the values
            $data = [ordered]@{}
            for ($j = 0; $j -lt $valuesCount; $j++) { $data[$headers[$j]] = $values[$j] }
            # add the hash cast to PSCustomObject to the list
            $result.Add([PSCustomObject]$data)
        }
    }
    else { Write-Warning "Could not find the given table." }

    # quit IE and clean up
    $ie.Quit()
    $null = [System.Runtime.Interopservices.Marshal]::ReleaseComObject($ie)
    [System.GC]::Collect()
    [System.GC]::WaitForPendingFinalizers()

    return $result
}

$table = ConvertFrom_HtmlTable -Url 'your URL here' -TableClassName 'organization-admin__table'
$table | Export-Csv -Path 'X:\path\to\theTable.csv' -NoTypeInformation
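The -FirstRowHasHeaders switch is there for tables that keep their column names in ordinary <td> cells instead of <th> cells; in that case, call it like this (the URL is again a placeholder):

$table = ConvertFrom_HtmlTable -Url 'your URL here' -TableIndex 0 -FirstRowHasHeaders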

The second function, using the InternetExplorer.Application COM object, needs to find the table object through the DOM.
For that, it currently uses the IHTMLDocument3 interface, which works for me on Windows 10 Pro, PowerShell 5.1 and IE version 11.789.19041.0 when I test it on, for instance:

ConvertFrom_HtmlTable -Url 'https://www.w3schools.com/html/html_tables.asp' -TableId 'customers'

According to your comment, you are receiving the error message:

Method invocation failed because [mshtml.HTMLDocumentClass] does not
contain a method named 'IHTMLDocument3_getElementsByClassName'.

This means you have a different (not updated or broken) version on your machine, and you will have to find out for yourself which method works:

  1. First, check what version of IE you have by typing this in a PowerShell console:
(Get-ItemProperty 'HKLM:\SOFTWARE\Microsoft\Internet Explorer').SvcVersion

If that returns blank, try

(Get-ItemProperty 'HKLM:\SOFTWARE\Microsoft\Internet Explorer').Version

  2. Next, inside the switch, change the methods used from
$doc.IHTMLDocument3_getElementsByClassName($TableClassName)

to

$doc.getElementsByClassName($TableClassName)

or

$doc.documentElement.getElementsByClassName($TableClassName)
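Instead of editing the switch by hand for every attempt, you could also probe the candidate methods in order. This is only a rough, untested sketch of the idea (it assumes PowerShell can invoke a COM method whose name is held in a variable, and it only covers the 'ByClass' branch):

$table = $null
foreach ($method in 'IHTMLDocument3_getElementsByClassName', 'getElementsByClassName') {
    try {
        # invoke the method by name; whichever flavour exists on this machine wins
        $candidate = @($doc.$method($TableClassName))[0]
        if ($candidate) { $table = $candidate; break }
    }
    catch {
        # this DOM flavour is not available here, try the next one
    }
}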

If all of that fails, I'm afraid you have a serious problem on your computer (maybe also the reason why Invoke-WebRequest is not working?). Try to fix that with sfc /scannow.

How to convert HTML table to CSV? [Python]

The big thing I see missing in your sample code is that you're not iterating td elements inside every row element; maybe the for sub_element in element line does the cell iteration, but it's not clear. For yourself, and for anyone else who needs to read your code (like us), I recommend being very explicit when finding and iterating elements.

I don't have BeautifulSoup (BS) or Pandas installed, but I'd like to offer the following as a template for explicitly traversing your table's hierarchy. I'm using Python's standard xml and csv modules. (I think the BS API is similar enough to ElementTree to guide you)

Here's a very simple HTML with a table, input.html:

<html>
<body>
<table>
<tr><td>Col1</td><td>Col2</td><td>Col3</td></tr>
<tr><td>Row1Col1</td><td>Row1Col2</td><td>Row1Col3</td></tr>
<tr><td>Row2Col1</td><td>Row2Col2</td><td>Row2Col3</td></tr>
<tr><td>Row3Col1</td><td>Row3Col2</td><td>Row3Col3</td></tr>
<tr><td>Row4Col1</td><td>Row4Col2</td><td>Row4Col3</td></tr>
</table>
</body>
</html>
And the script itself:

import csv
from xml.etree import ElementTree as ET

# Accumulate rows from the table
all_rows = []

root = ET.parse('input.html')

# Get the table once
my_table = root.find('.//table')

# Iterate rows (tr) for that table
for elem_tr in my_table.findall('tr'):
    new_row = []

    # Iterate cells (td) per row
    for elem_td in elem_tr.findall('td'):
        # Build your row cell-by-cell
        new_row.append(elem_td.text)

    # Save finished row
    all_rows.append(new_row)

# Finally write all rows
with open('out.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(all_rows)

When I run that, here's my out.csv:

Col1,Col2,Col3
Row1Col1,Row1Col2,Row1Col3
Row2Col1,Row2Col2,Row2Col3
Row3Col1,Row3Col2,Row3Col3
Row4Col1,Row4Col2,Row4Col3

How do I convert an HTML table to CSV format?

Here is a Perl script. And an online converter (Java needed). Or you can simply copy the HTML table manually from the browser into OpenOffice Calc / MS Excel and save it as CSV - that works too.

And here is a step-by-step solution in PHP.

How to export or convert HTML table to CSV?

tabletoCSV is a minimalist jQuery plugin used to convert / export an HTML table into a CSV (Comma Separated Values) file. Just follow these three simple steps to export the CSV.

1. Include the jQuery library and the jQuery tabletoCSV plugin on the web page.

<script src="http://cdnjs.cloudflare.com/ajax/libs/jquery/2.0.3/jquery.min.js"></script>
<script src="http://www.jqueryscript.net/demo/jQuery-Plugin-To-Convert-HTML-Table-To-CSV-tabletoCSV/jquery.tabletoCSV.js"></script>

2. Add a button for table to CSV export.

<button id="export" data-export="export">Export</button>

3. Enable the Table To CSV tool.

$("#export").click(function(){
$("table").tableToCSV();
});

How to convert specific HTML Table with specific columns/values to Excel using JavaScript and Puppeteer?

This solution has been tested on your sample HTML page. You will have to craft a CSS selector specific enough for the actual page, and/or filter the resulting elementHandles:

const puppeteer = require('puppeteer');

(async () => {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto('https://renatocfrancisco.github.io/getTableWithPuppeteer/');

    // Get array of elementHandles from page matching this CSS selector.
    const elements = await page.$$('table table td');

    const columns = [];
    const values = [];

    // Process the elementHandles.
    let i = 0;
    for (const element of elements) {
        // Extract text from elementHandle.
        const text = await element.evaluate(el => el.textContent);
        if ((i++ % 2) == 0) {
            // Even elements are columns.
            columns.push(text);
        } else {
            // Odd elements are values.
            values.push(text);
        }
    }

    // Construct CSV string.
    console.log(`${columns.join(',')}\n${values.join(',')}`);

    await browser.close();
})();

Output:


$ node ./src/index.js
column1,column2,column3
value1,value2,value3

Update:

This is a variation that doesn't use Puppeteer; only fetch and Cheerio.js. It is much faster and uses fewer resources, so I recommend using this version when possible. (Your sample URL can be processed fine without Puppeteer.)

const cheerio = require('cheerio');

(async () => {
    const fetch = (...args) => import('node-fetch').then(({default: fetch}) => fetch(...args));
    const response = await fetch('https://renatocfrancisco.github.io/getTableWithPuppeteer/');
    const html = await response.text();
    const $ = cheerio.load(html);

    // Get array of elements from page matching this CSS selector.
    const elements = $('table table td');

    const columns = [];
    const values = [];

    // Process the elements.
    let i = 0;
    for (const element of elements) {
        // Extract text from element.
        const text = $(element).text();
        if ((i++ % 2) == 0) {
            // Even elements are columns.
            columns.push(text);
        } else {
            // Odd elements are values.
            values.push(text);
        }
    }

    // Construct CSV string.
    console.log(`${columns.join(',')}\n${values.join(',')}`);
})();

Output:


$ node ./src/index.js
column1,column2,column3
value1,value2,value3

