How to Parse HTML in Google Apps Script

How to parse an HTML string in Google Apps Script without using XmlService?

I have done this in vanilla JS. It is not real HTML parsing; it just extracts some content from the HTML string fetched from a URL:

/**
 * Fetches the Lykke exchange page and returns the reciprocal of the number
 * found in the `ask_BTCLKK` table cell.
 *
 * The page is not parsed as HTML: the function looks for the literal opening
 * tag of the cell and reads the six characters that follow it.
 *
 * @returns {number} 1 divided by the number read from the cell.
 * @throws {Error} If the cell is missing or its content is not a non-zero number.
 */
function getLKKBTC() {
  const url = 'https://www.lykke.com/exchange';
  const html = UrlFetchApp.fetch(url).getContentText();
  const marker = '<td class="ask_BTCLKK">';
  // indexOf searches for the literal text; String#search would compile the
  // marker into a regular expression first.
  const index = html.indexOf(marker);
  if (index >= 0) {
    const pos = index + marker.length;
    const ask = parseFloat(html.substring(pos, pos + 6));
    // Reject NaN and zero so the caller never receives NaN or Infinity.
    if (Number.isFinite(ask) && ask !== 0) {
      return 1 / ask;
    }
  }
  throw new Error(`Failed to fetch/parse data from ${url}`);
}

Parse HTML to retrieve specific tags value with Google Apps Script

How about using XmlService for your situation as a workaround? At XmlService, even if there are several line breaks in the tags, the value can be retrieved. I think that there are several workarounds for your situation. So please think of this as one of them.

The flow of sample script is as follows.

Flow :

  1. Add the header of xml and a root element tag to the html.
  2. Parse the created XML value using XmlService.
  3. Retrieve the first value of tags using XmlService.

Sample script :

// Sample value: two <b> tags whose attributes and text contain line breaks.
var html = '<b class="\n"\n>\nfoo\n</b><b class="\n"\n>\nvar\n</b>\n';

// Wrap the fragment in an XML declaration and a single root element so that
// XmlService accepts it as a document.
var xml = ['<?xml version="1.0"?><sampleContents>', html, '</sampleContents>'].join('');

// Text of the first child of the root element, with surrounding whitespace removed.
var res = XmlService.parse(xml)
  .getRootElement()
  .getChildren()[0]
  .getText()
  .trim();
Logger.log(res); // foo

Note :

  • In this sample script, your sample html was used. So if you use more complicated one, can you provide it? I would like to modify the script.

Reference :

  • XML Service

If this was not what you want, please tell me. I would like to modify it.

Edit 1 :

Unfortunately, for the value retrieved from the URL, above script cannot be used. So I used "Parser" which is a GAS library for your situation. The sample script is as follows.

Sample script :

// Search-results page to download.
var url = "https://www.booking.com/searchresults.ja.html?ss=kyoto&checkin_year=2018&checkin_month=10&checkin_monthday=1&checkout_year=2018&checkout_month=10&checkout_monthday=2&no_rooms=1&group_adults=1&group_children=0";
var html = UrlFetchApp.fetch(url).getContentText();

// Take the text between the first matching opening <b ...> tag and the next </b>.
var res = Parser.data(html)
  .from("<b class=\"\n\"\n>")
  .to("</b>")
  .build()
  .trim();
Logger.log(res); // US$11

Note :

  • Before you run this script, please install the "Parser" library. For how to install a library, see "Managing libraries" in the references below.

    • The project key of the library is M1lugvAXKKtUxn_vdAG9JZleS6DrsjUUV

References :

  • Parser
  • Managing libraries
  • google app script Exceeded memory limit
  • google script scrape parser with 2 classes with the same name

Edit 2 :

The 2nd URL in your comment is different from your 1st one, and the page at the new URL has no <b class=\"\n\"\n> tag, so the value you want cannot be retrieved with the script above. From the 1st URL in your comment, I guessed which value you want. Could you please confirm the following script?

// Search-results page to download.
var url = "https://www.booking.com/searchresults.ja.html?ss=kyotogranvia&checkin_year=2018&checkin_month=10&checkin_monthday=1&checkout_year=2018&checkout_month=10&checkout_monthday=2&no_rooms=1&group_adults=1&group_children=0";
var html = UrlFetchApp.fetch(url).getContentText();

// Take the text between the opening price <span> tag and the next </span>.
var res = Parser.data(html)
  .from("<span class=\"lp-postcard-avg-price-value\">")
  .to("</span>")
  .build()
  .trim();
Logger.log(res); // US$289

Parse Html using Google App Script

I found that the best way to parse html in google apps is to avoid using XmlService.parse or Xml.parse. XmlService.parse doesn't work well with bad html code from certain websites.

Here is a basic example of how you can parse any website easily without using XmlService.parse or Xml.parse. In this example, I am retrieving a list of presidents from "wikipedia.org/wiki/President_of_the_United_States"
with a regular JavaScript document.getElementsByTagName() call, and pasting the values into my Google spreadsheet.

1- Create a new Google Sheet;

2- Click the menu Tools > Script editor... to open a new tab with the code editor window and copy the following code into your Code.gs:

/**
 * Adds a "Parse Menu" menu to the spreadsheet UI with a single "Parse" item
 * that runs the function named parserMenuItem.
 */
function onOpen() {
  const parseMenu = SpreadsheetApp.getUi().createMenu("Parse Menu");
  const menuWithItem = parseMenu.addItem("Parse", "parserMenuItem");
  menuWithItem.addToUi();
}


/**
 * Menu handler: builds the sidebar from the project's "test" HTML file and
 * shows it in the spreadsheet UI.
 */
function parserMenuItem() {
  const sidebarHtml = HtmlService.createHtmlOutputFromFile("test");
  const ui = SpreadsheetApp.getUi();
  ui.showSidebar(sidebarHtml);
}


/**
 * Downloads a URL and returns the response body as text.
 *
 * @param {string} url Address to fetch.
 * @returns {string} The content text of the response.
 */
function getUrlData(url) {
  return UrlFetchApp.fetch(url).getContentText();
}

/**
 * Writes a list of values down column A of the first sheet of the active
 * spreadsheet, starting at row 1.
 *
 * @param {Array} data Values to write, one per row. A missing or empty list
 *     writes nothing.
 */
function writeToSpreadSheet(data) {
  if (data == null || data.length === 0) {
    return;
  }
  const sheet = SpreadsheetApp.getActiveSpreadsheet().getSheets()[0];
  // setValues expects a two-dimensional array: one inner array per row.
  const rows = Array.from(data, (value) => [value]);
  // A single batched write replaces one setValue service call per cell.
  sheet.getRange(1, 1, rows.length, 1).setValues(rows);
}

3- Add an HTML file to your Apps Script project. Open the Script Editor, choose File > New > Html File, and name it 'test'. Then copy the following code into your test.html:

<!DOCTYPE html>
<html>
<head>
</head>
<body>
<input id= "mButon" type="button" value="Click here to get list"
onclick="parse()">
<div hidden id="mOutput"></div>
</body>
<script>

window.onload = onOpen;

/**
 * Asks the server-side getUrlData function for the Wikipedia page and hands
 * the result to writeHtmlOutput, then makes the "mButon" button visible.
 */
function onOpen() {
  const pageUrl = "https://en.wikipedia.org/wiki/President_of_the_United_States";
  const runner = google.script.run.withSuccessHandler(writeHtmlOutput);
  runner.getUrlData(pageUrl);

  const button = document.getElementById("mButon");
  button.style.visibility = "visible";
}

/**
 * Places the received markup inside the "mOutput" element so that its tags
 * become part of this document and can be queried with DOM methods.
 *
 * @param {string} x Markup to insert.
 */
function writeHtmlOutput(x) {
  const container = document.getElementById('mOutput');
  container.innerHTML = x;
}

/**
 * Collects the "title" attribute of every <area> element in the document and
 * sends the resulting list to the server-side writeToSpreadSheet function.
 */
function parse() {
  const areas = document.getElementsByTagName("area");
  const titles = Array.from(areas, (area) => area.getAttribute("title"));
  google.script.run.writeToSpreadSheet(titles);
}
</script>
</html>

4- Save your gs and html files and Go back to your spreadsheet. Reload your Spreadsheet. Click on "Parse Menu" - "Parse". Then click on "Click here to get list" in the sidebar.

Import and parse a HTML file from Drive to Sheets

  • You want to put the values of the table from HTML data using Google Apps Script and/or the built-in functions of Spreadsheet.
  • The HTML files are put in your Google Drive.

If my understanding is correct, how about this answer? Please think of this as just one of several possible answers.

Pattern 1:

In this pattern, IMPORTXML is used for the tables deployed with Web Apps.

Usage:

1. copy and paste the following script to the script editor.

/**
 * Web app entry point: returns every <table> element of an HTML file stored
 * on Drive, concatenated inside a single <sample> root element and served
 * with the XML MIME type.
 *
 * @param {Object} e Request event; `e.parameter.id` is the Drive file ID of
 *     the HTML file.
 * @returns {Object} The ContentService text output holding the XML.
 */
function doGet(e) {
  const fileId = e.parameter.id;
  const source = DriveApp.getFileById(fileId).getBlob().getDataAsString();
  // match() returns null when the file contains no table; fall back to an
  // empty list so the response is an empty <sample> element, not a TypeError.
  const tables = source.match(/<table[\w\s\S]+?<\/table>/gi) ?? [];
  const xml = "<sample>" + tables.join("") + "</sample>";
  return ContentService.createTextOutput(xml).setMimeType(ContentService.MimeType.XML);
}

2. Deploy Web Apps.

  1. On the script editor, Open a dialog box by "Publish" -> "Deploy as web app".
  2. Select "Me" for "Execute the app as:".
  3. Select "Anyone, even anonymous" for "Who has access to the app:".
  4. Click "Deploy" button as new "Project version".
  5. Automatically open a dialog box of "Authorization required".

    1. Click "Review Permissions".
    2. Select own account.
    3. Click "Advanced" at "This app isn't verified".
    4. Click "Go to ### project name ###(unsafe)"
    5. Click "Allow" button.
  6. Click "OK".
  7. Copy the URL of Web Apps. It's like https://script.google.com/macros/s/###/exec.

    • Whenever you modify the Google Apps Script, please redeploy the Web App as a new version; otherwise the modified script is not reflected in the Web App. Please be careful about this.

3. Put the formula.

Please put the following formula to a cell.

=IMPORTXML("https://script.google.com/macros/s/###/exec?id=###fileId###","//tr")
  • ###fileId### is the file ID of HTML file on Google Drive.

Pattern 2:

In this pattern, the HTML tables are retrieved from the HTML data, and the tables are put to the Spreadsheet using Sheets API.

Usage:

1. copy and paste the following script to the script editor.

Please set the variables of fileId, spreadsheetId and sheetName.

/**
 * Reads an HTML file from Drive, extracts every <table> element, and pastes
 * the tables one below the other into a sheet through the Sheets advanced
 * service.
 *
 * Does nothing when the file contains no table.
 *
 * @throws {Error} If the spreadsheet has no sheet with the configured name.
 */
function myFunction() {
  const fileId = "###"; // Please set the file ID of HTML file.
  const spreadsheetId = "###"; // Please set the Spreadsheet ID for putting the values.
  const sheetName = "Sheet1"; // Please set the sheet name for putting the values.

  // Retrieve tables from HTML data.
  const html = DriveApp.getFileById(fileId).getBlob().getDataAsString();
  // match() returns null when nothing matches; treat that as "no tables".
  const tables = html.match(/<table[\w\s\S]+?<\/table>/gi) ?? [];
  if (tables.length === 0) {
    return;
  }

  // Put the HTML tables to the Spreadsheet.
  const sheet = SpreadsheetApp.openById(spreadsheetId).getSheetByName(sheetName);
  if (!sheet) {
    throw new Error(`Sheet "${sheetName}" was not found in spreadsheet ${spreadsheetId}`);
  }
  const sheetId = sheet.getSheetId();
  let rowIndex = 0;
  tables.forEach((table) => {
    const resource = {requests: [{pasteData: {html: true, data: table, coordinate: {sheetId: sheetId, rowIndex: rowIndex}}}]};
    Sheets.Spreadsheets.batchUpdate(resource, spreadsheetId);
    // The next table starts right after the last row that now holds content.
    rowIndex = sheet.getLastRow();
  });
}

2. Enable Sheets API.

Please enable Sheets API at Advanced Google services.

3. Run the script.

When you run the function myFunction, the values are retrieved from HTML data and they are put to the Spreadsheet.

Note:

  • These are the simple sample scripts. So please modify them for your actual situation.

References:

  • Web Apps
  • Taking advantage of Web Apps with Google Apps Script
  • Advanced Google services
  • spreadsheets.batchUpdate

Unfortunately, from your question, I cannot understand about your actual HTML data. So if an error occurs and this was not the direction you want, I apologize.

Parse HTML table from (Gmail) email body and paste to Google Sheets

I believe your current situation and your goal as follows.

  • The value of body of var body = message.getBody() is the value shown at and here's the console.log(body) section.
    • The HTML table of var body = message.getBody() is surely the value from <!-- sales_list --> to </table>.
  • You want to put the HTML table to the Spreadsheet.

In this case, I would like to propose to use Sheets API. When Sheets API is used, the HTML table can be automatically parsed and put it to Google Spreadsheet. When this is reflected to your script, it becomes as follows.

Modified script:

Before you use this script, please enable Sheets API at Advanced Google services.

/**
 * Pastes a hard-coded sample HTML table into a sheet.
 *
 * Opens the spreadsheet by ID, looks up the target sheet by name, logs the
 * sample HTML, and sends it to the Sheets advanced service as a pasteData
 * request with `html: true`, anchored at row index 0 / column index 0.
 */
function myFunction() {
var spreadsheetId = "###"; // Please set the Spreadsheet ID.
var sheetName = "###"; // Please set the sheet name.

var ss = SpreadsheetApp.openById(spreadsheetId);
var updateSheet = ss.getSheetByName(sheetName);
// Sample e-mail body: one header row followed by five data rows.
var body = `<!-- sales_list -->
<table style="border:0 none;border-spacing:0;border-collapse: collapse;word-break:normal;">
<tr style="background-color: #d1d1d1">
<th style="padding:8px">ID</th>
<th style="padding:8px">Commission</th>
<th style="padding:8px">Total Cost</th>
<th style="padding:8px">Order ID</th>
<th style="padding:8px">Product ID</th>
<th style="padding:8px">Created</th>
<th style="padding:8px">Campaign name</th>
<th style="padding:8px">Type</th>
<th style="padding:8px">Status</th>
<th style="padding:8px">Paid</th>
<th style="padding:8px">Affiliate</th>
<th style="padding:8px">Channel</th>
</tr>
<tr>
<td style="padding:8px">ny9kq352</td>
<td style="padding:8px">$ ‎30.00</td>
<td style="padding:8px">$ ‎500.00</td>
<td style="padding:8px">554683</td>
<td style="padding:8px">Thursday, April 08, 2021 :: Half Day Trip (PM) @Size Matters Charters</td>
<td style="padding:8px">04/07/2021</td>
<td style="padding:8px">Direct Links for Approved Affiliates</td>
<td style="padding:8px">Sales</td>
<td style="padding:8px">declined</td>
<td style="padding:8px">Unpaid</td>
<td style="padding:8px">Cathy Sheehan</td>
<td style="padding:8px"></td>
</tr>
<tr>
<td style="padding:8px">h4tdux7d</td>
<td style="padding:8px">$ ‎24.00</td>
<td style="padding:8px">$ ‎400.00</td>
<td style="padding:8px">553921</td>
<td style="padding:8px">Friday, April 09, 2021 :: 4 Hour Trip (AM) @R&R Fishing Charters</td>
<td style="padding:8px">04/07/2021</td>
<td style="padding:8px">Direct Links for Approved Affiliates</td>
<td style="padding:8px">Sales</td>
<td style="padding:8px">declined</td>
<td style="padding:8px">Unpaid</td>
<td style="padding:8px">Joanne Bergstrom</td>
<td style="padding:8px"></td>
</tr>
<tr>
<td style="padding:8px">qj9cfp0g</td>
<td style="padding:8px">$ ‎24.00</td>
<td style="padding:8px">$ ‎400.00</td>
<td style="padding:8px">553921</td>
<td style="padding:8px">Friday, April 09, 2021 :: 4 Hour Trip (AM) @R&R Fishing Charters</td>
<td style="padding:8px">04/07/2021</td>
<td style="padding:8px">Direct Links for Approved Affiliates</td>
<td style="padding:8px">Sales</td>
<td style="padding:8px">Pending</td>
<td style="padding:8px">Unpaid</td>
<td style="padding:8px">Joanne Bergstrom</td>
<td style="padding:8px"></td>
</tr>
<tr>
<td style="padding:8px">wujm6buw</td>
<td style="padding:8px">$ ‎39.00</td>
<td style="padding:8px">$ ‎650.00</td>
<td style="padding:8px">554032</td>
<td style="padding:8px">Tuesday, July 27, 2021 :: Half Day Trip (PM) @All Hanns On Deck</td>
<td style="padding:8px">04/06/2021</td>
<td style="padding:8px">Direct Links for Approved Affiliates</td>
<td style="padding:8px">Sales</td>
<td style="padding:8px">Pending</td>
<td style="padding:8px">Unpaid</td>
<td style="padding:8px">eric matechak</td>
<td style="padding:8px"></td>
</tr>
<tr>
<td style="padding:8px">ixonoun4</td>
<td style="padding:8px">$ ‎28.50</td>
<td style="padding:8px">$ ‎475.00</td>
<td style="padding:8px">554003</td>
<td style="padding:8px">Saturday, May 29, 2021 :: 4 Hour Trip (AM) @Fins Up Adventure Charters</td>
<td style="padding:8px">04/06/2021</td>
<td style="padding:8px">Direct Links for Approved Affiliates</td>
<td style="padding:8px">Sales</td>
<td style="padding:8px">Pending</td>
<td style="padding:8px">Unpaid</td>
<td style="padding:8px">eric matechak</td>
<td style="padding:8px"></td>
</tr>
</table>`;
console.log(body);
// html: true makes the request treat `data` as HTML; the coordinate is the
// top-left cell of the paste (first row, first column of the target sheet).
Sheets.Spreadsheets.batchUpdate({ requests: { pasteData: { html: true, data: body, coordinate: { sheetId: updateSheet.getSheetId(), rowIndex: 0, columnIndex: 0 } } } }, spreadsheetId);
}

Note:

  • At above modified script, the value of body shown in your question is directly used. When you want to use var body = message.getBody(), please use the following script. If the following script doesn't work, your sample value might be different from the actual body in the following script. At that time, can you provide the sample value of body for replicating the issue? By this, I would like to confirm it.

      /**
       * Pastes the HTML body of the first message of the first Gmail thread
       * matching "Daily report (affiliate bookings)" into a sheet, using a
       * pasteData request of the Sheets advanced service anchored at row
       * index 0 / column index 0.
       *
       * @throws {Error} If the sheet does not exist or no thread matches the search.
       */
      function myFunction() {
        const spreadsheetId = "###"; // Please set the Spreadsheet ID.
        const sheetName = "###"; // Please set the sheet name.
        const query = "Daily report (affiliate bookings)";

        const ss = SpreadsheetApp.openById(spreadsheetId);
        const updateSheet = ss.getSheetByName(sheetName);
        if (!updateSheet) {
          throw new Error(`Sheet "${sheetName}" was not found in spreadsheet ${spreadsheetId}`);
        }

        const threads = GmailApp.search(query);
        // Without this guard an empty result fails later with an opaque
        // TypeError on threads[0].
        if (threads.length === 0) {
          throw new Error(`No Gmail thread matches the search "${query}"`);
        }
        const message = threads[0].getMessages()[0];
        const body = message.getBody();
        console.log(body);
        Sheets.Spreadsheets.batchUpdate({ requests: { pasteData: { html: true, data: body, coordinate: { sheetId: updateSheet.getSheetId(), rowIndex: 0, columnIndex: 0 } } } }, spreadsheetId);
      }

References:

  • Method: spreadsheets.batchUpdate
  • PasteDataRequest

Grab data from website HTML table and transfer to Google Sheets using App-Script

Solution by formula

Try

=importhtml(url,"table",1)

Sample Image

Other solution by script

/**
 * Downloads the game-log page and returns its table content as rows of cell
 * text.
 *
 * The page is flattened to a single line, everything from the first "<table"
 * to the last "</table" is kept, and each <tr> is split into its <td>/<th>
 * cells with regular expressions. Tags inside a cell are removed by stripTags.
 *
 * @returns {string[][]} One array per table row holding the text of each
 *     cell; an empty array when the page contains no table.
 */
function importTableHTML() {
  const url = 'https://www.sports-reference.com/cbb/schools/indiana/2022-gamelogs.html';
  const page = UrlFetchApp.fetch(url, {muteHttpExceptions: true}).getContentText();
  // Remove line breaks, tabs and runs of spaces so `.` can span the whole
  // table. Single spaces are kept, otherwise words in cell text get merged.
  const flat = page.replace(/\r\n|\n|\r|\t| {2,}/g, "");
  const inner = flat.match(/(?<=<table).*(?=<\/table)/);
  if (inner === null) {
    return [];
  }
  const html = '<table' + inner[0] + '</table>';

  const data = [];
  for (const tr of html.matchAll(/<tr[\s\S\w]+?<\/tr>/g)) {
    const row = [];
    for (const cell of tr[0].matchAll(/<(td|th)[\s\S\w]+?<\/(td|th)>/g)) {
      // Everything between the end of the opening tag and the last closing tag.
      const content = cell[0].match(/(?<=>).*(?=<\/)/);
      row.push(stripTags(content === null ? "" : content[0]));
    }
    data.push(row);
  }
  return data;
}
function stripTags(body) {
var regex = /(<([^>]+)>)/ig;
return body.replace(regex,"");
}

Sample Image



Related Topics



Leave a reply



Submit