R: Web Scraping Yahoo.Finance After 2019 Change

As mentioned in the comment above, here is an alternative that tries to deal with the differing table sizes Yahoo publishes. I worked on this with help from a friend.

library(rvest)
library(tidyverse)

url <- "https://finance.yahoo.com/quote/AAPL/financials?p=AAPL"

# Download the data
raw_table <- read_html(url) %>% html_nodes("div.D\\(tbr\\)")

number_of_columns <- raw_table[1] %>% html_nodes("span") %>% length()

if(number_of_columns > 1){
  # Create empty data frame with the required dimensions
  df <- data.frame(matrix(ncol = number_of_columns, nrow = length(raw_table)),
                   stringsAsFactors = F)

  # Fill the table looping through rows
  for (i in 1:length(raw_table)) {
    # Find the row name and set it.
    df[i, 1] <- raw_table[i] %>% html_nodes("div.Ta\\(start\\)") %>% html_text()
    # Now grab the values
    row_values <- raw_table[i] %>% html_nodes("div.Ta\\(end\\)")
    for (j in 1:(number_of_columns - 1)) {
      df[i, j+1] <- row_values[j] %>% html_text()
    }
  }
  view(df)
}
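
The values scraped above come back as formatted character strings (for example "55,256,000"). As a small sketch, assuming the df built above with the row labels in column 1, the remaining columns can be converted to numeric with readr (loaded as part of the tidyverse):

# Sketch: convert the value columns from formatted strings to numbers.
# readr::parse_number() strips the thousands separators; any "-" entries
# become NA with a parsing warning.
df[ , -1] <- lapply(df[ , -1], readr::parse_number)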

How can I extract this specific table from this web page using R?

QHarr has answered the same question in this post; I've copied the relevant code from their answer.

library(rvest)
library(stringr)
library(magrittr)

page <- read_html("https://finance.yahoo.com/quote/SKLZ/cash-flow?p=SKLZ")
nodes <- page %>% html_nodes(".fi-row")
df = NULL

for(i in nodes){
  r <- list(i %>% html_nodes("[title],[data-test='fin-col']") %>% html_text())
  df <- rbind(df, as.data.frame(matrix(r[[1]], ncol = length(r[[1]]), byrow = TRUE), stringsAsFactors = FALSE))
}

df

Adjusting Yahoo Stock Data Web Scraping to Loop over Dates

Consider mapply or its non-simplified wrapper, Map, to iterate elementwise through the pairings of start and end dates and corresponding symbols. Also, avoid the use of assign and get and instead build a list of data frames for a final rbind at the end:

library(XML)
...
dateGroup <- data.frame(
  start = c(1509519600, 1518159600, 1526799600, 1535439600, 1544079600),
  end = c(1518073200, 1526713200, 1535353200, 1543993200, 1550732400)
)

# CROSS JOIN ALL SYMBOLS WITH EACH DATE PAIRING
dt_grp_sym <- merge(dateGroup, data.frame(symbols))

# DEFINED METHOD FOR HTML PROCESSING
proc_html <- function(sym, sd, ed) {
  url <- paste0('https://finance.yahoo.com/quote/', sym, '/history?period1=',
                sd, '&period2=', ed, '&interval=1d&filter=history&frequency=1d')
  print(url)

  webpage <- readLines(url, warn=FALSE)
  html <- htmlTreeParse(webpage, useInternalNodes = TRUE, asText = TRUE)
  tableNodes <- getNodeSet(html, "//table")

  html_df <- transform(readHTMLTable(tableNodes[[1]],
                                     header=c("Date", "Open", "High", "Low",
                                              "Close", "Adj. Close", "Volume")),
                       symbol = sym)
  return(html_df)
}

# ITERATE ELEMENTWISE THROUGH EVERY ROW of dt_grp_sym
df_list <- Map(proc_html, dt_grp_sym$symbols, dt_grp_sym$start, dt_grp_sym$end)

final_df <- do.call(rbind, df_list)

To demonstrate using the Class 1 U.S. railroads:

symbols <- c("UNP", "CSX", "NSC", "CNI", "KSU")

dateGroup <- data.frame(
  start = c(1509519600, 1518159600, 1526799600, 1535439600, 1544079600),
  end = c(1518073200, 1526713200, 1535353200, 1543993200, 1550732400)
)
dt_grp_sym <- merge(dateGroup, data.frame(symbols))

# CALLING SAME ABOVE FUNCTION
df_list <- with(dt_grp_sym, Map(proc_html, symbols, start, end))
final_df <- do.call(rbind, df_list)

Output

by(final_df, final_df$symbol, head)

# final_df$symbol: CNI
# Date Open High Low Close Adj..Close Volume symbol
# 998 Feb 08, 2018 76.08 76.16 74.11 74.45 72.79 1,508,100 CNI
# 999 Feb 07, 2018 76.86 77.23 76.01 76.17 74.48 1,645,400 CNI
# 1000 Feb 06, 2018 76.21 77.42 74.81 77.14 75.42 2,293,300 CNI
# 1001 Feb 05, 2018 78.00 78.70 77.12 77.17 75.45 1,711,000 CNI
# 1002 Feb 02, 2018 79.17 79.24 78.17 78.46 76.71 1,331,400 CNI
# 1003 Feb 01, 2018 79.91 80.54 79.24 79.82 78.04 1,231,500 CNI
# ------------------------------------------------------------------------------
# final_df$symbol: CSX
# Date Open High Low Close Adj..Close Volume symbol
# 333 Feb 08, 2018 52.91 53.16 50.46 50.47 49.80 7,798,100 CSX
# 334 Feb 07, 2018 53.38 54.36 52.94 52.97 52.26 6,496,200 CSX
# 335 Feb 06, 2018 51.27 54.00 50.12 53.82 53.10 10,563,700 CSX
# 336 Feb 05, 2018 54.89 55.04 51.96 51.99 51.30 9,070,200 CSX
# 337 Feb 02, 2018 56.19 56.35 55.20 55.25 54.51 9,275,800 CSX
# 338 Feb 01, 2018 56.10 57.10 56.04 56.58 55.83 4,079,100 CSX
# ------------------------------------------------------------------------------
# final_df$symbol: KSU
# Date Open High Low Close Adj..Close Volume symbol
# 1330 Feb 08, 2018 107.17 107.64 103.50 103.53 102.15 1,434,600 KSU
# 1331 Feb 07, 2018 106.59 108.27 106.59 107.10 105.67 1,326,800 KSU
# 1332 Feb 06, 2018 103.11 108.02 102.07 107.32 105.89 1,459,400 KSU
# 1333 Feb 05, 2018 109.73 110.44 105.12 105.18 103.77 1,272,100 KSU
# 1334 Feb 02, 2018 112.06 112.85 110.03 110.15 108.68 1,051,900 KSU
# 1335 Feb 01, 2018 112.80 114.00 112.17 112.87 111.36 1,011,200 KSU
# ------------------------------------------------------------------------------
# final_df$symbol: NSC
# Date Open High Low Close Adj..Close Volume symbol
# 665 Feb 08, 2018 142.62 143.27 136.87 136.89 134.22 2,657,200 NSC
# 666 Feb 07, 2018 142.09 144.45 141.37 142.68 139.89 1,464,500 NSC
# 667 Feb 06, 2018 136.99 143.45 134.55 143.05 140.26 2,455,000 NSC
# 668 Feb 05, 2018 144.74 146.73 138.18 138.61 135.90 2,508,900 NSC
# 669 Feb 02, 2018 147.15 147.85 144.61 145.03 142.20 1,774,600 NSC
# 670 Feb 01, 2018 149.28 150.35 147.90 148.47 145.57 1,427,000 NSC
# ------------------------------------------------------------------------------
# final_df$symbol: UNP
# Date Open High Low Close Adj..Close Volume symbol
# 1 Feb 08, 2018 128.70 128.70 124.81 124.86 122.27 6,325,100 UNP
# 2 Feb 07, 2018 130.34 131.82 128.94 128.96 126.29 5,053,000 UNP
# 3 Feb 06, 2018 122.28 131.50 121.50 131.15 128.43 15,734,300 UNP
# 4 Feb 05, 2018 128.59 131.78 124.13 124.14 121.57 6,744,400 UNP
# 5 Feb 02, 2018 131.66 131.73 127.22 129.36 126.68 8,181,200 UNP
# 6 Feb 01, 2018 132.51 133.74 131.86 132.38 129.64 5,597,600 UNP
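
The period1 and period2 values hard-coded in dateGroup are Unix epoch timestamps (seconds since 1970-01-01). As a small sketch, they can be generated from calendar dates instead; the dates below are illustrative and do not exactly reproduce the original values, whose exact second depends on the time zone used:

# Sketch: build period1/period2 Unix timestamps from calendar dates.
# Drop tz = "UTC" to use local midnight boundaries instead.
starts <- as.numeric(as.POSIXct(c("2017-11-01", "2018-02-09"), tz = "UTC"))
ends   <- as.numeric(as.POSIXct(c("2018-02-08", "2018-05-19"), tz = "UTC"))
dateGroup <- data.frame(start = starts, end = ends)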

How to pull financial statements from Yahoo Finance (2019 URL) in VBA

The code works just fine for MSFT, insofar as it works the same way for that ticker as it does for SBUX. The code you linked to retrieves balance-sheet info for a given ticker:

https://finance.yahoo.com/quote/SBUX/balance-sheet?p=SBUX

or

https://finance.yahoo.com/quote/MSFT/balance-sheet?p=MSFT

This does not guarantee you can 'lift and shift' this code for use with any of the other tabs, e.g. the income statement, which has the following URL construction:

https://finance.yahoo.com/quote/MSFT/financials?p=MSFT

You will need to inspect the HTML of these tabs and see how it differs. There are already existing answers on Stack Overflow covering how to obtain the data shown in the other tabs (and for the different time periods, e.g. quarterly).


Below is a VBA translation of the existing answer; it would benefit from refactoring:

Option Explicit

Public Sub WriteOutFinancialInfo()
    Dim http As Object, s As String

    Set http = CreateObject("MSXML2.XMLHTTP")

    With http
        .Open "GET", "https://finance.yahoo.com/quote/MSFT/financials?p=MSFT", False
        .setRequestHeader "Content-Type", "application/x-www-form-urlencoded"
        .send
        s = .responseText
    End With

    Dim html As MSHTML.HTMLDocument, html2 As MSHTML.HTMLDocument, re As Object, matches As Object

    Set html = New MSHTML.HTMLDocument: Set html2 = New MSHTML.HTMLDocument
    Set re = CreateObject("VBScript.RegExp")

    html.body.innerHTML = s

    Dim headers(), rows As Object

    headers = Array("Breakdown", "TTM")
    Set rows = html.querySelectorAll(".fi-row")

    With re
        .Global = True
        .MultiLine = True
        .Pattern = "\d{1,2}/\d{1,2}/\d{4}"
        Set matches = .Execute(s)
    End With

    Dim results(), match As Object, r As Long, c As Long, startHeaderCount As Long
    startHeaderCount = UBound(headers)
    ReDim Preserve headers(0 To matches.Count + startHeaderCount)

    c = 1
    For Each match In matches
        headers(startHeaderCount + c) = match
        c = c + 1
    Next

    Dim row As Object
    ReDim results(1 To rows.Length, 1 To UBound(headers) + 1)

    For r = 0 To rows.Length - 1
        html2.body.innerHTML = rows.Item(r).outerHTML
        Set row = html2.querySelectorAll("[title],[data-test=fin-col]")

        For c = 0 To row.Length - 1
            results(r + 1, c + 1) = row.Item(c).innerText
        Next c
    Next

    Dim ws As Worksheet

    Set ws = ThisWorkbook.Worksheets("Sheet1")

    With ws
        .Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
        .Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
    End With
End Sub

Project references:

VBE > Tools > References > Add reference to Microsoft HTML Object Library

Web Scraping information from Yahoo Finance

Copy the JSON below; instead of scraping, just use the JSON array to access the data in your code. You don't need to scrape continuously, as this is old data that is not going to change; therefore the best option is to store it and access it anytime.

Sample JSON object:

{
  "Date": "12/28/2017",
  "Open": 171,
  "High": 171.850006,
  "Low": 170.479996,
  "Close": 171.080002,
  "Adj Close": 168.549545,
  "Volume": 16480200
}

I put the entire JSON up for you here:
http://ilanpatao.com/misc/aapl.json
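
A minimal sketch of reading that file into R with jsonlite, assuming it has been downloaded locally and saved as aapl.json:

library(jsonlite)

# Sketch: read the JSON array of daily records into a data frame.
# Assumes the file was saved locally as "aapl.json".
aapl <- fromJSON("aapl.json")
head(aapl)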

Web scraping from Yahoo Finance: cannot access individual tabs within a web page

That data is already present in the page. It is simply loaded from a script tag when you click. You can regex out the appropriate string and parse it with the json library. You will need to study the JSON to determine which access paths to use. The data is within

data['context']['dispatcher']['stores']['QuoteSummaryStore']

Examples below:

import requests, re, json

p = re.compile(r'root\.App\.main = (.*);')
r = requests.get('https://finance.yahoo.com/quote/GOOG/financials?p=GOOG&.tsrc=fin-srch&guccounter=1')
data = json.loads(p.findall(r.text)[0])
quote_store = data['context']['dispatcher']['stores']['QuoteSummaryStore']
print(quote_store['earnings']['financialsChart']['quarterly'])
print(quote_store['incomeStatementHistoryQuarterly']['incomeStatementHistory'])

You can quickly view the available sections by listing the keys of quote_store.

How to scraping table using html_table in R if there is no table tag?

The discussion at https://stackoverflow.com/questions/58315274/r-web-scraping-yahoo-finance-after-2019-change addresses your issue. Based on the discussion in the link, you can obtain the information as follows for "AAPL":

library(rvest)
library(tidyverse)

tic <- "AAPL"
link <- "https://finance.yahoo.com/quote/"
link <- paste0(link, tic, "/financials?p=", tic)
wahis.session <- html_session(link)
p <- wahis.session
nodes <- p %>% html_nodes(".fi-row")

df = NULL

for(i in nodes){
  r <- list(i %>% html_nodes("[title],[data-test='fin-col']") %>% html_text())
  df <- rbind(df, as.data.frame(matrix(r[[1]], ncol = length(r[[1]]), byrow = TRUE), stringsAsFactors = FALSE))
}

matches <- str_match_all(p %>% html_node('#Col1-1-Financials-Proxy') %>% html_text(), '\\d{1,2}/\\d{1,2}/\\d{4}')
headers <- c('Breakdown', 'TTM', matches[[1]][,1])
names(df) <- headers
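
As a sketch, the steps above can be wrapped into a small helper (the name get_financials is hypothetical) so the same scrape works for other tickers:

# Sketch: hypothetical helper wrapping the scrape above for an arbitrary ticker.
get_financials <- function(tic) {
  link <- paste0("https://finance.yahoo.com/quote/", tic, "/financials?p=", tic)
  p <- html_session(link)
  nodes <- p %>% html_nodes(".fi-row")

  df <- NULL
  for (i in nodes) {
    r <- list(i %>% html_nodes("[title],[data-test='fin-col']") %>% html_text())
    df <- rbind(df, as.data.frame(matrix(r[[1]], ncol = length(r[[1]]), byrow = TRUE),
                                  stringsAsFactors = FALSE))
  }

  # Recover the report dates for the column headers, as above.
  matches <- str_match_all(p %>% html_node('#Col1-1-Financials-Proxy') %>% html_text(),
                           '\\d{1,2}/\\d{1,2}/\\d{4}')
  names(df) <- c('Breakdown', 'TTM', matches[[1]][,1])
  df
}

msft <- get_financials("MSFT")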

Scraping historical data from Yahoo Finance with Python

I wrote this to get historical data from YF directly from the download CSV link. It needs to make two requests: one to get the cookie and the crumb, and another one to get the data. It returns a pandas DataFrame.

import re
from io import StringIO
from datetime import datetime, timedelta

import requests
import pandas as pd

class YahooFinanceHistory:
    timeout = 2
    crumb_link = 'https://finance.yahoo.com/quote/{0}/history?p={0}'
    crumble_regex = r'CrumbStore":{"crumb":"(.*?)"}'
    quote_link = 'https://query1.finance.yahoo.com/v7/finance/download/{quote}?period1={dfrom}&period2={dto}&interval=1d&events=history&crumb={crumb}'

    def __init__(self, symbol, days_back=7):
        self.symbol = symbol
        self.session = requests.Session()
        self.dt = timedelta(days=days_back)

    def get_crumb(self):
        response = self.session.get(self.crumb_link.format(self.symbol), timeout=self.timeout)
        response.raise_for_status()
        match = re.search(self.crumble_regex, response.text)
        if not match:
            raise ValueError('Could not get crumb from Yahoo Finance')
        else:
            self.crumb = match.group(1)

    def get_quote(self):
        if not hasattr(self, 'crumb') or len(self.session.cookies) == 0:
            self.get_crumb()
        now = datetime.utcnow()
        dateto = int(now.timestamp())
        datefrom = int((now - self.dt).timestamp())
        url = self.quote_link.format(quote=self.symbol, dfrom=datefrom, dto=dateto, crumb=self.crumb)
        response = self.session.get(url)
        response.raise_for_status()
        return pd.read_csv(StringIO(response.text), parse_dates=['Date'])

You can use it like this:

df = YahooFinanceHistory('AAPL', days_back=30).get_quote()

