R: web scraping yahoo.finance after 2019 change
As mentioned in the comment above, here is an alternative that tries to deal with the different table sizes published. I have worked on this and have had help from a friend.
library(rvest)
library(tidyverse)
# Scrape the financials table for AAPL from Yahoo Finance into `df`.
# The page has no <table> tag, so rows are located through their CSS classes.
url <- "https://finance.yahoo.com/quote/AAPL/financials?p=AAPL"

# Download the data: every div carrying the D(tbr) class is one table row.
raw_table <- read_html(url) %>% html_nodes("div.D\\(tbr\\)")

# The number of <span> elements in the first row gives the column count.
number_of_columns <- raw_table[1] %>% html_nodes("span") %>% length()

if (number_of_columns > 1) {
  # Create an empty data frame with the required dimensions.
  df <- data.frame(
    matrix(ncol = number_of_columns, nrow = length(raw_table)),
    stringsAsFactors = FALSE
  )

  # Fill the table looping through rows. seq_along()/seq_len() are used
  # instead of 1:n so that an empty range yields zero iterations.
  for (i in seq_along(raw_table)) {
    # Find the row name and set it.
    df[i, 1] <- raw_table[i] %>%
      html_nodes("div.Ta\\(start\\)") %>%
      html_text()

    # Now grab the values.
    row_values <- raw_table[i] %>% html_nodes("div.Ta\\(end\\)")
    for (j in seq_len(number_of_columns - 1)) {
      df[i, j + 1] <- row_values[j] %>% html_text()
    }
  }

  # `df` only exists when the table was found, so view it inside the guard.
  view(df)
}
How can I extract this specific table from this web page using R?
[QHarr] has answered the same question in this post; I've copied the relevant code from their answer.
library(rvest)
library(stringr)
library(magrittr)
# Fetch the cash-flow page and select every node with the "fi-row" class.
page <- read_html("https://finance.yahoo.com/quote/SKLZ/cash-flow?p=SKLZ")
nodes <- html_nodes(page, ".fi-row")

# Convert each row node into a one-row data frame and stack them into `df`.
df <- NULL
for (node in nodes) {
  # Text of the row's title cell(s) and of its "fin-col" value cells.
  cells <- html_text(html_nodes(node, "[title],[data-test='fin-col']"))
  row_df <- as.data.frame(
    matrix(cells, ncol = length(cells), byrow = TRUE),
    stringsAsFactors = FALSE
  )
  df <- rbind(df, row_df)
}
df
Adjusting Yahoo Stock Data Web Scraping to Loop over Dates
Consider `mapply` or its non-simplified wrapper, `Map`, to iterate elementwise through the pairings of start and end dates and corresponding symbols. Also, avoid the use of `assign` and `get`, and build a list of data frames for a final `rbind` at the end:
library(XML)
...
# Paired period boundaries: row k of `dateGroup` is one (start, end) window.
dateGroup <- data.frame(
  start = c(1509519600, 1518159600, 1526799600, 1535439600, 1544079600),
  end   = c(1518073200, 1526713200, 1535353200, 1543993200, 1550732400)
)

# CROSS JOIN ALL SYMBOLS WITH EACH DATE PAIRING
# (the two frames share no column, so merge() returns every combination)
dt_grp_sym <- merge(x = dateGroup, y = data.frame(symbols = symbols), by = NULL)
# DEFINED METHOD FOR HTML PROCESSING
#
# Builds the Yahoo Finance history URL for `sym` between `sd` and `ed`,
# prints it, downloads the page, and converts the first <table> on the page
# into a data frame with an extra `symbol` column holding `sym`.
proc_html <- function(sym, sd, ed) {
  history_url <- paste0(
    'https://finance.yahoo.com/quote/', sym,
    '/history?period1=', sd,
    '&period2=', ed,
    '&interval=1d&filter=history&frequency=1d'
  )
  print(history_url)

  page_lines <- readLines(history_url, warn = FALSE)
  doc <- htmlTreeParse(page_lines, useInternalNodes = TRUE, asText = TRUE)
  first_table <- getNodeSet(doc, "//table")[[1]]

  column_names <- c("Date", "Open", "High", "Low",
                    "Close", "Adj. Close", "Volume")
  transform(
    readHTMLTable(first_table, header = column_names),
    symbol = sym
  )
}
# ITERATE ELEMENTWISE THROUGH EVERY ROW of dt_grp_sym
# Each call receives the symbol, start and end taken from the same row.
df_list <- with(dt_grp_sym, Map(proc_html, symbols, start, end))

# Stack the per-call data frames into a single data frame.
final_df <- do.call(what = rbind, args = df_list)
To demonstrate using the Class 1 U.S. railroads:
symbols <- c("UNP", "CSX", "NSC", "CNI", "KSU")

# Paired period boundaries: row k is one (start, end) window.
dateGroup <- data.frame(
  start = c(1509519600, 1518159600, 1526799600, 1535439600, 1544079600),
  end   = c(1518073200, 1526713200, 1535353200, 1543993200, 1550732400)
)

# Every symbol combined with every window (the frames share no column).
dt_grp_sym <- merge(x = dateGroup, y = data.frame(symbols = symbols), by = NULL)

# CALLING SAME ABOVE FUNCTION
df_list <- Map(
  proc_html,
  dt_grp_sym$symbols,
  dt_grp_sym$start,
  dt_grp_sym$end
)
final_df <- do.call(what = rbind, args = df_list)
Output
# Show the first rows of `final_df` separately for each symbol.
by(data = final_df, INDICES = final_df$symbol, FUN = head)
# final_df$symbol: CNI
# Date Open High Low Close Adj..Close Volume symbol
# 998 Feb 08, 2018 76.08 76.16 74.11 74.45 72.79 1,508,100 CNI
# 999 Feb 07, 2018 76.86 77.23 76.01 76.17 74.48 1,645,400 CNI
# 1000 Feb 06, 2018 76.21 77.42 74.81 77.14 75.42 2,293,300 CNI
# 1001 Feb 05, 2018 78.00 78.70 77.12 77.17 75.45 1,711,000 CNI
# 1002 Feb 02, 2018 79.17 79.24 78.17 78.46 76.71 1,331,400 CNI
# 1003 Feb 01, 2018 79.91 80.54 79.24 79.82 78.04 1,231,500 CNI
# ------------------------------------------------------------------------------
# final_df$symbol: CSX
# Date Open High Low Close Adj..Close Volume symbol
# 333 Feb 08, 2018 52.91 53.16 50.46 50.47 49.80 7,798,100 CSX
# 334 Feb 07, 2018 53.38 54.36 52.94 52.97 52.26 6,496,200 CSX
# 335 Feb 06, 2018 51.27 54.00 50.12 53.82 53.10 10,563,700 CSX
# 336 Feb 05, 2018 54.89 55.04 51.96 51.99 51.30 9,070,200 CSX
# 337 Feb 02, 2018 56.19 56.35 55.20 55.25 54.51 9,275,800 CSX
# 338 Feb 01, 2018 56.10 57.10 56.04 56.58 55.83 4,079,100 CSX
# ------------------------------------------------------------------------------
# final_df$symbol: KSU
# Date Open High Low Close Adj..Close Volume symbol
# 1330 Feb 08, 2018 107.17 107.64 103.50 103.53 102.15 1,434,600 KSU
# 1331 Feb 07, 2018 106.59 108.27 106.59 107.10 105.67 1,326,800 KSU
# 1332 Feb 06, 2018 103.11 108.02 102.07 107.32 105.89 1,459,400 KSU
# 1333 Feb 05, 2018 109.73 110.44 105.12 105.18 103.77 1,272,100 KSU
# 1334 Feb 02, 2018 112.06 112.85 110.03 110.15 108.68 1,051,900 KSU
# 1335 Feb 01, 2018 112.80 114.00 112.17 112.87 111.36 1,011,200 KSU
# ------------------------------------------------------------------------------
# final_df$symbol: NSC
# Date Open High Low Close Adj..Close Volume symbol
# 665 Feb 08, 2018 142.62 143.27 136.87 136.89 134.22 2,657,200 NSC
# 666 Feb 07, 2018 142.09 144.45 141.37 142.68 139.89 1,464,500 NSC
# 667 Feb 06, 2018 136.99 143.45 134.55 143.05 140.26 2,455,000 NSC
# 668 Feb 05, 2018 144.74 146.73 138.18 138.61 135.90 2,508,900 NSC
# 669 Feb 02, 2018 147.15 147.85 144.61 145.03 142.20 1,774,600 NSC
# 670 Feb 01, 2018 149.28 150.35 147.90 148.47 145.57 1,427,000 NSC
# ------------------------------------------------------------------------------
# final_df$symbol: UNP
# Date Open High Low Close Adj..Close Volume symbol
# 1 Feb 08, 2018 128.70 128.70 124.81 124.86 122.27 6,325,100 UNP
# 2 Feb 07, 2018 130.34 131.82 128.94 128.96 126.29 5,053,000 UNP
# 3 Feb 06, 2018 122.28 131.50 121.50 131.15 128.43 15,734,300 UNP
# 4 Feb 05, 2018 128.59 131.78 124.13 124.14 121.57 6,744,400 UNP
# 5 Feb 02, 2018 131.66 131.73 127.22 129.36 126.68 8,181,200 UNP
# 6 Feb 01, 2018 132.51 133.74 131.86 132.38 129.64 5,597,600 UNP
How to pull financial statements from Yahoo Finance (2019 URL) in VBA
The code works just fine for MSFT, insofar as it works the same way for that ticker as it does for SBUX. The code you linked to is for retrieving balance sheet info for a given ticker.
https://finance.yahoo.com/quote/SBUX/balance-sheet?p=SBUX
or
https://finance.yahoo.com/quote/MSFT/balance-sheet?p=MSFT
This does not guarantee you can 'lift and shift' this code for use with any of the other tabs e.g. income statement which has the following construction:
https://finance.yahoo.com/quote/MSFT/financials?p=MSFT
You will need to inspect the html of these tabs and see how it differs. There are already existing answers on StackOverflow covering how to obtain the data as shown in the other tabs (and by the different time periods e.g. Quarter).
VBA translation of existing answer. In VBA it would benefit from re-factoring:
Option Explicit
'Downloads the MSFT financials page from Yahoo Finance, extracts the text of
'every ".fi-row" element into a 2-D array, and writes a header row plus the
'extracted rows to "Sheet1" of this workbook, starting at cell A1.
'Uses early-bound MSHTML.HTMLDocument objects.
Public Sub WriteOutFinancialInfo()
Dim http As Object, s As String
Set http = CreateObject("MSXML2.XMLHTTP")
'Fetch the page and keep the raw response text in s.
With http
'NOTE(review): the third argument (False) presumably makes the request synchronous - confirm.
.Open "GET", "https://finance.yahoo.com/quote/MSFT/financials?p=MSFT", False
.setRequestHeader "Content-Type", "application/x-www-form-urlencoded"
.send
s = .responseText
End With
Dim html As MSHTML.HTMLDocument, html2 As MSHTML.HTMLDocument, re As Object, matches As Object
'html holds the whole page; html2 is a scratch document reused for one row at a time.
Set html = New MSHTML.HTMLDocument: Set html2 = New MSHTML.HTMLDocument
Set re = CreateObject("VBScript.RegExp")
html.body.innerHTML = s
Dim headers(), rows As Object
'The first two column headers are fixed; the remaining ones are appended from the regex matches below.
headers = Array("Breakdown", "TTM")
Set rows = html.querySelectorAll(".fi-row")
'Collect every date-like string (1-2 digits/1-2 digits/4 digits) found anywhere in the response text.
With re
.Global = True
.MultiLine = True
.Pattern = "\d{1,2}/\d{1,2}/\d{4}"
Set matches = .Execute(s)
End With
Dim results(), match As Object, r As Long, c As Long, startHeaderCount As Long
'Grow headers so that one slot per regex match follows the two fixed entries.
startHeaderCount = UBound(headers)
ReDim Preserve headers(0 To matches.Count + startHeaderCount)
c = 1
For Each match In matches
'Assigning the match object without Set stores its default property - presumably the matched text; confirm.
headers(startHeaderCount + c) = match
c = c + 1
Next
Dim row As Object
'results is 1-based: one row per ".fi-row" element, one column per header.
'NOTE(review): if no ".fi-row" element is found, the upper bound below is 0 - confirm how ReDim behaves in that case.
ReDim results(1 To rows.Length, 1 To UBound(headers) + 1)
For r = 0 To rows.Length - 1
'Load this row's HTML into the scratch document so the selector only sees cells of this row.
html2.body.innerHTML = rows.Item(r).outerHTML
Set row = html2.querySelectorAll("[title],[data-test=fin-col]")
'NOTE(review): a row with more matching cells than headers would index past the second dimension of results - confirm.
For c = 0 To row.Length - 1
results(r + 1, c + 1) = row.Item(c).innerText
Next c
Next
Dim ws As Worksheet
Set ws = ThisWorkbook.Worksheets("Sheet1")
'Write the header row to row 1 and the data block starting at row 2.
With ws
.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End With
End Sub
Project references:
VBE > Tools > References > Add reference to Microsoft HTML Object Library
Web Scraping information from Yahoo Finance
Copy the JSON below; instead of scraping, just use the JSON array to access the data in your code. You don't need to scrape continuously, as this is old data that is not going to change; therefore the best option is to store it and access it anytime.
Sample JSON object:
{
"Date": "12/28/2017",
"Open": 171,
"High": 171.850006,
"Low": 170.479996,
"Close": 171.080002,
"Adj Close": 168.549545,
"Volume": 16480200
}
I put the entire JSON up for you here:
http://ilanpatao.com/misc/aapl.json
Web scraping from Yahoo Finance: cannot access individual tabs within a web page
That data is already present. It is simply loaded from a script tag when you click. You can regex out the appropriate string and parse with json library. You will need to study the json to determine which access paths to use. The data is within
data['context']['dispatcher']['stores']['QuoteSummaryStore']
Examples below:
import requests, re, json
# Download the financials page for GOOG.
page_url = 'https://finance.yahoo.com/quote/GOOG/financials?p=GOOG&.tsrc=fin-srch&guccounter=1'
response = requests.get(page_url)

# Capture the text assigned to `root.App.main` in the page and parse it as JSON.
app_main_pattern = re.compile(r'root\.App\.main = (.*);')
embedded_json = app_main_pattern.findall(response.text)[0]
data = json.loads(embedded_json)

# Walk down to the QuoteSummaryStore and print two of its sections.
stores = data['context']['dispatcher']['stores']
quote_store = stores['QuoteSummaryStore']
print(quote_store['earnings']['financialsChart']['quarterly'])
print(quote_store['incomeStatementHistoryQuarterly']['incomeStatementHistory'])
You can quickly view the sections:
How to scraping table using html_table in R if there is no table tag?
The discussion at https://stackoverflow.com/questions/58315274/r-web-scraping-yahoo-finance-after-2019-change addresses your issue. Based on the discussion in the link, you can obtain the information as follows for "AAPL":
library(rvest)
library(tidyverse)
# Scrape the financials table for one ticker into `df`, with column names
# built from the dates found on the page.
tic <- "AAPL"
link <- "https://finance.yahoo.com/quote/"
link <- paste0(link, tic, "/financials?p=", tic)

# Open the page; `p` is the object every selector below is applied to.
wahis.session <- html_session(link)
p <- wahis.session

# One node per table row.
nodes <- p %>% html_nodes(".fi-row")

# Build one single-row data frame per node, then bind them all at once
# (instead of growing `df` with rbind() inside a loop).
row_frames <- lapply(nodes, function(node) {
  cells <- node %>%
    html_nodes("[title],[data-test='fin-col']") %>%
    html_text()
  as.data.frame(
    matrix(cells, ncol = length(cells), byrow = TRUE),
    stringsAsFactors = FALSE
  )
})
df <- do.call(rbind, row_frames)

# Column headers: two fixed labels followed by every d/m/yyyy-style date found
# in the financials container. The original read from an undefined `p1`;
# the page object is `p`.
matches <- str_match_all(
  p %>% html_node("#Col1-1-Financials-Proxy") %>% html_text(),
  "\\d{1,2}/\\d{1,2}/\\d{4}"
)
headers <- c("Breakdown", "TTM", matches[[1]][, 1])
names(df) <- headers
Scraping historical data from Yahoo Finance with Python
I wrote this to get historical data from YF directly from the download CSV link. It needs to make two requests: one to get the cookie and the crumb, and another one to get the data. It returns a pandas DataFrame.
import re
from io import StringIO
from datetime import datetime, timedelta
import requests
import pandas as pd
class YahooFinanceHistory:
    """Download recent daily price history for one symbol as a DataFrame.

    Two requests are made on the same session: one to the history page, to
    obtain the cookie and the "crumb" token, and one to the CSV download
    link, which is parsed with pandas.

    Args:
        symbol: Ticker symbol inserted into both URLs.
        days_back: Length of the requested window, in days, ending now.
    """

    # Timeout in seconds passed to every request made by this class.
    timeout = 2
    crumb_link = 'https://finance.yahoo.com/quote/{0}/history?p={0}'
    crumble_regex = r'CrumbStore":{"crumb":"(.*?)"}'
    quote_link = 'https://query1.finance.yahoo.com/v7/finance/download/{quote}?period1={dfrom}&period2={dto}&interval=1d&events=history&crumb={crumb}'

    def __init__(self, symbol, days_back=7):
        self.symbol = symbol
        self.session = requests.Session()
        self.dt = timedelta(days=days_back)

    def get_crumb(self):
        """Fetch the history page and store the crumb token in ``self.crumb``.

        Raises:
            ValueError: If the crumb pattern is not found in the page.
            Whatever ``raise_for_status()`` raises on an HTTP error status.
        """
        response = self.session.get(self.crumb_link.format(self.symbol), timeout=self.timeout)
        response.raise_for_status()
        match = re.search(self.crumble_regex, response.text)
        if not match:
            raise ValueError('Could not get crumb from Yahoo Finance')
        self.crumb = match.group(1)

    def get_quote(self):
        """Download the CSV for the last ``days_back`` days.

        Returns:
            A pandas DataFrame read from the CSV, with ``Date`` parsed as dates.
        """
        # Local import: the file's top-level import only brings in
        # `datetime` and `timedelta`.
        from datetime import timezone

        # The crumb is only valid together with the session cookie, so fetch
        # both again if either is missing.
        if not hasattr(self, 'crumb') or len(self.session.cookies) == 0:
            self.get_crumb()

        # Use an aware UTC datetime. A naive `utcnow()` value is treated as
        # local time by `timestamp()`, which shifts both period bounds by the
        # machine's UTC offset.
        now = datetime.now(timezone.utc)
        dateto = int(now.timestamp())
        datefrom = int((now - self.dt).timestamp())
        url = self.quote_link.format(quote=self.symbol, dfrom=datefrom, dto=dateto, crumb=self.crumb)
        # Same timeout as the crumb request, so a stalled download cannot hang.
        response = self.session.get(url, timeout=self.timeout)
        response.raise_for_status()
        return pd.read_csv(StringIO(response.text), parse_dates=['Date'])
You can use it like this:
# Request the last 30 days of AAPL history as a DataFrame.
history = YahooFinanceHistory('AAPL', days_back=30)
df = history.get_quote()
Related Topics
Sum of Two Columns of Data Frame with Na Values
Filter a Vector of Strings Based on String Matching
Convert Latitude and Longitude Coordinates to Country Name in R
Remove Part of a String in Dataframe Column (R)
Avoid Scientific Notation in Cut Function in R
Replacing Values in a Column with Another Column R
Explicitly Set Panel Size (Not Just Plot Size) in Ggplot2
Add Colored Arrow to Axis of Ggplot2 (Partially Outside Plot Region)
Twitter Data Analysis - Error in Term Document Matrix
R: Updating a Data Frame with Another Data Frame
Shiny - Observe() Triggered by Dynamicaly Generated Inputs
Subset Data Frame Using Row Names
Convert a Dataframe to an Object of Class "Dist" Without Actually Calculating Distances in R
How to Split an Igraph into Connected Subgraphs
How to Read Geojson or Topojson File in R to Draw a Choropleth Map