R: Web Scraping Yahoo.Finance After 2019 Change

R: web scraping yahoo.finance after 2019 change

As mentioned in the comment above, here is an alternative that tries to deal with the different table sizes published. I have worked on this and have had help from a friend.

library(rvest)
library(tidyverse)

url <- "https://finance.yahoo.com/quote/AAPL/financials?p=AAPL"

# Download the page and keep every row container of the financials grid
raw_table <- read_html(url) %>% html_nodes("div.D\\(tbr\\)")

# The number of <span> elements in the first row is used as the table width,
# so the code adapts to the different table sizes published per ticker.
number_of_columns <- raw_table[1] %>% html_nodes("span") %>% length()

if (number_of_columns > 1) {
  # Create an empty data frame with the required dimensions
  df <- data.frame(
    matrix(ncol = number_of_columns, nrow = length(raw_table)),
    stringsAsFactors = FALSE
  )

  # Fill the table looping through rows
  for (i in seq_along(raw_table)) {
    # Find the row name and set it.
    df[i, 1] <- raw_table[i] %>%
      html_nodes("div.Ta\\(start\\)") %>%
      html_text()
    # Now grab the values. Extracting the text once per row means a row with
    # fewer value cells than columns is padded with NA instead of failing.
    row_values <- raw_table[i] %>%
      html_nodes("div.Ta\\(end\\)") %>%
      html_text()
    for (j in seq_len(number_of_columns - 1)) {
      df[i, j + 1] <- row_values[j]
    }
  }
  # df only exists when the table was found, so display it inside the guard
  view(df)
}

How can I extract this specific table from this web page using R?

[QHarr] has answered the same question in this post; I've copied the relevant code from their answer.

library(rvest)
library(stringr)
library(magrittr)

# Fetch the cash-flow page and select one node per statement row
page <- read_html("https://finance.yahoo.com/quote/SKLZ/cash-flow?p=SKLZ")
nodes <- html_nodes(page, ".fi-row")

# Turn each row node into a one-row data frame of its cell texts
row_frames <- lapply(nodes, function(node) {
  cells <- html_text(html_nodes(node, "[title],[data-test='fin-col']"))
  as.data.frame(
    matrix(cells, ncol = length(cells), byrow = TRUE),
    stringsAsFactors = FALSE
  )
})

# Stack the rows; the result is NULL when no row was found
df <- do.call(rbind, row_frames)

df

Adjusting Yahoo Stock Data Web Scraping to Loop over Dates

Consider mapply or its non-simplified wrapper, Map, to iterate elementwise through the pairings of start and end dates and corresponding symbols. Also, avoid the use of assign and get, and build a list of data frames for a final rbind at the end:

library(XML)
...
# Start/end of each download window (presumably Unix epoch seconds; they are
# later sent to Yahoo as period1/period2)
dateGroup <- data.frame(
  start = c(1509519600, 1518159600, 1526799600, 1535439600, 1544079600),
  end = c(1518073200, 1526713200, 1535353200, 1543993200, 1550732400)
)

# CROSS JOIN ALL SYMBOLS WITH EACH DATE PAIRING
# `symbols` must already exist; by = NULL spells out the Cartesian product
# that merge() produces when the inputs share no column names.
dt_grp_sym <- merge(dateGroup, data.frame(symbols), by = NULL)

# DEFINED METHOD FOR HTML PROCESSING
#' Download one symbol's price-history table from Yahoo Finance
#'
#' @param sym Ticker symbol inserted into the quote URL.
#' @param sd Value sent as `period1` (start of the window).
#' @param ed Value sent as `period2` (end of the window).
#' @return A data frame read from the first `<table>` of the page, with the
#'   seven supplied column headers plus a `symbol` column set to `sym`.
proc_html <- function(sym, sd, ed) {
  # paste0() alone renders round numbers such as 1500000000 as "1.5e+09",
  # which corrupts the query string; format them in fixed notation instead.
  fixed <- function(x) format(x, scientific = FALSE, trim = TRUE)

  url <- paste0(
    "https://finance.yahoo.com/quote/", sym, "/history?period1=", fixed(sd),
    "&period2=", fixed(ed), "&interval=1d&filter=history&frequency=1d"
  )
  print(url)

  webpage <- readLines(url, warn = FALSE)
  html <- htmlTreeParse(webpage, useInternalNodes = TRUE, asText = TRUE)
  tableNodes <- getNodeSet(html, "//table")

  # Fail with a descriptive message rather than "subscript out of bounds"
  # when the response contains no table at all.
  if (length(tableNodes) == 0) {
    stop("No <table> found at ", url, call. = FALSE)
  }

  price_table <- readHTMLTable(
    tableNodes[[1]],
    header = c("Date", "Open", "High", "Low", "Close", "Adj. Close", "Volume")
  )
  transform(price_table, symbol = sym)
}

# ITERATE ELEMENTWISE THROUGH EVERY ROW of dt_grp_sym
# Call proc_html() once per row of dt_grp_sym, passing the symbol, start and
# end columns in parallel; SIMPLIFY = FALSE keeps the result a list.
df_list <- mapply(
  proc_html, dt_grp_sym$symbols, dt_grp_sym$start, dt_grp_sym$end,
  SIMPLIFY = FALSE
)

# Stack the per-call data frames into one data frame
final_df <- do.call(rbind, df_list)

To demonstrate using the Class 1 U.S. railroads:

# Tickers used for the demonstration
symbols <- c("UNP", "CSX", "NSC", "CNI", "KSU")

dateGroup <- data.frame(
  start = c(1509519600, 1518159600, 1526799600, 1535439600, 1544079600),
  end = c(1518073200, 1526713200, 1535353200, 1543993200, 1550732400)
)

# The inputs share no column names, so this pairs every date window with
# every symbol.
dt_grp_sym <- merge(dateGroup, data.frame(symbols), by = NULL)

# CALLING SAME ABOVE FUNCTION
df_list <- Map(
  proc_html, dt_grp_sym$symbols, dt_grp_sym$start, dt_grp_sym$end
)
final_df <- do.call(rbind, df_list)

Output

# Show the first rows of final_df separately for each symbol
by(data = final_df, INDICES = final_df$symbol, FUN = head)

# final_df$symbol: CNI
#              Date  Open  High   Low Close Adj..Close    Volume symbol
# 998  Feb 08, 2018 76.08 76.16 74.11 74.45      72.79 1,508,100    CNI
# 999  Feb 07, 2018 76.86 77.23 76.01 76.17      74.48 1,645,400    CNI
# 1000 Feb 06, 2018 76.21 77.42 74.81 77.14      75.42 2,293,300    CNI
# 1001 Feb 05, 2018 78.00 78.70 77.12 77.17      75.45 1,711,000    CNI
# 1002 Feb 02, 2018 79.17 79.24 78.17 78.46      76.71 1,331,400    CNI
# 1003 Feb 01, 2018 79.91 80.54 79.24 79.82      78.04 1,231,500    CNI
# ------------------------------------------------------------------------------ 
# final_df$symbol: CSX
#             Date  Open  High   Low Close Adj..Close     Volume symbol
# 333 Feb 08, 2018 52.91 53.16 50.46 50.47      49.80  7,798,100    CSX
# 334 Feb 07, 2018 53.38 54.36 52.94 52.97      52.26  6,496,200    CSX
# 335 Feb 06, 2018 51.27 54.00 50.12 53.82      53.10 10,563,700    CSX
# 336 Feb 05, 2018 54.89 55.04 51.96 51.99      51.30  9,070,200    CSX
# 337 Feb 02, 2018 56.19 56.35 55.20 55.25      54.51  9,275,800    CSX
# 338 Feb 01, 2018 56.10 57.10 56.04 56.58      55.83  4,079,100    CSX
# ------------------------------------------------------------------------------ 
# final_df$symbol: KSU
#              Date   Open   High    Low  Close Adj..Close    Volume symbol
# 1330 Feb 08, 2018 107.17 107.64 103.50 103.53     102.15 1,434,600    KSU
# 1331 Feb 07, 2018 106.59 108.27 106.59 107.10     105.67 1,326,800    KSU
# 1332 Feb 06, 2018 103.11 108.02 102.07 107.32     105.89 1,459,400    KSU
# 1333 Feb 05, 2018 109.73 110.44 105.12 105.18     103.77 1,272,100    KSU
# 1334 Feb 02, 2018 112.06 112.85 110.03 110.15     108.68 1,051,900    KSU
# 1335 Feb 01, 2018 112.80 114.00 112.17 112.87     111.36 1,011,200    KSU
# ------------------------------------------------------------------------------ 
# final_df$symbol: NSC
#             Date   Open   High    Low  Close Adj..Close    Volume symbol
# 665 Feb 08, 2018 142.62 143.27 136.87 136.89     134.22 2,657,200    NSC
# 666 Feb 07, 2018 142.09 144.45 141.37 142.68     139.89 1,464,500    NSC
# 667 Feb 06, 2018 136.99 143.45 134.55 143.05     140.26 2,455,000    NSC
# 668 Feb 05, 2018 144.74 146.73 138.18 138.61     135.90 2,508,900    NSC
# 669 Feb 02, 2018 147.15 147.85 144.61 145.03     142.20 1,774,600    NSC
# 670 Feb 01, 2018 149.28 150.35 147.90 148.47     145.57 1,427,000    NSC
# ------------------------------------------------------------------------------ 
# final_df$symbol: UNP
#           Date   Open   High    Low  Close Adj..Close     Volume symbol
# 1 Feb 08, 2018 128.70 128.70 124.81 124.86     122.27  6,325,100    UNP
# 2 Feb 07, 2018 130.34 131.82 128.94 128.96     126.29  5,053,000    UNP
# 3 Feb 06, 2018 122.28 131.50 121.50 131.15     128.43 15,734,300    UNP
# 4 Feb 05, 2018 128.59 131.78 124.13 124.14     121.57  6,744,400    UNP
# 5 Feb 02, 2018 131.66 131.73 127.22 129.36     126.68  8,181,200    UNP
# 6 Feb 01, 2018 132.51 133.74 131.86 132.38     129.64  5,597,600    UNP

How to pull financial statements from Yahoo Finance (2019 URL) in VBA

The code works just fine for MSFT, insofar as it works the same way for that ticker as it does for SBUX. The code you linked to is for retrieving balance sheet info for a given ticker.

https://finance.yahoo.com/quote/SBUX/balance-sheet?p=SBUX

https://finance.yahoo.com/quote/MSFT/balance-sheet?p=MSFT

This does not guarantee you can 'lift and shift' this code for use with any of the other tabs e.g. income statement which has the following construction:

https://finance.yahoo.com/quote/MSFT/financials?p=MSFT

You will need to inspect the html of these tabs and see how it differs. There are already existing answers on StackOverflow covering how to obtain the data as shown in the other tabs (and by the different time periods e.g. Quarter).

VBA translation of existing answer. In VBA it would benefit from re-factoring:

Option Explicit

' Downloads the MSFT financials page from Yahoo Finance and writes its
' ".fi-row" rows to Sheet1: headers in row 1, row data from row 2 onward.
' Uses early-bound MSHTML types, so the project needs a reference to the
' Microsoft HTML Object Library.
Public Sub WriteOutFinancialInfo()
    Dim http As Object, s As String

    Set http = CreateObject("MSXML2.XMLHTTP")

    With http
        .Open "GET", "https://finance.yahoo.com/quote/MSFT/financials?p=MSFT", False
        .setRequestHeader "Content-Type", "application/x-www-form-urlencoded"
        .send
        ' Stop on a failed request instead of parsing an error page.
        If .Status <> 200 Then
            Err.Raise vbObjectError + 513, "WriteOutFinancialInfo", _
                      "Request failed with HTTP status " & .Status
        End If
        s = .responseText
    End With

    Dim html As MSHTML.HTMLDocument, html2 As MSHTML.HTMLDocument, re As Object, matches As Object

    Set html = New MSHTML.HTMLDocument: Set html2 = New MSHTML.HTMLDocument
    Set re = CreateObject("VBScript.RegExp")

    html.body.innerHTML = s

    Dim headers(), rows As Object

    headers = Array("Breakdown", "TTM")
    Set rows = html.querySelectorAll(".fi-row")

    ' Without rows, "ReDim results(1 To 0, ...)" below would fail with the
    ' unhelpful run-time error 9, so report the real problem here.
    If rows.Length = 0 Then
        Err.Raise vbObjectError + 514, "WriteOutFinancialInfo", _
                  "No .fi-row elements found in the response"
    End If

    ' Every d/m/yyyy-style date in the raw response becomes an extra header
    ' after "Breakdown" and "TTM".
    With re
        .Global = True
        .MultiLine = True
        .Pattern = "\d{1,2}/\d{1,2}/\d{4}"
        Set matches = .Execute(s)
    End With

    Dim results(), match As Object, r As Long, c As Long, startHeaderCount As Long
    startHeaderCount = UBound(headers)
    ReDim Preserve headers(0 To matches.Count + startHeaderCount)

    c = 1
    For Each match In matches
        headers(startHeaderCount + c) = match.Value
        c = c + 1
    Next

    Dim row As Object
    ReDim results(1 To rows.Length, 1 To UBound(headers) + 1)

    For r = 0 To rows.Length - 1
        ' Re-parse the row on its own so the selector only sees its cells.
        html2.body.innerHTML = rows.Item(r).outerHTML
        Set row = html2.querySelectorAll("[title],[data-test=fin-col]")

        ' A row can hold more cells than there are headers (for example when
        ' no dates were matched). Widen the last dimension, which is the only
        ' one ReDim Preserve may resize, instead of overflowing the array.
        If row.Length > UBound(results, 2) Then
            ReDim Preserve results(1 To rows.Length, 1 To row.Length)
        End If

        For c = 0 To row.Length - 1
            results(r + 1, c + 1) = row.Item(c).innerText
        Next c
    Next

    Dim ws As Worksheet

    Set ws = ThisWorkbook.Worksheets("Sheet1")

    With ws
        .Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
        .Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
    End With
End Sub

Project references:

VBE > Tools > References > Add reference to Microsoft HTML Object Library

Web Scraping information from Yahoo Finance

Copy the JSON below; instead of scraping, just use the JSON array to access the data in your code. You don't need to scrape continuously, as this is old data that is not going to change; therefore the best option is to store it and access it at any time.

Sample JSON object:

 {
   "Date": "12/28/2017",
   "Open": 171,
   "High": 171.850006,
   "Low": 170.479996,
   "Close": 171.080002,
   "Adj Close": 168.549545,
   "Volume": 16480200
 }

I put the entire JSON up for you here:
http://ilanpatao.com/misc/aapl.json

Web scraping from Yahoo Finance: cannot access individual tabs within a web page

That data is already present. It is simply loaded from a script tag when you click. You can regex out the appropriate string and parse it with the json library. You will need to study the JSON to determine which access paths to use. The data is within

data['context']['dispatcher']['stores']['QuoteSummaryStore']

Examples below:

import requests, re, json

# The page state is a JSON object assigned to root.App.main inside a script
# tag; capture everything between the "=" and the final ";" on that line.
p = re.compile(r'root\.App\.main = (.*);')
r = requests.get(
    'https://finance.yahoo.com/quote/GOOG/financials?p=GOOG&.tsrc=fin-srch&guccounter=1',
    timeout=10,  # do not hang forever on a stalled connection
)
# Surface HTTP errors here rather than as a confusing parse failure below.
r.raise_for_status()

found = p.search(r.text)
if found is None:
    # Previously this case showed up as a bare IndexError from findall()[0].
    raise ValueError('root.App.main JSON not found in the response')

data = json.loads(found.group(1))
quote_store = data['context']['dispatcher']['stores']['QuoteSummaryStore']
print(quote_store['earnings']['financialsChart']['quarterly'])
print(quote_store['incomeStatementHistoryQuarterly']['incomeStatementHistory'])

You can quick view the sections:

Sample Image

How to scraping table using html_table in R if there is no table tag?

The discussion at https://stackoverflow.com/questions/58315274/r-web-scraping-yahoo-finance-after-2019-change addresses your issue. Based on the discussion in the link, you can obtain the information as follows for "AAPL":

library(rvest)
library(tidyverse)

tic <- "AAPL"
link <- paste0("https://finance.yahoo.com/quote/", tic, "/financials?p=", tic)

# A plain read_html() is enough here: nothing below navigates or submits
# forms, so no browsing session is needed.
page <- read_html(link)
nodes <- page %>% html_nodes(".fi-row")

# One single-row data frame per statement row, stacked once at the end
# instead of growing `df` with rbind() inside a loop.
row_frames <- lapply(nodes, function(node) {
  cells <- node %>%
    html_nodes("[title],[data-test='fin-col']") %>%
    html_text()
  as.data.frame(
    matrix(cells, ncol = length(cells), byrow = TRUE),
    stringsAsFactors = FALSE
  )
})
df <- do.call(rbind, row_frames)

# Collect the d/m/yyyy dates found in the financials container; they become
# the column names after "Breakdown" and "TTM". The original read them from
# an undefined object `p1`; the parsed page is the intended source.
matches <- str_match_all(
  page %>% html_node("#Col1-1-Financials-Proxy") %>% html_text(),
  "\\d{1,2}/\\d{1,2}/\\d{4}"
)
headers <- c("Breakdown", "TTM", matches[[1]][, 1])
names(df) <- headers

Scraping historical data from Yahoo Finance with Python

I wrote this to get historical data from YF directly from the CSV download link. It needs to make two requests: one to get the cookie and the crumb, and another to get the data. It returns a pandas DataFrame.

import re
from io import StringIO
from datetime import datetime, timedelta

import requests
import pandas as pd

class YahooFinanceHistory:
    """Download recent daily price history for one symbol as a DataFrame.

    Two requests are made on a shared session: the quote page is fetched to
    obtain the cookies and the crumb token, then the CSV download link is
    requested with that crumb.

    Args:
        symbol: Ticker symbol substituted into both URLs.
        days_back: Length of the history window in days, counted back from
            the moment ``get_quote`` is called.
    """

    # Seconds to wait on each HTTP request.
    timeout = 2
    crumb_link = 'https://finance.yahoo.com/quote/{0}/history?p={0}'
    crumble_regex = r'CrumbStore":{"crumb":"(.*?)"}'
    quote_link = 'https://query1.finance.yahoo.com/v7/finance/download/{quote}?period1={dfrom}&period2={dto}&interval=1d&events=history&crumb={crumb}'

    def __init__(self, symbol, days_back=7):
        self.symbol = symbol
        self.session = requests.Session()
        self.dt = timedelta(days=days_back)

    def get_crumb(self):
        """Fetch the quote page and store the crumb token in ``self.crumb``.

        Raises:
            ValueError: If the crumb pattern is not found in the page.
        """
        response = self.session.get(self.crumb_link.format(self.symbol), timeout=self.timeout)
        response.raise_for_status()
        match = re.search(self.crumble_regex, response.text)
        if not match:
            raise ValueError('Could not get crumb from Yahoo Finance')
        self.crumb = match.group(1)

    def get_quote(self):
        """Return the price history of the last ``days_back`` days.

        Returns:
            A pandas DataFrame parsed from the CSV response, with the
            ``Date`` column parsed as dates.
        """
        # Imported locally because the file header only imports datetime and
        # timedelta from this module.
        from datetime import timezone

        if not hasattr(self, 'crumb') or len(self.session.cookies) == 0:
            self.get_crumb()
        # An aware UTC datetime is required: timestamp() on the naive value
        # returned by utcnow() is interpreted as local time, which shifts
        # both epochs by the machine's UTC offset.
        now = datetime.now(timezone.utc)
        dateto = int(now.timestamp())
        datefrom = int((now - self.dt).timestamp())
        url = self.quote_link.format(quote=self.symbol, dfrom=datefrom, dto=dateto, crumb=self.crumb)
        # Apply the same timeout as the crumb request so a stalled download
        # cannot hang indefinitely.
        response = self.session.get(url, timeout=self.timeout)
        response.raise_for_status()
        return pd.read_csv(StringIO(response.text), parse_dates=['Date'])

You can use it like this:

# Build a downloader for AAPL covering the last 30 days, then fetch the data
aapl_history = YahooFinanceHistory('AAPL', days_back=30)
df = aapl_history.get_quote()

R: Web Scraping Yahoo.Finance After 2019 Change