How to scrape a website that requires login first with Python
This works for me:
Method 1
import mechanize
import cookielib
from BeautifulSoup import BeautifulSoup
import html2text
# Browser
br = mechanize.Browser()
# Cookie Jar
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
br.addheaders = [('User-agent', 'Chrome')]
# The site we will navigate into, handling it's session
br.open('https://github.com/login')
# View available forms
for f in br.forms():
print f
# Select the second (index one) form (the first form is a search query box)
br.select_form(nr=1)
# User credentials
br.form['login'] = 'mylogin'
br.form['password'] = 'mypass'
# Login
br.submit()
print(br.open('https://github.com/settings/emails').read())
You were not far off at all!
Scraping website with Beautiful Soup that requires login
go to login page, put your user name and password , press F12 and record from Network tab
then click on login then copy curl as per the below images, then search for curl to python converter and get the code as per second image, also the code will be attached for you as example
1-
2-
and the code will be like this
import requests
cookies = {
'__utmt_8254f77d54ec9886070127029a0b81da': '1',
'_fbp': 'fb.1.1610535613017.434450469',
'__utmt': '1',
'_ga': 'GA1.2.1008639424.1610535613',
'_gid': 'GA1.2.56271763.1610535614',
'__utma': '28029352.1008639424.1610535613.1610535864.1610535864.1',
'__utmc': '28029352',
'__utmz': '28029352.1610535864.1.1.utmcsr=(direct)^|utmccn=(direct)^|utmcmd=(none)',
'__utmb': '28029352.1.10.1610535864',
'sat_ppv': '84',
}
headers = {
'Connection': 'keep-alive',
'Cache-Control': 'max-age=0',
'Upgrade-Insecure-Requests': '1',
'Origin': 'https://www.eurekalert.org',
'Content-Type': 'application/x-www-form-urlencoded',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-User': '?1',
'Sec-Fetch-Dest': 'document',
'Referer': 'https://www.eurekalert.org/login.php',
'Accept-Language': 'en-US,en;q=0.9',
}
data = {
'frompage': '^',
'username': 'Username',
'password': 'Password'
}
def loginToPage():
# Perform login
response = requests.session().post('https://www.eurekalert.org/login.php', headers=headers, cookies=cookies, data=data)
if response.ok:
print(' logged in successfully')
return True
else:
print('failed to log in')
return False
How to scrape a website which requires login using python and beautifulsoup?
You can use mechanize:
import mechanize
from bs4 import BeautifulSoup
import urllib2
import cookielib ## http.cookiejar in python3
cj = cookielib.CookieJar()
br = mechanize.Browser()
br.set_cookiejar(cj)
br.open("https://id.arduino.cc/auth/login/")
br.select_form(nr=0)
br.form['username'] = 'username'
br.form['password'] = 'password.'
br.submit()
print br.response().read()
Or urllib - Login to website using urllib2
WEB SCRAPING behind LOGIN(Authentication) in Python
Try this code
from bs4 import BeautifulSoup
import requests
login = 'USERNAME'
password = 'PASSWORD'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
data = {'login': login,
'password': password, 'js-webauthn-support': 'supported', 'js-webauthn-iuvpaa-support': 'unsupported',
'commit': 'Sign in'}
with requests.session() as sess:
post_data = sess.get('https://github.com/login')
html = BeautifulSoup(post_data.text, 'html.parser')
#Update data
data.update(timestamp_secret = html.find("input", {'name':'timestamp_secret'}).get('value'))
data.update(authenticity_token= html.find("input", {'name':'authenticity_token'}).get('value'))
data.update(timestamp = html.find("input", {'name':'timestamp'}).get('value'))
#Login
res = sess.post("https://github.com/session", data=data, headers=headers)
#Check login
res = sess.get('https://github.com/')
try:
username = BeautifulSoup(res.text, 'html.parser').find('meta', {'name': 'user-login'}).get('content')
except:
print ('Your username or password is incorrect')
else:
print ("You have successfully logged in as", username)
Related Topics
Ruby Error Reading in Certificate File with Openssl
Ruby on Rails - Activerecord::Relation Count Method Is Wrong
Ruby -- Capitalize First Letter of Every Sentence in a Paragraph
Get All Keys in Hash with Same Value
Rails S Return: [Bug] Segmentation Fault
Rails - X-Sendfile + Temporary Files
When to Use Association Extensions VS Named Scopes
Check If Folder Exist in S3 Bucket
What's Wrong with the Square and Rectangle Inheritance
Rotate Bits Right Operation in Ruby
Routing Error No Route Matches [Get] "/Static_Pages/Home", Tutorial
Shortening Socket Timeout Using Timeout::Timeout(N) Does Not Seem to Work for Me
How to Replace the Characters in a String
Fresh Install of Rails and Getting Openssl Errors: "Already Initialized Constant Openssl"