Scraping a Site That Requires Login Username and Password on Two Separate Pages

How to scrape a website that requires login first with Python

This works for me:

Method 1
import mechanize
import cookielib
from BeautifulSoup import BeautifulSoup
import html2text

# Browser
br = mechanize.Browser()

# Cookie jar: attached to the browser so cookies received while logging in
# are kept for the later requests made through `br`.
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)

# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

br.addheaders = [('User-agent', 'Chrome')]

# The site we will navigate into, handling its session
br.open('https://github.com/login')

# View available forms (useful for finding the index of the login form)
for f in br.forms():
    print(f)

# Select the second (index one) form (the first form is a search query box)
br.select_form(nr=1)

# User credentials
br.form['login'] = 'mylogin'
br.form['password'] = 'mypass'

# Login
br.submit()

# Fetch a page through the same browser object that submitted the login form
print(br.open('https://github.com/settings/emails').read())

You were not far off at all!

Scraping website with Beautiful Soup that requires login

Go to the login page, enter your username and password, then press F12 to open the developer tools and start recording in the Network tab.

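Then click the login button and copy the login request as cURL, as shown in the first image below. Next, search for a "curl to Python" converter and paste the cURL command into it to get the Python code, as shown in the second image. The generated code is also included below as an example.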

1-
Sample Image

2-
Sample Image

The generated code will look like this:

    import requests

# Cookies copied from the recorded browser login request.
# NOTE(review): the '^' characters in '__utmz' look like cmd.exe escape
# artifacts from a "Copy as cURL (cmd)" export -- confirm before relying on them.
cookies = {
    '__utmt_8254f77d54ec9886070127029a0b81da': '1',
    '_fbp': 'fb.1.1610535613017.434450469',
    '__utmt': '1',
    '_ga': 'GA1.2.1008639424.1610535613',
    '_gid': 'GA1.2.56271763.1610535614',
    '__utma': '28029352.1008639424.1610535613.1610535864.1610535864.1',
    '__utmc': '28029352',
    '__utmz': '28029352.1610535864.1.1.utmcsr=(direct)^|utmccn=(direct)^|utmcmd=(none)',
    '__utmb': '28029352.1.10.1610535864',
    'sat_ppv': '84',
}

# Request headers copied from the recorded browser login request.
headers = {
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'Origin': 'https://www.eurekalert.org',
    'Content-Type': 'application/x-www-form-urlencoded',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/87.0.4280.141 Safari/537.36'
    ),
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/avif,image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3;q=0.9'
    ),
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Sec-Fetch-Dest': 'document',
    'Referer': 'https://www.eurekalert.org/login.php',
    'Accept-Language': 'en-US,en;q=0.9',
}

# Form fields posted to the login endpoint; replace the placeholder
# username/password values with real credentials.
data = {
    'frompage': '^',
    'username': 'Username',
    'password': 'Password',
}

def loginToPage(session=None):
    """Post the login form to eurekalert.org and report whether it succeeded.

    Sends the module-level ``headers``, ``cookies`` and ``data`` to the
    login URL.

    Args:
        session: Optional ``requests`` session to send the request with.
            Pass one in to keep using the cookies set by the login for
            later requests. When omitted, a throwaway session is created,
            as before.

    Returns:
        True if ``response.ok`` is truthy, False otherwise.
    """
    # Without a caller-supplied session, the cookies set by the login
    # response are lost as soon as this function returns.
    if session is None:
        session = requests.session()

    # Perform login
    response = session.post(
        'https://www.eurekalert.org/login.php',
        headers=headers,
        cookies=cookies,
        data=data,
    )

    # NOTE(review): `response.ok` reflects only the HTTP status. A site may
    # answer a rejected login with a successful status, so consider also
    # checking the returned page content -- confirm against the real site.
    if response.ok:
        print(' logged in successfully')
        return True

    print('failed to log in')
    return False

How to scrape a website which requires login using python and beautifulsoup?

You can use mechanize:

import mechanize
from bs4 import BeautifulSoup
import urllib2
import cookielib ## http.cookiejar in python3

# Cookie jar attached to the browser so cookies received during login
# are kept for subsequent requests made through `br`.
cj = cookielib.CookieJar()
br = mechanize.Browser()
br.set_cookiejar(cj)
br.open("https://id.arduino.cc/auth/login/")

# Select the first form on the page.
# NOTE(review): assumes form index 0 is the login form -- confirm with br.forms().
br.select_form(nr=0)
br.form['username'] = 'username'
br.form['password'] = 'password.'
br.submit()

# Print the body of the response to the form submission. The call form of
# print works on both Python 2 and Python 3 for a single argument.
print(br.response().read())

Or urllib - Login to website using urllib2

WEB SCRAPING behind LOGIN(Authentication) in Python

Try this code

from bs4 import BeautifulSoup
import requests

login = 'USERNAME'
password = 'PASSWORD'

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

# Static part of the login form; the hidden fields are filled in below
# from the values found on the login page.
data = {
    'login': login,
    'password': password,
    'js-webauthn-support': 'supported',
    'js-webauthn-iuvpaa-support': 'unsupported',
    'commit': 'Sign in',
}

# Hidden <input> fields whose values are copied from the login page
# into the POST body.
HIDDEN_FIELDS = ('timestamp_secret', 'authenticity_token', 'timestamp')

with requests.session() as sess:
    # Load the login page to read the hidden form fields.
    login_page = sess.get('https://github.com/login')
    html = BeautifulSoup(login_page.text, 'html.parser')

    # Update data
    for field in HIDDEN_FIELDS:
        tag = html.find('input', {'name': field})
        if tag is None:
            # Fail with a clear message instead of an opaque AttributeError.
            raise RuntimeError('login form field not found: %s' % field)
        data[field] = tag.get('value')

    # Login
    sess.post('https://github.com/session', data=data, headers=headers)

    # Check login: look for the <meta name="user-login"> tag on the home page.
    res = sess.get('https://github.com/')
    user_meta = BeautifulSoup(res.text, 'html.parser').find('meta', {'name': 'user-login'})
    try:
        # `find` returns None when the tag is absent, so `.get` raises
        # AttributeError; catch only that rather than every exception.
        username = user_meta.get('content')
    except AttributeError:
        print('Your username or password is incorrect')
    else:
        print("You have successfully logged in as", username)


Related Topics



Leave a reply



Submit