How to Scrape Website Content in PHP from a Website That Requires a Cookie Login

How can I scrape website content in PHP from a website that requires a cookie login?

Object-Oriented answer

We implement as much as possible of the previous answer in one class called Browser that should supply the normal navigation features.

Then we should be able to put the site-specific code, in very simple form, in a new derived class that we call, say, FooBrowser, that performs scraping of the site Foo.

The class deriving from Browser must supply a few site-specific functions, such as a path() function that tells Browser where to store site-specific files (the cookie file, for example):

/**
 * Maps a bare file name to its location inside the storage directory
 * reserved for the www.foo.bar site.
 *
 * @param string $basename file name without any directory part
 * @return string the storage directory followed by $basename
 */
function path($basename) {
    $storageDir = '/var/tmp/www.foo.bar/';

    return $storageDir . $basename;
}

/**
 * Minimal cURL-backed "browser" that keeps one cURL handle, a cookie file and
 * the previous URL (sent as Referer) across requests.
 *
 * Site-specific scrapers extend this class and implement path() to say where
 * their files (at least the cookie file) live.
 */
abstract class Browser
{
    /**
     * Navigation options: 'site', 'userAgent' and 'waitTime', merged with
     * whatever extra keys the subclass passes to the constructor.
     *
     * @var array
     */
    private $options = [];

    /**
     * Request state: 'referer' and 'url' (both relative to 'site') and the
     * 'curl' handle.
     *
     * @var array
     */
    private $state = [];

    /**
     * Cookie file path, obtained from path('cookies').
     *
     * @var string
     */
    protected $cookies;

    /**
     * Returns the storage location for a site-specific file.
     *
     * @param string $basename file name, e.g. 'cookies'
     * @return string
     */
    abstract protected function path($basename);

    /**
     * @param string $site    scheme and host that every request URL is appended to
     * @param array  $options overrides/additions for the default options
     * @throws \Exception if a cURL option cannot be set
     */
    public function __construct($site, $options = [])
    {
        $this->cookies = $this->path('cookies');
        $this->options = array_merge(
            [
                'site'      => $site,
                'userAgent' => 'Mozilla/5.0 (Windows NT 5.1; rv:16.0) Gecko/20100101 Firefox/16.0 - LeoScraper',
                // Pause after every request, in microseconds (see usleep() in perform()).
                'waitTime'  => 250000,
            ],
            $options
        );
        $this->state = [
            'referer' => '/',
            'url'     => '',
            'curl'    => null,
        ];
        $this->__wakeup();
    }

    /**
     * Creates a fresh cURL handle and applies the standing options.
     *
     * Called by the constructor; PHP also calls it after unserialize().
     *
     * @throws \Exception if a cURL option cannot be set
     */
    public function __wakeup()
    {
        $this->state['curl'] = curl_init();
        $this->config([
            CURLOPT_USERAGENT      => $this->options['userAgent'],
            // Empty string: advertise every encoding cURL supports.
            CURLOPT_ENCODING       => '',
            // Retrieve the body...
            CURLOPT_NOBODY         => false,
            // ...into the return value of curl_exec()...
            CURLOPT_RETURNTRANSFER => true,
            // ...following redirections...
            CURLOPT_FOLLOWLOCATION => true,
            // ...reasonably.
            CURLOPT_MAXREDIRS      => 5,
            // Read cookies from, and save them to, the same file.
            CURLOPT_COOKIEFILE     => $this->cookies,
            CURLOPT_COOKIEJAR      => $this->cookies,
            // Seconds
            CURLOPT_CONNECTTIMEOUT => 30,
            // Seconds
            CURLOPT_TIMEOUT        => 300,
            // Abort when slower than 16 KiB/s (the limit is in bytes per second)...
            CURLOPT_LOW_SPEED_LIMIT => 16384,
            // ...for this many seconds.
            CURLOPT_LOW_SPEED_TIME => 15,
        ]);
    }

    /**
     * Applies an array of cURL options to the current handle, one by one.
     *
     * @param array $opts CURLOPT_* constant => value
     * @throws \Exception as soon as one option is rejected
     */
    private function config(array $opts = [])
    {
        foreach ($opts as $key => $value) {
            if (true !== curl_setopt($this->state['curl'], $key, $value)) {
                throw new \Exception('Could not set cURL option');
            }
        }
    }

    /**
     * Executes the request that post()/get() prepared.
     *
     * The previously requested URL becomes the Referer of this request, then
     * the method sleeps for 'waitTime' microseconds.
     *
     * @param string $url path (plus optional query string) relative to 'site'
     * @return string|false response body, or false when the transfer failed
     * @throws \Exception if a cURL option cannot be set
     */
    private function perform($url)
    {
        $this->state['referer'] = $this->state['url'];
        $this->state['url']     = $url;
        $this->config([
            CURLOPT_URL     => $this->options['site'] . $this->state['url'],
            CURLOPT_REFERER => $this->options['site'] . $this->state['referer'],
        ]);
        $response = curl_exec($this->state['curl']);
        // Should we ever want to randomize waitTime, do so here.
        usleep($this->options['waitTime']);

        return $response;
    }

    /**
     * Reads a configuration option and optionally replaces it.
     *
     * @param string $key   configuration key name
     * @param mixed  $value new value; leave at the '__DEFAULT__' sentinel to only read
     * @return mixed the value held BEFORE any replacement, or null for an unknown key
     */
    protected function option($key, $value = '__DEFAULT__')
    {
        // Guard the read so that an unknown key yields null without a warning.
        $curr = isset($this->options[$key]) ? $this->options[$key] : null;
        if ('__DEFAULT__' !== $value) {
            $this->options[$key] = $value;
        }

        return $curr;
    }

    /**
     * Performs a POST with URL-encoded form fields.
     *
     * @param string $url    path relative to 'site'
     * @param array  $fields form fields, encoded with http_build_query()
     * @return string|false response body, or false when the transfer failed
     * @throws \Exception if a cURL option cannot be set
     */
    public function post($url, array $fields)
    {
        $this->config([
            CURLOPT_POST       => true,
            CURLOPT_POSTFIELDS => http_build_query($fields),
        ]);

        return $this->perform($url);
    }

    /**
     * Performs a GET.
     *
     * @param string $url    path relative to 'site'
     * @param array  $fields query parameters, appended after '?' when not empty
     * @return string|false response body, or false when the transfer failed
     * @throws \Exception if a cURL option cannot be set
     */
    public function get($url, array $fields = [])
    {
        // The handle is reused, so a previous post() may have left it in POST mode.
        $this->config([CURLOPT_HTTPGET => true]);
        $query = empty($fields) ? '' : '?' . http_build_query($fields);

        return $this->perform($url . $query);
    }
}

Now to scrape FooSite:

/* WWW_FOO_COM requires username and password to construct */

/**
 * Scraper for the Foo ticker site, built on Browser.
 *
 * Constructing it opens the session and visits the login page; call login()
 * afterwards, then scrape() once per ticker.
 */
class WWW_FOO_COM_Browser extends Browser
{
    /**
     * Set to true once login() has posted the credentials.
     *
     * @var bool
     */
    private $loggedIn = false;

    /**
     * @param string $username stored as the 'username' option
     * @param string $password stored as the 'password' option
     */
    public function __construct($username, $password)
    {
        parent::__construct('http://www.foo.bar.baz', [
            'username'  => $username,
            'password'  => $password,
            'waitTime'  => 250000,
            'userAgent' => 'FooScraper',
            'cache'     => true,
        ]);
        // Open the session
        $this->get('/');
        // Navigate to the login page
        $this->get('/login.do');
    }

    /**
     * Tells Browser where this site's files (e.g. the cookie file) live.
     *
     * Browser declares this method abstract and calls it from its
     * constructor, so the class cannot be instantiated without it.
     * NOTE(review): the directory is not created here; make sure it exists.
     *
     * @param string $basename file name, e.g. 'cookies'
     * @return string
     */
    protected function path($basename)
    {
        return '/var/tmp/www.foo.bar/' . $basename;
    }

    /**
     * Posts the stored credentials to the login endpoint.
     *
     * @return bool false when the request itself failed, true otherwise
     */
    public function login()
    {
        $response = $this->post(
            '/ajax/loginPerform',
            [
                'j_un' => $this->option('username'),
                'j_pw' => $this->option('password'),
            ]
        );
        // post() hands back curl_exec()'s result, which is false on a failed transfer.
        if (false === $response) {
            return false;
        }
        // TODO: verify that the response is OK, e.g. that it contains
        // "Welcome <username>", and report bad credentials otherwise.
        $this->loggedIn = true;

        return true;
    }

    /**
     * Fetches the page for one ticker.
     *
     * @param string $entry ticker symbol, sent as the 'ticker' query parameter
     * @return string|false raw response body, or false when the transfer failed
     */
    public function scrape($entry)
    {
        // We could implement caching to avoid scraping the same entry
        // too often. Save $data into path("entry-" . md5($entry))
        // and verify the filemtime of said file, is it newer than time()
        // minus, say, 86400 seconds? If yes, return file_get_contents and
        // leave remote site alone.
        $data = $this->get(
            '/foobars/baz.do',
            [
                'ticker' => $entry,
            ]
        );

        return $data;
    }
}

Now the actual scraping code would be:

    $browser = new WWW_FOO_COM_Browser('lserni', 'mypassword');

// Nothing below makes sense without an authenticated session.
$authenticated = $browser->login();
if (!$authenticated) {
    throw new \Exception("bad user or pass");
}

// www.foo.com is a ticker site, so one request per symbol is all we need.
// Other sites might require a much more complex walk.
$tickers = ['APPL', 'MSFT', 'XKCD'];

foreach ($tickers as $ticker) {
    $html = $browser->scrape($ticker);
    // Parse HTML
}

Mandatory notice: use a suitable parser to get data from raw HTML.

Scrape a site content With a Secure Login

You need to send a POST request to http://aftabcurrency.com/login_script.php,
and your cURL handle also needs to accept cookies.

After the authentication the script will redirect you, so you also need to set CURLOPT_FOLLOWLOCATION.

Here is an edited version of your script. I can't test it on http://aftabcurrency.com/, but I hope it works:

// Logs in by POSTing the form fields, keeping cookies in a file so that the
// session survives the redirect that follows the login.
$url    = 'http://aftabcurrency.com/login_script.php';
$cookie = 'cookies.txt';

// The connect timeout must be shorter than the total timeout, otherwise the
// total timeout always fires first and the connect timeout is meaningless.
$connectTimeout = 10;
$timeout        = 30;

$ch = curl_init();
curl_setopt_array($ch, array(
    CURLOPT_URL            => $url,
    CURLOPT_RETURNTRANSFER => true,
    // Follow the redirect issued after a successful login.
    CURLOPT_FOLLOWLOCATION => true,
    CURLOPT_CONNECTTIMEOUT => $connectTimeout,
    CURLOPT_TIMEOUT        => $timeout,
    // Accept cookies and send them back on later requests.
    CURLOPT_COOKIEJAR      => $cookie,
    CURLOPT_COOKIEFILE     => $cookie,
    CURLOPT_POST           => true,
    // http_build_query() URL-encodes the values, so real credentials with
    // special characters are transmitted correctly.
    CURLOPT_POSTFIELDS     => http_build_query(array(
        'user_name'     => 'user',
        'user_password' => 'pass',
        'passcode'      => 'code',
    )),
));

$result = curl_exec($ch);
if ($result === false) {
    // Report the failure instead of echoing an empty page.
    $error = curl_error($ch);
    curl_close($ch);
    throw new \RuntimeException('Login request failed: ' . $error);
}

/* //OPTIONAL - Fetch another page after login, reusing the same handle
$url = "http://aftabcurrency.com/some_other_page";
curl_setopt($ch, CURLOPT_HTTPGET, true);
curl_setopt($ch, CURLOPT_URL, $url);
$result = curl_exec($ch);
*/ //end OPTIONAL

curl_close($ch);
echo $result;

How to scrape a website that uses session login using php

You should accept the session cookie, store it, and resend it on the next request.

Keeping session alive with Curl and PHP

Need to scrape contents of website that requires an i agree cookie to be set

Use server side script (PHP using cURL) to crawl the website and return the information you need. Make sure you set the appropriate HTTP header with your request that represents the "I agree" cookie.

Sample:

<?php

// Request the page while presenting the consent cookie by hand.
$request = curl_init();

curl_setopt_array($request, array(
    CURLOPT_URL            => 'http://www.example.com/',
    // Sent as the Cookie request header.
    CURLOPT_COOKIE         => 'I_Agree=1',
    // Hand the page back from curl_exec() instead of printing it.
    CURLOPT_RETURNTRANSFER => true,
));

$responseBody = curl_exec($request);

curl_close($request);

// Read the information you need from $responseBody and return it as response body

?>

Now you can access the information from your website by calling your server side script above. For details about how to use cURL take a look at the documentation.

Scraping a site that wants a cookie

Take a look at the various curl_setopt parameters for cookies.

You can use CURLOPT_COOKIE to manually set cookies, or use CURLOPT_COOKIEJAR and a file on disk to actually store and persist cookies across multiple requests.

However, you probably only need session cookies, which the manual says are supported by default -- as long as you use the same curl instance for each request. If you're making a new curl instance for each request, those instances won't share cookies.

Scraping from a website that requires a login?

You will have to go through the required login transaction by sending POST data with your cURL requests. That said, it is a bad idea to scrape data from behind a login - the site didn't put that information in the public for a reason, and for you to do so might constitute copyright infringement.

How to login to website and extract data using PHP

You can use cURL to send post data and headers. To login you need to replicate the exact data exchange between the client and the server.

Check this answer for some examples:

How do I submit POST data using PHP and cURL?

Php : sending cookie in curl Request

scrape a website with secured login

OK, I will share this with you. This is the class I have been using for scraping; feel free to use it.

<?php

/**
 * Logs in to a site by fetching its login page, copying every input of the
 * login form (hidden fields included), filling in the credentials and
 * submitting the result. Cookies are kept in a file between requests.
 */
class Scrape
{
    /** @var string Cookie file read and written by every request. */
    public $cookies = 'cookies.txt';

    /** @var string|null Login name, sent as 'email_address'. */
    private $user = null;

    /** @var string|null Password, sent as 'password'. */
    private $pass = null;

    /* Data generated from cURL */

    /** @var string|false|null Body of the last response (false if the transfer failed). */
    public $content = null;

    /** @var array|null curl_getinfo() result for the last request. */
    public $response = null;

    /* Links; cURL() also stores the effective URL of the last request under 'last_url'. */
    private $url = array(
        'login' => 'https://www.wonatrading.com/account.php',
        'submit' => 'https://www.wonatrading.com/login.php?action=process'
    );

    /* Fields */
    public $data = array();

    /**
     * @param string $user login name
     * @param string $pass password
     */
    public function __construct($user, $pass)
    {
        $this->user = $user;
        $this->pass = $pass;
    }

    /**
     * Fetches the login page and, if the form named 'login' is found, submits
     * it with the credentials filled in.
     *
     * Prints the body of the last response and then terminates the script.
     */
    public function login()
    {
        $this->cURL($this->url['login']);

        $form = $this->getFormFields($this->content, 'login');
        if ($form) {
            $form['email_address'] = $this->user;
            $form['password'] = $this->pass;
            $this->cURL($this->url['submit'], $form);
        }

        echo $this->content;
        exit;
    }

    /**
     * Scans $data for a form whose name starts with $id.
     *
     * @param string $data HTML to search
     * @param string $id   form name to look for
     * @return array|false input name => value pairs of that form, or false if no form matched
     */
    private function getFormFields($data, $id)
    {
        // Quote $id so that regex metacharacters in it are matched literally.
        $pattern = '/(<form.*?name=.?' . preg_quote($id, '/') . '.*?<\/form>)/is';
        if (preg_match($pattern, $data, $matches)) {
            return $this->getInputs($matches[1]);
        }

        return false;
    }

    /**
     * Collects the <input> elements of a form.
     *
     * @param string $form HTML of one form
     * @return array input name => value; '' when the input has no value attribute
     */
    private function getInputs($form)
    {
        $inputs = array();

        if (!preg_match_all('/<input\b[^>]*>/is', $form, $tags)) {
            return $inputs;
        }

        // Attribute value: double-quoted, single-quoted or bare. The branch
        // reset group (?|...) puts whichever alternative matched into group 1.
        $attrValue = '\s*=\s*(?|"([^"]*)"|\'([^\']*)\'|([^\s"\'>]+))';

        foreach ($tags[0] as $tag) {
            // The lookbehind keeps attributes like data-name from matching.
            if (!preg_match('/(?<![\w-])name' . $attrValue . '/i', $tag, $nameMatch)) {
                continue;
            }

            // Use a separate match array: preg_match() overwrites its third
            // argument even when nothing matches, which would otherwise turn
            // the '' default into an empty array.
            $value = '';
            if (preg_match('/(?<![\w-])value' . $attrValue . '/i', $tag, $valueMatch)) {
                $value = $valueMatch[1];
            }

            $inputs[$nameMatch[1]] = $value;
        }

        return $inputs;
    }

    /**
     * Requests $url and records the outcome in $content, $response and
     * $url['last_url'].
     *
     * @param string      $url  absolute URL
     * @param array|false $post form fields to POST; a falsy value sends a GET
     */
    public function cURL($url, $post = false)
    {
        $ch = curl_init();
        curl_setopt_array($ch, array(
            CURLOPT_URL => $url,
            CURLOPT_USERAGENT => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_HEADER => false,
            CURLOPT_FOLLOWLOCATION => true,
            // Credentials travel over this connection, so verify the
            // certificate and that it belongs to the requested host.
            CURLOPT_SSL_VERIFYPEER => true,
            CURLOPT_SSL_VERIFYHOST => 2,
            CURLOPT_COOKIEJAR => $this->cookies,
            CURLOPT_COOKIEFILE => $this->cookies,
            CURLOPT_CONNECTTIMEOUT => 120,
            CURLOPT_TIMEOUT => 120,
        ));

        if ($post) { // if post is needed
            curl_setopt($ch, CURLOPT_POST, true);
            curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($post));
        }

        $this->content = curl_exec($ch);
        $this->response = curl_getinfo($ch);
        $this->url['last_url'] = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
        curl_close($ch);
    }
}

// Credentials are read from the environment so that they are never stored in
// the source file.
$user = getenv('SCRAPE_USER');
$pass = getenv('SCRAPE_PASS');
if ($user === false || $pass === false) {
    throw new \RuntimeException('Set the SCRAPE_USER and SCRAPE_PASS environment variables first.');
}

$sc = new Scrape($user, $pass);
// Prints the page returned after the login attempt, then exits.
$sc->login();

?>


Related Topics



Leave a reply



Submit