Scrape Multiple URLs Using QWebPage

Scrape multiple urls using QWebPage

The problem with your program is that you are attempting to create a new QApplication with every url you fetch.

Instead, only one QApplication and one WebPage should be created. The WebPage can use its loadFinished signal to create an internal loop by fetching a new url after each one has been processed. Custom html processing can be added by connecting a user-defined slot to a signal which emits the html text and the url when they become available. The scripts below (for PyQt5 and PyQt4) show how to implement this.

Here are some examples which show how to use the WebPage class:

Usage:

def my_html_processor(html, url):
    print('loaded: [%d chars] %s' % (len(html), url))

import sys
app = QApplication(sys.argv)
webpage = WebPage(verbose=False)
webpage.htmlReady.connect(my_html_processor)

# example 1: process list of urls

urls = ['https://en.wikipedia.org/wiki/Special:Random'] * 3
print('Processing list of urls...')
webpage.process(urls)

# example 2: process one url continuously
#
# import signal, itertools
# signal.signal(signal.SIGINT, signal.SIG_DFL)
#
# print('Processing url continuously...')
# print('Press Ctrl+C to quit')
#
# url = 'https://en.wikipedia.org/wiki/Special:Random'
# webpage.process(itertools.repeat(url))

sys.exit(app.exec_())

PyQt5 WebPage:

from PyQt5.QtCore import pyqtSignal, QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEnginePage

class WebPage(QWebEnginePage):
    htmlReady = pyqtSignal(str, str)

    def __init__(self, verbose=False):
        super().__init__()
        self._verbose = verbose
        self.loadFinished.connect(self.handleLoadFinished)

    def process(self, urls):
        self._urls = iter(urls)
        self.fetchNext()

    def fetchNext(self):
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        else:
            self.load(QUrl(url))
            return True

    def processCurrentPage(self, html):
        self.htmlReady.emit(html, self.url().toString())
        if not self.fetchNext():
            QApplication.instance().quit()

    def handleLoadFinished(self):
        self.toHtml(self.processCurrentPage)

    def javaScriptConsoleMessage(self, *args, **kwargs):
        if self._verbose:
            super().javaScriptConsoleMessage(*args, **kwargs)

PyQt4 WebPage:

from PyQt4.QtCore import pyqtSignal, QUrl
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage

class WebPage(QWebPage):
    htmlReady = pyqtSignal(str, str)

    def __init__(self, verbose=False):
        super(WebPage, self).__init__()
        self._verbose = verbose
        self.mainFrame().loadFinished.connect(self.handleLoadFinished)

    def process(self, urls):
        self._urls = iter(urls)
        self.fetchNext()

    def fetchNext(self):
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        else:
            self.mainFrame().load(QUrl(url))
            return True

    def processCurrentPage(self):
        self.htmlReady.emit(
            self.mainFrame().toHtml(), self.mainFrame().url().toString())

    def handleLoadFinished(self):
        self.processCurrentPage()
        if not self.fetchNext():
            QApplication.instance().quit()

    def javaScriptConsoleMessage(self, *args, **kwargs):
        if self._verbose:
            super(WebPage, self).javaScriptConsoleMessage(*args, **kwargs)

If statement using lapply to scrape multiple urls with different nodes/tags

I think the issue is that you should use plain if statements rather than if...else, so that each fallback selector is tried whenever the previous one returns nothing. Here is an easy-to-read version which returns output for all three of the links shared.

library(rvest)

lapply(noticias_semana_lapply[12:14,1], function(x) {
  new_x <- read_html(x) %>% html_nodes(".tittleArticuloOpinion") %>% html_text
  if (length(new_x) == 0)
    new_x <- read_html(x) %>% html_nodes(".nameColumnista") %>% html_text
  if (length(new_x) == 0)
    new_x <- read_html(x) %>% html_nodes(".article-header h2") %>% html_text
  return(new_x)
})

Web Scraping Multiple Links with PyQt / QtWebkit

OK then, if you really need JavaScript. (Can you get the answer from JSON at all? That would probably be easier still with simplejson or json.) The answer is: don't make more than one QApplication; you're not allowed to. Have main create a single QApplication and then use the QWebPage without bothering to call QApplication.exec_(). If that doesn't work, run it all in another QThread.
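
A minimal sketch of that approach, assuming PyQt4 and a plain list of urls: one QApplication is created up front, and a local QEventLoop blocks until each loadFinished signal fires, so QApplication.exec_() is never called. The function and variable names here are illustrative, not taken from the original question.

import sys

from PyQt4.QtCore import QEventLoop, QUrl
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage

def fetch_all(urls):
    # One QApplication for the whole process - never create a second one.
    app = QApplication(sys.argv)
    page = QWebPage()
    results = {}
    for url in urls:
        loop = QEventLoop()
        # Spin a local event loop until the page has finished loading,
        # instead of running QApplication.exec_().
        page.loadFinished.connect(loop.quit)
        page.mainFrame().load(QUrl(url))
        loop.exec_()
        page.loadFinished.disconnect(loop.quit)
        results[url] = page.mainFrame().toHtml()
    return results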

Can't web scrape with PyQt5 more than once

You're initializing QApplication more than once. Only one instance should exist, globally. If you need to get the current instance and do not have a handle to it, you can use QApplication.instance(). QApplication.quit() is meant to be called right before sys.exit(); in fact, you should almost never use one without the other.

In short, you're telling Qt you're exiting the application, and then trying to run more Qt code. It's an easy fix, however...

Solution

You can do one of three things:

Store the app in a global variable and reference it from there:

APP = QApplication(sys.argv)
# ... Many lines ellipsed

class SomeClass(QWidget):
    def some_method(self):
        APP.processEvents(QEventLoop.ExcludeUserInputEvents | QEventLoop.ExcludeSocketNotifiers | QEventLoop.WaitForMoreEvents)

Pass the app as a handle to the class or function (see the sketch after this list).

def render(app, url):
    ...

Create a global instance, and use QApplication.instance().

APP = QApplication(sys.argv)
# ... Many lines ellipsed

class SomeClass(QWidget):
    def some_method(self):
        app = QApplication.instance()
        app.processEvents(QEventLoop.ExcludeUserInputEvents | QEventLoop.ExcludeSocketNotifiers | QEventLoop.WaitForMoreEvents)

Do what's most convenient for you.
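
Here is a minimal sketch of option 2, assuming PyQt5/QtWebEngine: the QApplication created in main is passed into render(), which waits on a local QEventLoop for the asynchronous toHtml() callback. Only the render(app, url) signature comes from the stub above; the rest is illustrative.

import sys

from PyQt5.QtCore import QEventLoop, QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEnginePage

def render(app, url):
    # The app handle is passed in so this function never creates its own
    # QApplication (and never quits the one it was given).
    page = QWebEnginePage()
    loop = QEventLoop()
    result = []

    def handle_html(html):
        result.append(html)
        loop.quit()

    page.loadFinished.connect(lambda ok: page.toHtml(handle_html))
    page.load(QUrl(url))
    loop.exec_()  # block here until handle_html has run
    return result[0]

if __name__ == '__main__':
    app = QApplication(sys.argv)
    html = render(app, 'https://en.wikipedia.org/wiki/Special:Random')
    print('loaded %d chars' % len(html))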

Why does my PyQt code not execute totally when multithreading?

QWebEngineView cannot and should not run on another thread.

Instead, if you want to get the html asynchronously, you should use Qt signals:

from functools import partial
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets

class WebManager(QtCore.QObject):
    def __init__(self, parent=None):
        super(WebManager, self).__init__(parent)
        self.pages = []
        self.results = []

    def load(self, url):
        page = QtWebEngineWidgets.QWebEnginePage(self)
        page.loadFinished.connect(self._on_load_finished)
        self.pages.append(page)
        page.load(QtCore.QUrl(url))

    @QtCore.pyqtSlot(bool)
    def _on_load_finished(self, ok):
        page = self.sender()
        if not isinstance(page, QtWebEngineWidgets.QWebEnginePage):
            return
        if ok:
            wrapper = partial(self.callable, page)
            page.toHtml(wrapper)
        else:
            self.pages.remove(page)
            page.deleteLater()

    def callable(self, page, html):
        self.pages.remove(page)
        url = page.requestedUrl().toString()
        page.deleteLater()
        self.results.append((url, html))
        if not self.pages:
            QtWidgets.QApplication.quit()

if __name__ == "__main__":
    import sys

    app = QtWidgets.QApplication(sys.argv)

    manager = WebManager()

    format_url = "http://pyqt.sourceforge.net/Docs/PyQt5/%s.html"
    for name in dir(QtWebEngineWidgets):
        if name.startswith("Q"):
            url = format_url % name.lower()
            manager.load(url)
    app.exec_()
    for url, html in manager.results:
        print(url)
        print(html)

Scraping Web to Get its contents with PyQt5 and Beautiful Soup in Python

Obtaining the html in QtWebEngine is asynchronous, which is why you get None. Instead, you must wrap "self._func" with functools.partial() so that the url gets passed along with the html:

from functools import partial
import signal
import sys

from bs4 import BeautifulSoup
from bs4.dammit import UnicodeDammit

from PyQt5.QtCore import QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEnginePage as QWebPage

class WebPage(QWebPage):
    def __init__(self):
        QWebPage.__init__(self)
        self.loadFinished.connect(self.handleLoadFinished)

    def process(self, items):
        self._items = iter(items)
        self.fetchNext()

    def fetchNext(self):
        try:
            self._url, self._func = next(self._items)
            self.load(QUrl(self._url))
        except StopIteration:
            return False
        return True

    def handleLoadFinished(self):
        wrapper = partial(self._func, self._url)
        self.toHtml(wrapper)
        if not self.fetchNext():
            print("# processing complete")

def funcA(url, html):
    print("# processing:", url)
    print("html:", html)
    soup = BeautifulSoup(html, "html.parser")

def funcB(url, html):
    print("# processing:", url)
    print("html:", html)
    soup = BeautifulSoup(UnicodeDammit(html).unicode_markup)

items = [
    ("http://stackoverflow.com", funcA),
    ("http://google.com", funcB),
]

def main():
    signal.signal(signal.SIGINT, signal.SIG_DFL)
    print("Press Ctrl+C to quit\n")
    app = QApplication(sys.argv)
    webpage = WebPage()
    webpage.process(items)
    sys.exit(app.exec_())

if __name__ == "__main__":
    main()
