Scrape multiple urls using QWebPage
The problem with your program is that you are attempting to create a new QApplication with every url you fetch.
Instead, only one QApplication and one WebPage should be created. The WebPage can use its loadFinished
signal to create an internal loop by fetching a new url after each one has been processed. Custom html processing can be added by connecting a user-defined slot to a signal which emits the html text and the url when they become available. The scripts below (for PyQt5 and PyQt4) show how to implement this.
Here are some examples which show how to use the WebPage class:
Usage:
import sys


def my_html_processor(html, url):
    """Report how many characters of html were received for *url*."""
    char_count = len(html)
    print('loaded: [%d chars] %s' % (char_count, url))


# One QApplication and one WebPage are shared by every url that is fetched.
app = QApplication(sys.argv)
webpage = WebPage(verbose=False)
# The slot is called with (html, url) each time htmlReady is emitted.
webpage.htmlReady.connect(my_html_processor)

# example 1: process list of urls
urls = ['https://en.wikipedia.org/wiki/Special:Random'] * 3
print('Processing list of urls...')
webpage.process(urls)

# example 2: process one url continuously
#
# import signal, itertools
# signal.signal(signal.SIGINT, signal.SIG_DFL)
#
# print('Processing url continuously...')
# print('Press Ctrl+C to quit')
#
# url = 'https://en.wikipedia.org/wiki/Special:Random'
# webpage.process(itertools.repeat(url))

# Run the event loop and hand its return value to the interpreter on exit.
sys.exit(app.exec_())
PyQt5 WebPage:
from PyQt5.QtCore import pyqtSignal, QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEnginePage
class WebPage(QWebEnginePage):
    """Page that works through a sequence of urls, one load at a time.

    Each time a load finishes, the page html is requested and then emitted
    through ``htmlReady`` together with the page url.  The next url is
    loaded afterwards; when none is left the QApplication is asked to quit.
    """

    # Emitted as (html, url) for every processed page.
    htmlReady = pyqtSignal(str, str)

    def __init__(self, verbose=False):
        """Create the page; *verbose* enables JavaScript console output."""
        super().__init__()
        self._verbose = verbose
        self.loadFinished.connect(self.handleLoadFinished)

    def process(self, urls):
        """Start fetching *urls* (any iterable of url strings)."""
        self._urls = iter(urls)
        self.fetchNext()

    def fetchNext(self):
        """Load the next url; return False when the iterator is exhausted."""
        try:
            target = next(self._urls)
        except StopIteration:
            return False
        self.load(QUrl(target))
        return True

    def processCurrentPage(self, html):
        """Emit *html* with the current url, then continue or quit."""
        current_url = self.url().toString()
        self.htmlReady.emit(html, current_url)
        if self.fetchNext():
            return
        # Nothing left to fetch: stop the application's event loop.
        QApplication.instance().quit()

    def handleLoadFinished(self):
        """Ask for the page html; it is handed to processCurrentPage."""
        self.toHtml(self.processCurrentPage)

    def javaScriptConsoleMessage(self, *args, **kwargs):
        """Forward console messages to the base class only when verbose."""
        if not self._verbose:
            return
        super().javaScriptConsoleMessage(*args, **kwargs)
PyQt4 WebPage:
from PyQt4.QtCore import pyqtSignal, QUrl
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage
class WebPage(QWebPage):
    """Page that works through a sequence of urls, one load at a time.

    Each time the main frame finishes loading, its html and url are emitted
    through ``htmlReady`` and the next url is loaded.  When no url is left
    the QApplication is asked to quit.
    """

    # Emitted as (html, url) for every processed page.
    htmlReady = pyqtSignal(str, str)

    def __init__(self, verbose=False):
        """Create the page; *verbose* enables JavaScript console output."""
        super(WebPage, self).__init__()
        self._verbose = verbose
        self.mainFrame().loadFinished.connect(self.handleLoadFinished)

    def process(self, urls):
        """Start fetching *urls* (any iterable of url strings)."""
        self._urls = iter(urls)
        self.fetchNext()

    # The usage example calls ``process``; ``start`` is kept as an alias so
    # code written against the earlier name keeps working.
    start = process

    def fetchNext(self):
        """Load the next url; return False when the iterator is exhausted."""
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        else:
            self.mainFrame().load(QUrl(url))
            return True

    def processCurrentPage(self):
        """Emit the main frame's html and url, then print a summary line."""
        frame = self.mainFrame()
        # Bind the url once: it is needed both for the signal and for the
        # summary line (which previously referenced an undefined name).
        url = frame.url().toString()
        self.htmlReady.emit(frame.toHtml(), url)
        print('loaded: [%d bytes] %s' % (self.bytesReceived(), url))

    def handleLoadFinished(self):
        """Process the loaded page, then fetch the next url or quit."""
        self.processCurrentPage()
        if not self.fetchNext():
            QApplication.instance().quit()

    def javaScriptConsoleMessage(self, *args, **kwargs):
        """Forward console messages to the base class only when verbose."""
        if self._verbose:
            super(WebPage, self).javaScriptConsoleMessage(*args, **kwargs)
If statement using lapply to scrape multiple urls with different nodes/tags
I think the issue is you should not use if...else
but only if
since using the former would not check for next condition if the first condition is not satisfied. Here is an easy to read version which returns output for all 3 of the links shared.
library(rvest)
lapply(noticias_semana_lapply[12:14, 1], function(x) {
  # Download and parse the page once; every selector below reuses it.
  page <- read_html(x)
  # Candidate selectors, tried in order until one yields text.
  selectors <- c(".tittleArticuloOpinion", ".nameColumnista", ".article-header h2")
  new_x <- character(0)
  for (css in selectors) {
    new_x <- page %>% html_nodes(css) %>% html_text
    if (length(new_x) > 0)
      break
  }
  # Result of the first matching selector, or the empty result of the last one.
  return(new_x)
})
Web Scraping Multiple Links with PyQt / QtWebkit
OK then. If you really need JavaScript. (Can you get the answer from JSON at all? That would probably be easier still with simplejson
or json
.) The answer is don't make more than one QApplication. You're not allowed to. Make main
make a QApplication
and then use the QWebPage
without bothering to call QApplication.exec_()
. If that doesn't work, run it all in another QThread
.
Can't web scrape with PyQt5 more than once
You're initializing QApplication more than once. Only one instance should exist, globally. If you need to get the current instance and do not have a handle to it, you can use QApplication.instance()
. QApplication.quit()
is meant to be called right before sys.exit
, in fact, you should almost never use one without the other.
In short, you're telling Qt you're exiting the application, and then trying to run more Qt code. It's an easy fix, however...
Solution
You can do one of three things:
Store the app in a global variable and reference it from there:
APP = QApplication(sys.argv)
# ... many lines elided


class SomeClass(QWidget):
    def some_method(self):
        """Process pending events through the module-level APP."""
        flags = (
            QEventLoop.ExcludeUserInputEvents
            | QEventLoop.ExcludeSocketNotifiers
            | QEventLoop.WaitForMoreEvents
        )
        APP.processEvents(flags)
Pass the app
as a handle to the class.
def render(app, url):
    # `app` is the existing QApplication, handed in by the caller rather
    # than created here.  The body is a placeholder in this example.
    ...
Create a global instance, and use QApplication.instance()
.
APP = QApplication(sys.argv)
# ... many lines elided


class SomeClass(QWidget):
    def some_method(self):
        """Process pending events through the current QApplication."""
        flags = (
            QEventLoop.ExcludeUserInputEvents
            | QEventLoop.ExcludeSocketNotifiers
            | QEventLoop.WaitForMoreEvents
        )
        # Look the instance up instead of referring to the global directly.
        app = QApplication.instance()
        app.processEvents(flags)
Do what's most convenient for you.
Scrape multiple urls using QWebPage
The problem with your program is that you are attempting to create a new QApplication with every url you fetch.
Instead, only one QApplication and one WebPage should be created. The WebPage can use its loadFinished
signal to create an internal loop by fetching a new url after each one has been processed. Custom html processing can be added by connecting a user-defined slot to a signal which emits the html text and the url when they become available. The scripts below (for PyQt5 and PyQt4) show how to implement this.
Here are some examples which show how to use the WebPage class:
Usage:
import sys


def my_html_processor(html, url):
    """Report how many characters of html were received for *url*."""
    char_count = len(html)
    print('loaded: [%d chars] %s' % (char_count, url))


# One QApplication and one WebPage are shared by every url that is fetched.
app = QApplication(sys.argv)
webpage = WebPage(verbose=False)
# The slot is called with (html, url) each time htmlReady is emitted.
webpage.htmlReady.connect(my_html_processor)

# example 1: process list of urls
urls = ['https://en.wikipedia.org/wiki/Special:Random'] * 3
print('Processing list of urls...')
webpage.process(urls)

# example 2: process one url continuously
#
# import signal, itertools
# signal.signal(signal.SIGINT, signal.SIG_DFL)
#
# print('Processing url continuously...')
# print('Press Ctrl+C to quit')
#
# url = 'https://en.wikipedia.org/wiki/Special:Random'
# webpage.process(itertools.repeat(url))

# Run the event loop and hand its return value to the interpreter on exit.
sys.exit(app.exec_())
PyQt5 WebPage:
from PyQt5.QtCore import pyqtSignal, QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEnginePage
class WebPage(QWebEnginePage):
    """Page that works through a sequence of urls, one load at a time.

    Each time a load finishes, the page html is requested and then emitted
    through ``htmlReady`` together with the page url.  The next url is
    loaded afterwards; when none is left the QApplication is asked to quit.
    """

    # Emitted as (html, url) for every processed page.
    htmlReady = pyqtSignal(str, str)

    def __init__(self, verbose=False):
        """Create the page; *verbose* enables JavaScript console output."""
        super().__init__()
        self._verbose = verbose
        self.loadFinished.connect(self.handleLoadFinished)

    def process(self, urls):
        """Start fetching *urls* (any iterable of url strings)."""
        self._urls = iter(urls)
        self.fetchNext()

    def fetchNext(self):
        """Load the next url; return False when the iterator is exhausted."""
        try:
            target = next(self._urls)
        except StopIteration:
            return False
        self.load(QUrl(target))
        return True

    def processCurrentPage(self, html):
        """Emit *html* with the current url, then continue or quit."""
        current_url = self.url().toString()
        self.htmlReady.emit(html, current_url)
        if self.fetchNext():
            return
        # Nothing left to fetch: stop the application's event loop.
        QApplication.instance().quit()

    def handleLoadFinished(self):
        """Ask for the page html; it is handed to processCurrentPage."""
        self.toHtml(self.processCurrentPage)

    def javaScriptConsoleMessage(self, *args, **kwargs):
        """Forward console messages to the base class only when verbose."""
        if not self._verbose:
            return
        super().javaScriptConsoleMessage(*args, **kwargs)
PyQt4 WebPage:
from PyQt4.QtCore import pyqtSignal, QUrl
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage
class WebPage(QWebPage):
    """Page that works through a sequence of urls, one load at a time.

    Each time the main frame finishes loading, its html and url are emitted
    through ``htmlReady`` and the next url is loaded.  When no url is left
    the QApplication is asked to quit.
    """

    # Emitted as (html, url) for every processed page.
    htmlReady = pyqtSignal(str, str)

    def __init__(self, verbose=False):
        """Create the page; *verbose* enables JavaScript console output."""
        super(WebPage, self).__init__()
        self._verbose = verbose
        self.mainFrame().loadFinished.connect(self.handleLoadFinished)

    def process(self, urls):
        """Start fetching *urls* (any iterable of url strings)."""
        self._urls = iter(urls)
        self.fetchNext()

    # The usage example calls ``process``; ``start`` is kept as an alias so
    # code written against the earlier name keeps working.
    start = process

    def fetchNext(self):
        """Load the next url; return False when the iterator is exhausted."""
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        else:
            self.mainFrame().load(QUrl(url))
            return True

    def processCurrentPage(self):
        """Emit the main frame's html and url, then print a summary line."""
        frame = self.mainFrame()
        # Bind the url once: it is needed both for the signal and for the
        # summary line (which previously referenced an undefined name).
        url = frame.url().toString()
        self.htmlReady.emit(frame.toHtml(), url)
        print('loaded: [%d bytes] %s' % (self.bytesReceived(), url))

    def handleLoadFinished(self):
        """Process the loaded page, then fetch the next url or quit."""
        self.processCurrentPage()
        if not self.fetchNext():
            QApplication.instance().quit()

    def javaScriptConsoleMessage(self, *args, **kwargs):
        """Forward console messages to the base class only when verbose."""
        if self._verbose:
            super(WebPage, self).javaScriptConsoleMessage(*args, **kwargs)
Why does my PyQt code not execute totally when multithreading?
QWebEngineView cannot and should not run on another thread.
Instead if you want to get html asynchronously then you should use the Qt signals:
from functools import partial
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets
class WebManager(QtCore.QObject):
    """Load several urls at once and collect their html.

    Every url gets its own QWebEnginePage.  ``results`` accumulates
    ``(url, html)`` tuples for the pages that loaded successfully, and the
    application is asked to quit as soon as no page is pending any more,
    whether the last one succeeded or failed.
    """

    def __init__(self, parent=None):
        super(WebManager, self).__init__(parent)
        self.pages = []    # pages still waiting for a load result or html
        self.results = []  # (requested url, html) per successful page

    def load(self, url):
        """Start loading *url* in a new page owned by this manager."""
        page = QtWebEngineWidgets.QWebEnginePage(self)
        page.loadFinished.connect(self._on_load_finished)
        self.pages.append(page)
        page.load(QtCore.QUrl(url))

    @QtCore.pyqtSlot(bool)
    def _on_load_finished(self, ok):
        """Request the html of the sending page, or drop it on failure."""
        page = self.sender()
        if not isinstance(page, QtWebEngineWidgets.QWebEnginePage):
            return
        if ok:
            # Bind the page so the html callback knows which page it is for.
            wrapper = partial(self.callable, page)
            page.toHtml(wrapper)
        else:
            # A failed page must also go through the "last page?" check,
            # otherwise the event loop never ends when it was the last one.
            self._release(page)

    def callable(self, page, html):
        """Store *html* for *page* and release the page."""
        url = page.requestedUrl().toString()
        self.results.append((url, html))
        self._release(page)

    def _release(self, page):
        """Forget *page*, schedule its deletion, quit when none is left."""
        self.pages.remove(page)
        page.deleteLater()
        if not self.pages:
            QtWidgets.QApplication.quit()
if __name__ == "__main__":
    import sys

    app = QtWidgets.QApplication(sys.argv)
    manager = WebManager()
    format_url = "http://pyqt.sourceforge.net/Docs/PyQt5/%s.html"
    # Queue one documentation page per Q-prefixed name in QtWebEngineWidgets.
    for name in dir(QtWebEngineWidgets):
        if name.startswith("Q"):
            url = format_url % name.lower()
            manager.load(url)
    # Returns once the manager has asked the application to quit.
    app.exec_()
    for url, html in manager.results:
        print(url)
        print(html)
Scraping Web to Get its contents with PyQt5 and Beautiful Soup in Python
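Obtaining the html in QtWebEngine is asynchronous: toHtml() itself returns None, and the html is delivered later to the callback you give it. So instead of using the return value, pass "self._func" as that callback, wrapped with functools.partial()
to add the url: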
from functools import partial
import signal
import sys
from bs4 import BeautifulSoup
from bs4.dammit import UnicodeDammit
from PyQt5.QtCore import QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEnginePage as QWebPage
class WebPage(QWebPage):
    """Page that loads (url, callback) pairs one after another.

    When a load finishes, the page html is requested and handed to the
    pair's callback as ``callback(url, html)``.
    """

    def __init__(self):
        super().__init__()
        self.loadFinished.connect(self.handleLoadFinished)

    def process(self, items):
        """Start working through *items*, an iterable of (url, func) pairs."""
        self._items = iter(items)
        self.fetchNext()

    def fetchNext(self):
        """Load the next pair's url; return False when none is left."""
        try:
            pair = next(self._items)
            self._url, self._func = pair
            self.load(QUrl(self._url))
        except StopIteration:
            return False
        else:
            return True

    def handleLoadFinished(self):
        """Request the html for the current pair, then move to the next."""
        # partial() captures the current url and callback now, because
        # fetchNext() below overwrites self._url and self._func.
        self.toHtml(partial(self._func, self._url))
        # NOTE(review): the next load is started before the html callback
        # has necessarily run -- confirm this ordering is safe.
        more_pending = self.fetchNext()
        if not more_pending:
            print("# processing complete")
def funcA(url, html):
    """Print *url* and *html*, then parse the html with html.parser."""
    print("# processing:", url)
    print("html:", html)
    # The parsed tree is only built here; this example does not use it.
    soup = BeautifulSoup(markup=html, features="html.parser")
def funcB(url, html):
    """Print *url* and *html*, then parse the UnicodeDammit-decoded html."""
    print("# processing:", url)
    print("html:", html)
    # Name the parser explicitly, as funcA does, rather than leaving the
    # choice to BeautifulSoup.
    markup = UnicodeDammit(html).unicode_markup
    # The parsed tree is only built here; this example does not use it.
    soup = BeautifulSoup(markup, "html.parser")
# Each entry pairs a url with the function that receives (url, html).
items = [
    ("http://stackoverflow.com", funcA),
    ("http://google.com", funcB),
]


def main():
    """Create the application, queue every item and run the event loop."""
    # Use the default SIGINT action so Ctrl+C ends the program.
    signal.signal(signal.SIGINT, signal.SIG_DFL)
    print("Press Ctrl+C to quit\n")
    qt_app = QApplication(sys.argv)
    page = WebPage()
    page.process(items)
    exit_code = qt_app.exec_()
    sys.exit(exit_code)


if __name__ == "__main__":
    main()
Scrape multiple urls using QWebPage
The problem with your program is that you are attempting to create a new QApplication with every url you fetch.
Instead, only one QApplication and one WebPage should be created. The WebPage can use its loadFinished
signal to create an internal loop by fetching a new url after each one has been processed. Custom html processing can be added by connecting a user-defined slot to a signal which emits the html text and the url when they become available. The scripts below (for PyQt5 and PyQt4) show how to implement this.
Here are some examples which show how to use the WebPage class:
Usage:
import sys


def my_html_processor(html, url):
    """Report how many characters of html were received for *url*."""
    char_count = len(html)
    print('loaded: [%d chars] %s' % (char_count, url))


# One QApplication and one WebPage are shared by every url that is fetched.
app = QApplication(sys.argv)
webpage = WebPage(verbose=False)
# The slot is called with (html, url) each time htmlReady is emitted.
webpage.htmlReady.connect(my_html_processor)

# example 1: process list of urls
urls = ['https://en.wikipedia.org/wiki/Special:Random'] * 3
print('Processing list of urls...')
webpage.process(urls)

# example 2: process one url continuously
#
# import signal, itertools
# signal.signal(signal.SIGINT, signal.SIG_DFL)
#
# print('Processing url continuously...')
# print('Press Ctrl+C to quit')
#
# url = 'https://en.wikipedia.org/wiki/Special:Random'
# webpage.process(itertools.repeat(url))

# Run the event loop and hand its return value to the interpreter on exit.
sys.exit(app.exec_())
PyQt5 WebPage:
from PyQt5.QtCore import pyqtSignal, QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEnginePage
class WebPage(QWebEnginePage):
    """Page that works through a sequence of urls, one load at a time.

    Each time a load finishes, the page html is requested and then emitted
    through ``htmlReady`` together with the page url.  The next url is
    loaded afterwards; when none is left the QApplication is asked to quit.
    """

    # Emitted as (html, url) for every processed page.
    htmlReady = pyqtSignal(str, str)

    def __init__(self, verbose=False):
        """Create the page; *verbose* enables JavaScript console output."""
        super().__init__()
        self._verbose = verbose
        self.loadFinished.connect(self.handleLoadFinished)

    def process(self, urls):
        """Start fetching *urls* (any iterable of url strings)."""
        self._urls = iter(urls)
        self.fetchNext()

    def fetchNext(self):
        """Load the next url; return False when the iterator is exhausted."""
        try:
            target = next(self._urls)
        except StopIteration:
            return False
        self.load(QUrl(target))
        return True

    def processCurrentPage(self, html):
        """Emit *html* with the current url, then continue or quit."""
        current_url = self.url().toString()
        self.htmlReady.emit(html, current_url)
        if self.fetchNext():
            return
        # Nothing left to fetch: stop the application's event loop.
        QApplication.instance().quit()

    def handleLoadFinished(self):
        """Ask for the page html; it is handed to processCurrentPage."""
        self.toHtml(self.processCurrentPage)

    def javaScriptConsoleMessage(self, *args, **kwargs):
        """Forward console messages to the base class only when verbose."""
        if not self._verbose:
            return
        super().javaScriptConsoleMessage(*args, **kwargs)
PyQt4 WebPage:
from PyQt4.QtCore import pyqtSignal, QUrl
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage
class WebPage(QWebPage):
    """Page that works through a sequence of urls, one load at a time.

    Each time the main frame finishes loading, its html and url are emitted
    through ``htmlReady`` and the next url is loaded.  When no url is left
    the QApplication is asked to quit.
    """

    # Emitted as (html, url) for every processed page.
    htmlReady = pyqtSignal(str, str)

    def __init__(self, verbose=False):
        """Create the page; *verbose* enables JavaScript console output."""
        super(WebPage, self).__init__()
        self._verbose = verbose
        self.mainFrame().loadFinished.connect(self.handleLoadFinished)

    def process(self, urls):
        """Start fetching *urls* (any iterable of url strings)."""
        self._urls = iter(urls)
        self.fetchNext()

    # The usage example calls ``process``; ``start`` is kept as an alias so
    # code written against the earlier name keeps working.
    start = process

    def fetchNext(self):
        """Load the next url; return False when the iterator is exhausted."""
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        else:
            self.mainFrame().load(QUrl(url))
            return True

    def processCurrentPage(self):
        """Emit the main frame's html and url, then print a summary line."""
        frame = self.mainFrame()
        # Bind the url once: it is needed both for the signal and for the
        # summary line (which previously referenced an undefined name).
        url = frame.url().toString()
        self.htmlReady.emit(frame.toHtml(), url)
        print('loaded: [%d bytes] %s' % (self.bytesReceived(), url))

    def handleLoadFinished(self):
        """Process the loaded page, then fetch the next url or quit."""
        self.processCurrentPage()
        if not self.fetchNext():
            QApplication.instance().quit()

    def javaScriptConsoleMessage(self, *args, **kwargs):
        """Forward console messages to the base class only when verbose."""
        if self._verbose:
            super(WebPage, self).javaScriptConsoleMessage(*args, **kwargs)
Related Topics
Getting the Class Name of an Instance
How to Upgrade All Python Packages with Pip
How to Bind Self Events in Tkinter Text Widget After It Will Binded by Text Widget
"Python" Not Recognized as a Command
Method Resolution Order (Mro) in New-Style Classes
Good Python Modules for Fuzzy String Comparison
Efficient String Matching in Apache Spark
Python Append() VS. + Operator on Lists, Why Do These Give Different Results
Create a Directly-Executable Cross-Platform Gui App Using Python
Comprehensive Beginner's Virtualenv Tutorial
Typeerror: Can't Convert 'Int' Object to Str Implicitly
Importerror: Dll Load Failed: %1 Is Not a Valid Win32 Application. But the Dll's Are There
Unicodedecodeerror: 'Utf8' Codec Can't Decode Byte 0Xa5 in Position 0: Invalid Start Byte
Difference Between an Expression and a Statement in Python
How to .Decode('String-Escape') in Python 3
How to Find the Exact Intersection of a Curve (As Np.Array) with Y==0