# -*- coding: utf-8 -*- import re import csv import time try: from PySide.QtGui import QApplication from PySide.QtCore import QUrl, QEventLoop, QTimer from PySide.QtWebKit import QWebView except ImportError: from PyQt4.QtGui import QApplication from PyQt4.QtCore import QUrl, QEventLoop, QTimer from PyQt4.QtWebKit import QWebView import lxml.html class BrowserRender(QWebView): def __init__(self, display=True): self.app = QApplication([]) QWebView.__init__(self) if display: self.show() # show the browser def open(self, url, timeout=60): """Wait for download to complete and return result""" loop = QEventLoop() timer = QTimer() timer.setSingleShot(True) timer.timeout.connect(loop.quit) self.loadFinished.connect(loop.quit) self.load(QUrl(url)) timer.start(timeout * 1000) loop.exec_() # delay here until download finished if timer.isActive(): # downloaded successfully timer.stop() return self.html() else: # timed out print 'Request timed out:', url def html(self): """Shortcut to return the current HTML""" return self.page().mainFrame().toHtml() def find(self, pattern): """Find all elements that match the pattern""" return self.page().mainFrame().findAllElements(pattern) def attr(self, pattern, name, value): """Set attribute for matching elements""" for e in self.find(pattern): e.setAttribute(name, value) def text(self, pattern, value): """Set attribute for matching elements""" for e in self.find(pattern): e.setPlainText(value) def click(self, pattern): """Click matching elements""" for e in self.find(pattern): e.evaluateJavaScript("this.click()") def wait_load(self, pattern, timeout=60): """Wait for this pattern to be found in webpage and return matches""" deadline = time.time() + timeout while time.time() < deadline: self.app.processEvents() matches = self.find(pattern) if matches: return matches print 'Wait load timed out' def main(): br = BrowserRender() br.open('http://example.webscraping.com/search') br.attr('#search_term', 'value', '.') br.text('#page_size option:checked', '1000') br.click('#search') elements = br.wait_load('#results a') writer = csv.writer(open('countries.csv', 'w')) for country in [e.toPlainText().strip() for e in elements]: writer.writerow([country]) if __name__ == '__main__': main()
渲染类
最新推荐文章于 2022-11-25 22:08:40 发布