---------这个把下载的网页保存在一个文件里---------------------------------------------------
# coding: utf8
import sys
import signal
from optparse import OptionParser
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import QWebPage, QWebView, QWebSettings
class Crawler(QWebPage):
    """Web page object that loads one URL and saves the resulting HTML.

    The page is loaded through QtWebKit; once the ``loadFinished`` signal
    fires, the main frame's HTML is written to a file and the process exits.

    Parameters:
        url:  address handed to ``QUrl`` and loaded into the main frame.
        file: path of the file that receives the HTML.
    """

    def __init__(self, url, file):
        QWebPage.__init__(self)
        self._url = url
        self._file = file

    def crawl(self):
        """Start loading the configured URL.

        Returns right after the load has been requested; the HTML is saved
        later by ``_finished_loading`` when the ``loadFinished`` signal is
        delivered.
        """
        # Use the OS default SIGINT action instead of Python's handler --
        # presumably so Ctrl+C can stop the process while the Qt event loop
        # is running (TODO confirm).
        signal.signal(signal.SIGINT, signal.SIG_DFL)
        self.connect(self, SIGNAL('loadFinished(bool)'), self._finished_loading)
        # To render markup that was fetched beforehand, the frame can be fed
        # with mainFrame().setHtml(html, QUrl(self._url)) instead of load().
        self.mainFrame().load(QUrl(self._url))

    def _finished_loading(self, result):
        """Slot for ``loadFinished(bool)``: save the HTML and exit.

        Parameters:
            result: the boolean carried by the signal; False is treated as
                a failed load.

        The HTML is written even when the load failed (whatever the frame
        holds is kept), but the process then exits with status 1 instead
        of 0 so callers can detect the failure.
        """
        # Convert to a Python unicode string, as html() in the second script
        # does, and encode explicitly so non-ASCII pages are saved as UTF-8.
        html = unicode(self.mainFrame().toHtml()).encode('utf-8')
        # 'with' guarantees the handle is closed even if the write fails.
        with open(self._file, 'wb') as out:
            out.write(html)
        if not result:
            sys.stderr.write('Failed to load %s\n' % self._url)
        sys.exit(0 if result else 1)
def main():
    """Run the crawler as a command-line program.

    Creates the Qt application, reads the command-line options, starts the
    crawl and then hands control to the Qt event loop. The event loop's
    return value becomes the process exit status.
    """
    # The application object is created before the options are parsed;
    # keep this order (NOTE(review): Qt may consume its own switches from
    # sys.argv here -- confirm before reordering).
    qt_app = QApplication(sys.argv)
    opts = get_cmd_options()
    page = Crawler(opts.url, opts.file)
    page.crawl()
    exit_status = qt_app.exec_()
    sys.exit(exit_status)
def get_cmd_options():
    """Parse and validate the command-line options.

    Reads ``sys.argv`` through ``OptionParser`` and requires both
    ``-u/--url`` and ``-f/--file`` to be present and non-empty.

    Returns:
        The optparse values object, with ``url`` and ``file`` attributes.

    Raises:
        SystemExit: with status 1 when a required option is missing; a
            short message is written to stderr first.
    """
    usage = "usage: %prog [options] args"
    parser = OptionParser(usage)
    parser.add_option('-u', '--url', dest='url',
                      help='URL to fetch data from')
    parser.add_option('-f', '--file', dest='file',
                      help='Local file path to save data to')
    # Positional arguments are accepted but not used.
    options, _ = parser.parse_args()

    required = (
        (options.url, 'You must specify an URL.'),
        (options.file, 'You must specify a destination file.'),
    )
    for value, complaint in required:
        if not value:
            # Diagnostics go to stderr so they do not mix with regular
            # output; sys.exit is used because the bare exit() helper is
            # only installed by the site module.
            sys.stderr.write('%s %s --help for more details\n'
                             % (complaint, sys.argv[0]))
            sys.exit(1)
    return options
# Start the crawler only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
---------这个网页加载完成后可以输出---------------------------------------------------
# coding: utf8
import sys
import os
from time import time
import code
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *
class Pekit(object):
    """Windowless QtWebKit page that is driven synchronously from Python.

    A private ``QApplication`` and ``QWebPage`` are created; page loads are
    waited for by pumping the Qt event loop until the page reports that
    loading has finished. JavaScript dialogs are redirected to the console.

    ``_load_status`` tracks the page state:
        'init'   -- nothing loaded yet
        'start'  -- a load is in progress (or js() expects one)
        'end'    -- the last load finished successfully
        'failed' -- the last load finished unsuccessfully
    """

    def __init__(self):
        self.user_agent = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2) Gecko/20100124 Firefox/3.6 (Swiftfox)'
        self.application = QApplication([])
        self.webpage = QWebPage()
        # The QWebPage hooks below are replaced on this one instance rather
        # than through a subclass.
        # NOTE(review): relies on PyQt dispatching these calls to instance
        # attributes -- confirm with the PyQt4 version in use.
        self.webpage.userAgentForUrl = lambda url: self.user_agent
        self.webframe = self.webpage.currentFrame()
        self.webpage.javaScriptAlert = self._alert
        self.webpage.javaScriptConfirm = lambda frame, message: raw_input('==Confirm== (y/n): %s' % message) == 'y'
        self.webpage.javaScriptConsoleMessage = lambda string1, int1, string2: sys.stdout.write('==Message==: %s %d %s\n' % (string1, int1, string2))
        # NOTE(review): returns the typed text (or False) and never fills
        # `result`; confirm this matches the javaScriptPrompt contract.
        self.webpage.javaScriptPrompt = lambda frame, message, defaultValue, result: raw_input('==Prompt== %s:' % message) or False
        self._load_status = 'init'
        self.webpage.connect(self.webpage,
                             SIGNAL('loadFinished(bool)'),
                             self._onLoadFinished)
        self.webpage.connect(self.webpage,
                             SIGNAL('loadStarted()'),
                             self._onLoadStarted)

    def load(self, url):
        """Load ``url``, wait for the load, then inject the jQuery helpers.

        ``jquery.min.js`` and ``jquery.simulate.js`` are read from the
        directory containing this file and evaluated in the page, followed
        by a statement that exposes jQuery as ``_JQ`` in no-conflict mode.

        Raises:
            IOError: if one of the script files cannot be read.
        """
        self.webframe.load(QUrl(url))
        self.wait_load()
        # abspath() keeps the lookup next to this file even when __file__
        # has no directory part (e.g. the script was started from its own
        # directory), where '%s/name' % '' would point at the root.
        jsdir = os.path.dirname(os.path.abspath(__file__))
        pieces = [
            self._read_script(os.path.join(jsdir, name))
            for name in ('jquery.min.js', 'jquery.simulate.js')
        ]
        pieces.append("var _JQ = jQuery.noConflict();")
        # A newline between the sources keeps the last line of one file
        # from running into the first line of the next.
        self.js_noload('\n'.join(pieces))

    @staticmethod
    def _read_script(path):
        """Return the contents of the file at ``path``, closing it afterwards."""
        with open(path, 'r') as handle:
            return handle.read()

    def js(self, js):
        """Evaluate ``js`` in the frame and mark the page as loading.

        The status is set to 'start' before evaluation, so a following
        wait_load() call blocks until a load finishes or its timeout
        expires.
        """
        self._load_status = 'start'
        self.webframe.evaluateJavaScript(js)

    def js_noload(self, js):
        """Evaluate ``js`` in the frame without touching the load status."""
        self.webframe.evaluateJavaScript(js)

    def html(self):
        """Return the frame's current HTML as a unicode string."""
        return unicode(self.webframe.toHtml())

    def wait_load(self, least=0, most=300):
        """Pump Qt events until the current load is over.

        Parameters:
            least: minimum number of seconds to keep processing events,
                even if no load is in progress.
            most:  maximum number of seconds to wait for a load in progress.

        This is a polling loop: it calls processEvents() repeatedly without
        sleeping in between.
        """
        start = time()
        while (self._load_status == 'start' and time() - start < most) or time() - start < least:
            self.application.processEvents()

    def _onLoadFinished(self, status):
        """Slot for ``loadFinished(bool)``: record how the load ended.

        A failed load is recorded as 'failed' instead of being ignored;
        otherwise the status would stay 'start' and wait_load() would keep
        spinning until its ``most`` timeout.
        """
        self._load_status = 'end' if status else 'failed'

    def _onLoadStarted(self):
        """Slot for ``loadStarted()``: mark the page as loading."""
        self._load_status = 'start'

    def _alert(self, frame, message):
        """Replacement for the JavaScript alert dialog: print the message."""
        print("==Alert== %s" % unicode(message))
参考资料: