但我想知道如何同时抓取页面(多线程)。
我写了下面的代码,但通过 pool.apply_async 似乎无法调用函数 do_crawl;如果改用 pool.apply,do_crawl 就会被调用。
import gtk
import webkit
from multiprocessing.pool import ThreadPool
class WebView(webkit.WebView):
    """WebKit view that can hand back the markup of the loaded document."""

    def get_html(self):
        """Return the inner HTML of the document element as a string.

        The markup is read out through the page title: a script saves the
        current title, overwrites it with the document's inner HTML, the
        title is read from the main frame, and a second script puts the
        saved title back.
        """
        # Stash the real title in a JS global and replace it with the markup.
        self.execute_script('oldtitle=document.title;document.title=document.documentElement.innerHTML;')
        markup = self.get_main_frame().get_title()
        # Restore the title that was saved by the first script.
        self.execute_script('document.title=oldtitle;')
        return markup
class Crawler(gtk.Window):
    """Window that loads a single URL in a WebView and captures its HTML.

    NOTE(review): crawl() enters gtk.main() on the calling thread and the
    load callback calls gtk.main_quit(). Running several crawlers from
    different threads at once presumably needs extra GTK locking -- confirm
    against the PyGTK threading documentation before relying on it.
    """

    def __init__(self, url):
        """Create the window and remember the URL to fetch.

        Args:
            url: address of the page that crawl() will load.
        """
        gtk.gdk.threads_init()  # suggested by Nicholas Herriot for Ubuntu Koala
        gtk.Window.__init__(self)
        self._url = url
        # Filled in by _finished_loading once the page has loaded.
        self._html = None

    def crawl(self):
        """Load the URL, block until loading finishes, and return the HTML.

        Returns:
            The value produced by WebView.get_html() for the loaded page,
            or None if the load-finished callback never stored a result.
        """
        view = WebView()
        # Register the handler before starting the load so the callback is
        # already in place when the load completes.
        view.connect('load-finished', self._finished_loading)
        view.open(self._url)
        self.add(view)
        # Blocks until _finished_loading calls gtk.main_quit().
        gtk.main()
        return self._html

    def _finished_loading(self, view, frame):
        """Signal handler: keep the page HTML and leave the GTK main loop."""
        # Previously the HTML was fetched and discarded; keep it for crawl().
        self._html = view.get_html()
        gtk.main_quit()
def main(url='http://google.com/', requests=100, workers=10):
    """Crawl *url* repeatedly using a pool of worker threads.

    Args:
        url: address passed to every do_crawl call.
        requests: number of do_crawl tasks to submit.
        workers: number of threads in the pool.

    Raises:
        Whatever exception a do_crawl task raised; it is re-raised here
        when that task's result is collected.
    """
    pool = ThreadPool(workers)
    try:
        # The argument list must be a real tuple: ('text') is just a string,
        # which apply_async would unpack into one argument per character.
        pending = [pool.apply_async(do_crawl, (url,)) for _ in range(requests)]
    finally:
        # join() is only legal after close() (or terminate()).
        pool.close()
    pool.join()
    # apply_async stores worker exceptions on the result object; get()
    # re-raises them so failures are not silently lost.
    for task in pending:
        task.get()
def do_crawl(url):
    """Build a Crawler for *url* and run its crawl() method."""
    Crawler(url).crawl()
# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()