threaded_crawler.py # -*- coding: utf-8 -*- import time import threading from downloader import Downloader import urlparse import robotparser import csv import re import lxml SLEEP_TIME = 1 DEFAULT_AGENT = 'wswp' DEFAULT_DELAY = 5 DEFAULT_RETRIES = 1 DEFAULT_TIMEOUT = 60 def threaded_crawler(seed_url,cache=None, delay=DEFAULT_DELAY, user_agent='wswp', proxies=None,num_retries=DEFAULT_RETRIES, timeout=DEFAULT_TIMEOUT, sleep_time=SLEEP_TIME,max_threads=10, scrape_callback=None): crawl_queue = [seed_url] seen = set([seed_url]) D = Downloader(cache=cache, delay=delay, user_agent=user_agent, proxies=proxies, num_retries=num_retries, timeout=timeout) def process_queue(): while True: try: url = crawl_queue.pop() except IndexError: break else: html = D(url) if scrape_callback:
Multi-threaded web crawler
Latest recommended article published at 2021-09-14 20:30:14