#!/usr/bin/env python
# -*- coding:utf-8 -*-
import threading
import requests
import os
from random import shuffle
from time import sleep
from urlparse import urlparse
from lxml import etree
domain = "https://www.xxx.com"  # crawl root; also used to absolutize root-relative links
piece_count = 3  # number of URLs handed to each worker thread
wait_crawl_urls = [domain]  # seed list of URLs to crawl (shared across threads)
done_urls = []  # URLs already fetched successfully (shared; appended without locking)
error_urls = []  # URLs that responded with a non-200 status (shared; appended without locking)
def main():
    """Entry point: start the threaded crawl of the whole site."""
    process_crawl()
def process_crawl():
    """Split the pending URL list into pieces of ``piece_count`` and crawl
    each piece on its own worker thread, then wait for all of them.

    Fixes vs. original:
    - the range no longer overshoots by ``piece_count``, which used to
      spawn one extra thread with an empty URL slice;
    - started threads are joined, so this function only returns once the
      crawl initiated here has finished.
    """
    threads = []
    for start in range(0, len(wait_crawl_urls), piece_count):
        piece = wait_crawl_urls[start:start + piece_count]
        if not piece:  # defensive: never hand a worker an empty slice
            continue
        worker = threading.Thread(target=handle_divide_urls, args=(piece,))
        worker.start()
        threads.append(worker)
    # Wait for every worker so callers see the crawl as complete.
    for worker in threads:
        worker.join()
def handle_divide_urls(piece_crawl_urls):
    """Crawl every URL in *piece_crawl_urls*, collect newly discovered
    same-site links from the fetched pages, and recurse on those links
    until a pass discovers nothing new.

    Runs inside a worker thread. Reads and writes the module-level lists
    ``done_urls`` and ``wait_crawl_urls`` without any locking.
    NOTE(review): list appends are atomic in CPython, but the membership
    checks below are not — duplicate fetches are possible under races.
    """
    # Links discovered in this pass; crawled recursively at the end.
    temp_piece_crawl_urls = []
    for url in piece_crawl_urls:
        # Skip anything a previous pass already fetched.
        if url in done_urls:
            continue
        response_text = get_url_response(url)
        if None is response_text:
            continue
        # Parse the HTML and pull every anchor href.
        dom_tree = etree.HTML(response_text)
        urls_list = dom_tree.xpath("//a/@href")
        # NOTE(review): this inner loop rebinds ``url``, shadowing the
        # outer loop variable — harmless as written, but fragile.
        for url in urls_list:
            parse_info = urlparse(url)
            parse_path = parse_info.path
            parse_domain = parse_info.netloc
            # Keep only same-site links: root-relative paths ("/...") or
            # links that explicitly name www.xxx.com.
            if (parse_domain == '' and parse_path and parse_path[0] == '/') \
                    or (parse_domain == 'www.xxx.com'):
                # Rebuild an absolute URL from the path.
                wait_crawl_url = domain + parse_path
                # NOTE(review): nothing ever appends to wait_crawl_urls,
                # so this check only filters the seed URL; actual dedup
                # relies on the done_urls check above — confirm intended.
                if wait_crawl_url not in wait_crawl_urls:
                    temp_piece_crawl_urls.append(wait_crawl_url)
    # Recurse on the fresh links found in this pass, if any.
    if temp_piece_crawl_urls:
        handle_divide_urls(temp_piece_crawl_urls)
    return
def get_url_response(url):
    """Fetch *url* and return the response body text.

    Returns None when the URL is filtered out (looks like a static file),
    the request fails at the network level, or the server does not answer
    with HTTP 200. Side effects: appends to the module-level lists
    ``done_urls`` (on success) and ``error_urls`` (on failure).

    Fixes vs. original:
    - ``requests.get`` now has a timeout, so a dead server can no longer
      hang the worker thread forever;
    - network-level exceptions are caught and recorded instead of killing
      the worker thread.
    """
    if not filter(url):  # skip URLs that look like file downloads
        return None
    sleep(1)  # throttle: be polite to the target server
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        error_urls.append(url)
        print('Error %s, exception : %s' % (url, exc))
        return None
    code = response.status_code
    if code != 200:
        error_urls.append(url)
        print ('Error %s, code : %d' % (url, code))
        return None
    print('url : %s, code : %d' % (url, code))
    if url not in done_urls:
        done_urls.append(url)
    # Some framework errors come back as a 200 page containing this marker.
    if u'错误代码' in response.text:
        error_message = "Error: YII 页面错误发现 url : %s" % url
        print(error_message)
        #os.system(r'curl -X POST ...'.format(error_message))
    return response.text
def filter(url):
    """Return True when *url* looks crawlable (an HTML page, not a file).

    Heuristic: take the text after the last dot. The URL is kept when that
    tail is the bare 'com' TLD or still contains a path separator; anything
    else (e.g. '.jpg', '.pdf') is treated as a static file and skipped.

    NOTE: the name intentionally shadows the builtin ``filter`` — kept
    unchanged for compatibility with existing callers in this module.
    """
    tail = url.rsplit('.', 1)[-1]
    return tail == 'com' or '/' in tail
# Script entry point: only start crawling when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()