python多线程爬虫测试页面

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import threading
import requests
import os

from random import shuffle
from time import sleep
from urlparse import urlparse
from lxml import etree

# Crawl configuration and shared state.
# NOTE(review): these lists are mutated from multiple worker threads
# without a lock — worst case appears to be a duplicate fetch; confirm
# that is acceptable.
domain              = "https://www.xxx.com"  # site root; discovered paths are joined onto this
piece_count         = 3                      # number of URLs handed to each worker thread
wait_crawl_urls     = [domain]               # frontier: URLs queued for crawling (seeded with the root)
done_urls           = []                     # URLs fetched successfully (status 200)
error_urls          = []                     # URLs that failed or returned a non-200 status

def main():
    """Script entry point: kick off the threaded crawl of the seed URLs."""
    process_crawl()


def process_crawl():
    """Split the pending URL list into chunks of ``piece_count`` and crawl
    each chunk on its own worker thread, then wait for all workers.

    Fixes over the original:
    - the range ran one full step past the end of the list
      (``len(...) + piece_count``), spawning a thread for an empty slice;
    - started threads were never joined, so the process could exit
      before the workers finished crawling.
    """
    workers = []

    for start in range(0, len(wait_crawl_urls), piece_count):
        chunk = wait_crawl_urls[start:start + piece_count]
        if not chunk:
            continue

        worker = threading.Thread(target=handle_divide_urls, args=(chunk,))
        worker.start()
        workers.append(worker)

    # Block until every worker is done so main() does not return mid-crawl.
    for worker in workers:
        worker.join()

                
def handle_divide_urls(piece_crawl_urls):
    """Crawl every URL in *piece_crawl_urls*, harvest same-site links from
    the fetched HTML, and keep crawling newly discovered links until no
    new ones appear.

    Fixes over the original:
    - the inner link loop reused the name ``url``, shadowing the outer
      loop variable;
    - tail recursion on the discovered batch is replaced with a loop, so
      a deep link chain cannot exhaust the recursion limit;
    - newly discovered links are deduplicated within the batch.

    :param piece_crawl_urls: list of absolute URLs to start from.
    """
    pending = list(piece_crawl_urls)

    while pending:
        discovered = []

        for page_url in pending:
            # Skip anything another pass (or thread) already fetched.
            if page_url in done_urls:
                continue

            response_text = get_url_response(page_url)
            if response_text is None:
                continue

            dom_tree = etree.HTML(response_text)

            for href in dom_tree.xpath("//a/@href"):
                parse_info   = urlparse(href)
                parse_path   = parse_info.path
                parse_domain = parse_info.netloc

                # Keep site-relative links ("/foo") and absolute links on
                # our own domain; drop everything else (mailto:, offsite,
                # fragments, ...).
                if (parse_domain == '' and parse_path and parse_path[0] == '/') \
                        or (parse_domain == 'www.xxx.com'):
                    wait_crawl_url = domain + parse_path

                    if wait_crawl_url not in wait_crawl_urls \
                            and wait_crawl_url not in discovered:
                        discovered.append(wait_crawl_url)

        # Next pass crawls only what this pass discovered.
        pending = discovered


def get_url_response(url):
    """Fetch *url* and return the response body text.

    Returns ``None`` when the URL is rejected by ``filter()``, when the
    request fails, or when the status code is not 200.

    Side effects: appends to the module-level ``done_urls`` /
    ``error_urls`` lists and prints progress to stdout.

    Fixes over the original:
    - ``requests.get`` now has a timeout — without one a stalled
      connection hangs the worker thread forever;
    - connection errors are caught and recorded instead of killing the
      worker thread with an unhandled exception.
    """
    if not filter(url):
        return None

    # Throttle so we do not hammer the target site.
    sleep(1)

    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        error_urls.append(url)
        print('Error %s, exception : %s' % (url, exc))
        return None

    code = response.status_code

    if code != 200:
        error_urls.append(url)
        print('Error %s, code : %d' % (url, code))
        return None

    print('url : %s, code : %d' % (url, code))

    if url not in done_urls:
        done_urls.append(url)

    # The target site appears to render YII framework errors as 200
    # pages containing this marker text, so scan the body for it.
    if u'错误代码' in response.text:
        error_message = "Error: YII 页面错误发现 url : %s" % url
        print(error_message)
        #os.system(r'curl -X POST ...'.format(error_message))

    return response.text
    

def filter(url):
    """Decide whether *url* is worth crawling.

    Looks at the text after the last dot: the bare domain (segment is
    exactly "com") and extension-less paths (segment still contains a
    '/') are crawlable; URLs ending in a real file extension such as
    ``.jpg`` or ``.html`` are not.

    NOTE(review): the name shadows the builtin ``filter``; kept as-is
    because get_url_response() calls it by this name.
    """
    last_segment = url.rsplit('.', 1)[-1]
    return last_segment == 'com' or '/' in last_segment


# Run the crawler only when executed as a script, not on import.
if __name__ == '__main__':
    main()

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值