#!/usr/bin/env python
# -*- coding:utf-8 -*-
import threading
import requests
import os
from random import shuffle
from time import sleep
from urlparse import urlparse
from lxml import etree
domain = "https://www.xxx.com"  # crawl root; also used to absolutize root-relative links
piece_count = 3  # number of URLs handed to each worker thread
wait_crawl_urls = [domain]  # seed list of URLs to crawl (shared across threads)
done_urls = []  # URLs already fetched successfully (shared; appended without locking)
error_urls = []  # URLs that responded with a non-200 status (shared; appended without locking)
def main():
    """Entry point: start the threaded crawl of the whole site."""
    process_crawl()
def process_crawl():
    """Split the pending URL list into pieces of ``piece_count`` and crawl
    each piece on its own worker thread, then wait for all of them.

    Fixes vs. original:
    - the range no longer overshoots by ``piece_count``, which used to
      spawn one extra thread with an empty URL slice;
    - started threads are joined, so this function only returns once the
      crawl initiated here has finished.
    """
    threads = []
    for start in range(0, len(wait_crawl_urls), piece_count):
        piece = wait_crawl_urls[start:start + piece_count]
        if not piece:  # defensive: never hand a worker an empty slice
            continue
        worker = threading.Thread(target=handle_divide_urls, args=(piece,))
        worker.start()
        threads.append(worker)
    # Wait for every worker so callers see the crawl as complete.
    for worker in threads:
        worker.join()
def handle_divide_urls(piece_crawl_urls):
    """Crawl every URL in *piece_crawl_urls*, collect newly discovered
    same-site links from the fetched pages, and recurse on those links
    until a pass discovers nothing new.

    Runs inside a worker thread. Reads and writes the module-level lists
    ``done_urls`` and ``wait_crawl_urls`` without any locking.
    NOTE(review): list appends are atomic in CPython, but the membership
    checks below are not — duplicate fetches are possible under races.
    """
    # Links discovered in this pass; crawled recursively at the end.
    temp_piece_crawl_urls = []
    for url in piece_crawl_urls:
        # Skip anything a previous pass already fetched.
        if url in done_urls:
            continue
        response_text = get_url_response(url)
        if None is response_text:
            continue
        # Parse the HTML and pull every anchor href.
        dom_tree = etree.HTML(response_text)
        urls_list = dom_tree.xpath("//a/@href")
        # NOTE(review): this inner loop rebinds ``url``, shadowing the
        # outer loop variable — harmless as written, but fragile.
        for url in urls_list:
            parse_info = urlparse(url)
            parse_path = parse_info.path
            parse_domain = parse_info.netloc
            # Keep only same-site links: root-relative paths ("/...") or
            # links that explicitly name www.xxx.com.
            if (parse_domain == '' and parse_path and parse_path[0] == '/') \
                    or (parse_domain == 'www.xxx.com'):
                # Rebuild an absolute URL from the path.
                wait_crawl_url = domain + parse_path
                # NOTE(review): nothing ever appends to wait_crawl_urls,
                # so this check only filters the seed URL; actual dedup
                # relies on the done_urls check above — confirm intended.
                if wait_crawl_url not in wait_crawl_urls:
                    temp_piece_crawl_urls.append(wait_crawl_url)
    # Recurse on the fresh links found in this pass, if any.
    if temp_piece_crawl_urls:
        handle_divide_urls(temp_piece_crawl_urls)
    return
def get_url_response(url):
    """Fetch *url* and return the response body text.

    Returns None when the URL is filtered out (looks like a static file),
    the request fails at the network level, or the server does not answer
    with HTTP 200. Side effects: appends to the module-level lists
    ``done_urls`` (on success) and ``error_urls`` (on failure).

    Fixes vs. original:
    - ``requests.get`` now has a timeout, so a dead server can no longer
      hang the worker thread forever;
    - network-level exceptions are caught and recorded instead of killing
      the worker thread.
    """
    if not filter(url):  # skip URLs that look like file downloads
        return None
    sleep(1)  # throttle: be polite to the target server
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        error_urls.append(url)
        print('Error %s, exception : %s' % (url, exc))
        return None
    code = response.status_code
    if code != 200:
        error_urls.append(url)
        print ('Error %s, code : %d' % (url, code))
        return None
    print('url : %s, code : %d' % (url, code))
    if url not in done_urls:
        done_urls.append(url)
    # Some framework errors come back as a 200 page containing this marker.
    if u'错误代码' in response.text:
        error_message = "Error: YII 页面错误发现 url : %s" % url
        print(error_message)
        #os.system(r'curl -X POST ...'.format(error_message))
    return response.text
def filter(url):
    """Return True when *url* looks crawlable (an HTML page, not a file).

    Heuristic: take the text after the last dot. The URL is kept when that
    tail is the bare 'com' TLD or still contains a path separator; anything
    else (e.g. '.jpg', '.pdf') is treated as a static file and skipped.

    NOTE: the name intentionally shadows the builtin ``filter`` — kept
    unchanged for compatibility with existing callers in this module.
    """
    tail = url.rsplit('.', 1)[-1]
    return tail == 'com' or '/' in tail
# Script entry point: only start crawling when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()