Single-threaded proxy crawling with no delay between requests

Target site under test: jd

Proxy source site: xici

The proxy is switched every 2000 requests. On startup the program loads the saved proxies into a list and pop()s them one by one; when proxy_list runs empty, the saved proxies are re-read and re-validated. If all of the re-read proxies have gone stale as well, fresh proxies are scraped from the proxy site. (A standalone sketch of this rotation policy follows the note below.)

PS: most sites are noticeably more tolerant of single-threaded crawling with no delay than of multi-threaded crawling, even when the latter pauses between requests.
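The same rotation policy, pulled out on its own (a minimal sketch: the helper name next_proxy() is mine, introduced only for illustration; it assumes the read_proxy_file(), xici_proxy.get_valid_proxy() and xici_proxy.gen_proxy() helpers used in the full script below):

# Minimal sketch of the rotation policy described above. next_proxy() is an
# illustrative name, not part of the original script; it reuses the helpers
# read_proxy_file(), xici_proxy.get_valid_proxy() and xici_proxy.gen_proxy().
def next_proxy(proxy_list):
    if not proxy_list:
        # List ran dry: re-read the saved proxies and keep only those that still work.
        proxy_list.extend(xici_proxy.get_valid_proxy(read_proxy_file()))
    if not proxy_list:
        # Everything saved has gone stale: scrape fresh proxies from xici, then reload.
        xici_proxy.gen_proxy()
        proxy_list.extend(read_proxy_file())
    return proxy_list.pop()    # next "ip:port" string to use for the following batch

In the main loop below this would collapse the two near-identical refresh blocks into a single ip_port = next_proxy(proxy_list) call.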

#coding:utf-8
__author__ = 'huafeng'
import os
import re
import codecs
import urllib2
import xici_proxy
from bs4 import BeautifulSoup

PATH = os.path.dirname(os.path.abspath(__file__))

def read_proxy_file():
    # Load saved proxies (one "ip:port" per line), skipping blank lines.
    filename = os.path.join(PATH, 'sys', 'xici_proxy')
    with codecs.open(filename, encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]

def gen_whole_item_id():
    proxy_list = read_proxy_file()
    page_url_proxy_count = 0
    if not proxy_list:
        # No saved proxies at all: scrape fresh ones from xici before starting.
        xici_proxy.gen_proxy()
        proxy_list = read_proxy_file()
    ip_port = proxy_list.pop()
    whole_page_url_filename = os.path.join(PATH, 'sys', 'whole_page_url')
    timeout_page_url_filename = os.path.join(PATH, 'log', 'timeout_page_url')
    item_id_filename = os.path.join(PATH, 'sys', 'book_item_ids')
    page_url_crawled_filename = os.path.join(PATH, 'log', 'crawled_page_url')
    with codecs.open(whole_page_url_filename, encoding='utf-8') as whole_page_url_f,\
         codecs.open(item_id_filename, mode='wb', encoding='utf-8') as item_id_wf,\
         codecs.open(timeout_page_url_filename, mode='wb', encoding='utf-8') as timeout_url_wf,\
         codecs.open(page_url_crawled_filename, mode='wb', encoding='utf-8') as crawled_url_wf:
        for page_url in [item.strip() for item in whole_page_url_f.readlines()]:
            page_url_proxy_count += 1
            try:
                if page_url_proxy_count > 2000:
                    # Time to rotate: refill the list first if it has run dry, then pop the next proxy.
                    if not proxy_list:
                        re_read_proxy_list = read_proxy_file()
                        proxy_list = xici_proxy.get_valid_proxy(re_read_proxy_list)
                        if not proxy_list:
                            xici_proxy.gen_proxy()
                            timeout_url_wf.write('get new proxy from xici network!\n')
                            proxy_list = read_proxy_file()
                    ip_port = proxy_list.pop()
                    page_url_proxy_count = 0
                http_handler = urllib2.ProxyHandler({'http': 'http://%s' % ip_port})
                opener = urllib2.build_opener(http_handler)
                html = opener.open(page_url, timeout=15).read().decode('gbk')
            except urllib2.HTTPError, e:  # jd never serves a forbidden page (it returns a login page instead), so this branch is mostly a no-op
                if e.getcode() == 403:
                    timeout_url_wf.write('403 error: request forbidden!!!\n')
                    # Switch to a fresh proxy, refilling the list first if it has run dry.
                    if not proxy_list:
                        re_read_proxy_list = read_proxy_file()
                        proxy_list = xici_proxy.get_valid_proxy(re_read_proxy_list)
                        if not proxy_list:
                            xici_proxy.gen_proxy()
                            timeout_url_wf.write('get new proxy from xici network!\n')
                            proxy_list = read_proxy_file()
                    ip_port = proxy_list.pop()
                    http_handler = urllib2.ProxyHandler({'http': 'http://%s' % ip_port})
                    opener = urllib2.build_opener(http_handler)
                    html = opener.open(page_url, timeout=15).read().decode('gbk')
                else:
                    continue
            except:
                # Any other failure (timeout, dead proxy, decode error): log the URL and move on.
                timeout_info = ''.join(('request_timeout:', page_url, '\n'))
                timeout_url_wf.write(timeout_info)
                continue
            soup = BeautifulSoup(html, 'html.parser')
            div_level_str = soup.find('div', id='plist')
            if not div_level_str:
                error_match_info = ''.join(('no_plist_div:', page_url, '\n'))
                timeout_url_wf.write(error_match_info)
                continue
            div_item_list = div_level_str.find_all('div', class_='item')
            if not div_item_list:
                error_match_info = ''.join(('no_item_div:', page_url, '\n'))
                timeout_url_wf.write(error_match_info)
                continue
            item_id_list = [item['sku']+'\n' for item in div_item_list]
            item_id_wf.writelines(item_id_list)
            crawled_url_wf.write(page_url+'\n')
            # time.sleep(3)
# gen_whole_item_id()
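
The call at the bottom is left commented out in the original; to run the file directly as a script, a standard entry-point guard would kick off the crawl (a sketch, nothing beyond the function above assumed):

if __name__ == '__main__':
    # Run the crawl only when this file is executed directly,
    # not when it is imported as a module.
    gen_whole_item_id()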

