小说旗 单篇文章采集

1、代理文档格式:(代理采集地址 http://www.xicidaili.com)
这里写图片描述

2、免费代理稳定性不可靠,采用装饰器重连同时切换代理

# coding: utf-8
# python 2.7
# Single-novel scraper for 小说旗: http://www.xs7.la/
# Before running, replace the first-chapter URL and the total chapter count below.
# ip.txt is the proxy pool.
import urllib2
from bs4 import BeautifulSoup
import sys
import traceback
import random
import gzip

# Python 2 workaround: reloading ``sys`` makes ``setdefaultencoding`` callable
# again, and UTF-8 then becomes the codec used for implicit str/unicode
# conversions (e.g. when parsed text is written to ``f`` below).
reload(sys)
sys.setdefaultencoding('utf-8')

# Output file; chapters are appended to it and main() closes it when done.
f = open("out.txt", "a+")
# HTTP request headers sent with every chapter request made by getContent().
headers = {
    "Host": "www.xs7.la",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36",
    "Content-Type": "text/html",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Referer": "http://www.xs7.la/book/18_18966/",
    # NOTE(review): 'deflat' is not a standard content-coding name ("deflate"
    # is). Confirm whether this is a typo or a deliberate way of asking for an
    # uncompressed response before changing it.
    "Accept-Encoding": 'deflat'
}

url = "http://www.xs7.la/book/18_18966/7828246.html"  # URL of the first chapter
page = 184  # number of chapters
# URL of the next page to fetch; starts at the first chapter and is
# overwritten by getContent() after each chapter.
nextHref = url

# Proxy entries (one list of whitespace-separated fields per line of ip.txt),
# filled by IPpool().
ipPool = []


def IPpool(path='ip.txt'):
    """Load proxy entries from a text file into the module-level ``ipPool``.

    Every non-blank line is split on whitespace and the resulting list of
    fields is appended to ``ipPool``. getContent() reads fields 0, 1 and 2 of
    an entry as host, port and scheme respectively.

    :param path: proxy list file to read; defaults to ``ip.txt`` in the
        current working directory.
    :raises IOError: if the file cannot be opened or read.
    """
    # ``with`` guarantees the handle is closed even if reading fails midway.
    with open(path) as reader:
        for line in reader:
            fields = line.split()
            # A blank or whitespace-only line splits into an empty list.
            if fields:
                ipPool.append(fields)


RETRIES = 0
# Shared retry counter: the wrapper below increments count['num'] after each
# failed attempt; code that completes successfully may reset it to 0.
count = {"num": RETRIES}
# Retries allowed after the first failed attempt (11 attempts in total).
_MAX_RETRIES = 10


def conn_try_again(function):
    """Decorator that re-invokes ``function`` when it raises.

    Each failure prints a progress message and bumps the shared
    ``count['num']``. Once ``count['num']`` has reached ``_MAX_RETRIES`` the
    last exception is re-raised unchanged, so the caller sees its real type
    and traceback.

    :param function: callable to protect; it is called with the same
        positional and keyword arguments on every attempt.
    :returns: the wrapping callable, which returns whatever ``function``
        returns on its first successful attempt.
    :raises Exception: whatever ``function`` raised on the final attempt.
    """
    from functools import wraps

    @wraps(function)
    def wrapped(*args, **kwargs):
        # A loop instead of recursion keeps the stack flat across retries.
        while True:
            try:
                return function(*args, **kwargs)
            except Exception:
                print("--重试访问,当前次数 %s ,(总次数11)--" % (count['num'] + 1))
                if count['num'] >= _MAX_RETRIES:
                    # Bare ``raise`` preserves the original type and traceback.
                    raise
                count['num'] += 1

    return wrapped


# Parsed document of the most recently fetched page; rebound by getContent().
bsObj = None


# Guess the text encoding of a downloaded payload.
def getCoding(strInput):
    """Return the name of the first encoding that can decode ``strInput``.

    :param strInput: value to inspect, normally the raw body of a response.
    :returns: ``"unicode"`` when ``strInput`` is already a text string;
        ``'utf8'`` or ``'gbk'`` (tried in that order) when the bytes decode
        cleanly with that codec; ``None`` when neither codec fits or the
        value cannot be decoded at all.
    """
    try:
        text_type = unicode  # Python 2 text type
    except NameError:
        text_type = str  # interpreter that has no separate ``unicode`` builtin

    if isinstance(strInput, text_type):
        return "unicode"

    for encoding in ('utf8', 'gbk'):
        try:
            strInput.decode(encoding)
        except (UnicodeError, AttributeError):
            # UnicodeError: the bytes are not valid in this codec.
            # AttributeError: the value has no ``decode`` (e.g. ``None``);
            # such input is reported as "unknown" rather than crashing.
            continue
        return encoding
    return None




def _pick_proxy_handler():
    """Return a ProxyHandler for a random ``ipPool`` entry, or a direct one."""
    if not ipPool:
        print('--代理池当前无可用代理,使用本机地址访问--')
        return urllib2.ProxyHandler({})
    entry = random.choice(ipPool)
    print(entry)
    # Entry layout as indexed here: [host, port, scheme]. The scheme is
    # lower-cased because urllib2 dispatches proxy handlers by lower-case
    # protocol name; an upper-case key such as "HTTP" would never be used.
    scheme = entry[2].lower()
    proxy_host = scheme + "://" + entry[0] + ":" + entry[1]
    return urllib2.ProxyHandler({scheme: proxy_host})


def _fetch_html(url):
    """Download ``url`` (3 s timeout) and return its body, decoded if possible."""
    import io

    # Manual switch: True routes through the proxy pool, False through the
    # single fixed proxy below.
    proxySwitch = True
    if proxySwitch:
        handler = _pick_proxy_handler()
    else:
        handler = urllib2.ProxyHandler({"http": "124.172.232.49:8010"})

    urllib2.install_opener(urllib2.build_opener(handler))
    req = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(req, timeout=3)
    try:
        raw = response.read()
        content_encoding = response.info().get('Content-Encoding')
    finally:
        # Release the connection whether or not the read succeeded.
        response.close()

    encoding = getCoding(raw)
    if encoding is None:
        # Neither UTF-8 nor GBK: report what the server says it sent.
        print(content_encoding)
        if content_encoding and 'gzip' in content_encoding.lower():
            # A gzip-compressed body must be inflated before it can be parsed.
            raw = gzip.GzipFile(fileobj=io.BytesIO(raw)).read()
            encoding = getCoding(raw)
    if encoding in ('utf8', 'gbk'):
        raw = raw.decode(encoding)
    return raw


@conn_try_again
def getContent(url):
    """Fetch one chapter page, append it to ``f`` and advance ``nextHref``.

    The page is downloaded through a proxy chosen at random from ``ipPool``
    (directly when the pool is empty) and parsed into the global ``bsObj``.
    When the navigation bar (``div#thumb``) contains a "下一章" link, the
    title and text are written to ``f``, ``nextHref`` is set to that link's
    ``href`` and the retry counter ``count['num']`` is reset to 0.

    :param url: address of the chapter page to fetch.
    :returns: ``True`` when crawling should stop (no "下一章" link was
        found), otherwise ``None``.
    :raises Exception: any download or parsing error propagates unchanged to
        the ``conn_try_again`` wrapper, which retries the call.
    """
    global nextHref, bsObj
    bsObj = BeautifulSoup(_fetch_html(url), 'lxml')

    content = bsObj.find('div', id='content').get_text()
    nav_bar = bsObj.find('div', id='thumb')
    title = bsObj.find('div', id='bgdiv').h1.get_text()

    # NOTE(review): a page without a "下一章" link is not written to the
    # output at all; confirm that is intended for the final chapter.
    if "下一章" not in nav_bar.get_text():
        return True

    next_link = None
    for anchor in nav_bar.findAll('a'):
        if "下一章" in anchor.get_text():
            next_link = anchor  # the last matching anchor wins
    if next_link is None:
        print("下一章为空")
        return True

    nextHref = next_link.get('href')
    print(title)
    print(nextHref)
    f.write("#####" + '\n')
    f.write(title + '\n')
    f.write(content + '\n')
    # A chapter was saved, so later failures get a fresh retry budget.
    count['num'] = 0


def main():
    """Load the proxy pool, then fetch chapters one after another.

    getContent() is called with the current ``nextHref`` up to ``page``
    times; the loop ends early when it returns a true value. Any exception
    raised while crawling is printed as a traceback, and the output file
    ``f`` is closed in every case.
    """
    IPpool()
    try:
        # ``page`` is the chapter count, so make exactly that many fetches
        # (range(1, page) stopped one chapter short).
        for _ in range(page):
            if getContent(nextHref):
                break
        print("--- end ---")
    except Exception:
        # print_exc() writes the traceback itself and returns None, so its
        # result must not be printed.
        traceback.print_exc()
    finally:
        f.close()


# Start the scrape when this file is executed. There is no __main__ guard, so
# importing the file starts it as well.
main()

附:代理采集 https://blog.csdn.net/u012795120/article/details/80857990
下载地址:https://download.csdn.net/download/u012795120/10534448

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值