python爬虫抽取武侠小说

这几天在手机上看《蜀山剑侠传》,看到一半突然要求付费,于是决定自己动手抓取小说正文。

就模仿着写了个简单的抽取脚本,最终结果保存在文件 data.txt 里。

__author__ = 'allen'

import urllib
import urllib2
import re
import chardet
import os

from bs4 import BeautifulSoup

import sys
# Show the interpreter's default encoding before it is changed.
print sys.getdefaultencoding()
# Python 2 idiom: reload sys, then switch the process-wide default encoding to
# UTF-8 so that implicit str/unicode conversions use UTF-8.
# NOTE(review): reload(sys)/setdefaultencoding only work on Python 2.
reload(sys)
sys.setdefaultencoding('utf-8')

def get_page_str(page_num):
    """Return ``page_num`` as text, prefixed with '0' when it is below 10."""
    text = str(page_num)
    return '0' + text if page_num < 10 else text

def get_huanzhu_url(page_num):
    """Build the chapter page URL for ``page_num``.

    The page number is rendered by ``get_page_str`` and placed between the
    fixed site prefix and the ``.htm`` suffix.
    """
    prefix = 'http://www.my285.com/wuxia/hzlz/ssj3/'
    suffix = '.htm'
    return ''.join([prefix, get_page_str(page_num), suffix])

# Shared output file for the extracted text; spider_page() writes to it and
# the main loop closes it at the end. 'wb+' truncates any existing data.txt.
data_save = open('data.txt', 'wb+')

def spider_page(page_num):
    """Download one chapter page and append its text to ``data_save``.

    The page URL comes from ``get_huanzhu_url(page_num)``. The response body
    is decoded as GBK, ``<br>`` tags are removed, and the text of the first
    ``<td>`` whose only attribute is ``colspan`` is written to the
    module-level ``data_save`` file, encoded as UTF-8.

    Args:
        page_num: Number of the page to fetch.

    Returns:
        True when the page was fetched and processed (including the case
        where no matching cell was found); False whenever the request raised
        ``urllib2.URLError``.
    """
    url = get_huanzhu_url(page_num)
    print(url)
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    try:
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        try:
            content = response.read()
        finally:
            # Always release the connection, even if reading fails.
            response.close()
    except urllib2.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        elif hasattr(e, "reason"):
            print(e.reason)
        # Report failure on every URLError so the caller never receives None.
        return False

    content = content.decode('gbk')
    content = content.replace('<br>', '')
    soup = BeautifulSoup(content, 'html.parser')

    # Only the first matching cell is used.
    cell = soup.find(lambda tag: tag.name == 'td'
                     and len(tag.attrs) == 1
                     and tag.has_attr('colspan'))
    if cell is None:
        return True

    # get_text() also works when the cell has several children, where
    # ``.string`` would be None.
    data_str = cell.get_text()
    data_str = data_str.replace('<td colspan="2">', '')
    data_str = data_str.replace('</td>', '')
    # data_save is opened in binary mode, so encode explicitly rather than
    # relying on the process-wide default encoding.
    data_save.write(data_str.encode('utf-8'))
    return True

# Crawl pages 1, 2, 3, ... in order and stop at the first page for which
# spider_page() does not report success.
page_num = 0
try:
    while True:
        page_num = page_num + 1
        # Any falsy result (False or None) means the page was not fetched.
        if not spider_page(page_num):
            break
        print(page_num)
finally:
    # Close the output file even if a page raises an unexpected error.
    data_save.close()



评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值