# 这几天在手机上看《蜀山剑侠传》，看到一半突然要付费，于是自己抓取文字。
# 模仿着做了个简单的抽取，最终结果存在文件里。
__author__ = 'allen'
import urllib
import urllib2
import re
import chardet
import os
from bs4 import BeautifulSoup
import sys
# Show the interpreter's default codec before we change it (usually 'ascii'
# on Python 2).
print sys.getdefaultencoding()
# Python 2 hack: site.py deletes sys.setdefaultencoding at startup;
# reload(sys) restores it so we can force the process-wide default to utf-8.
# This lets unicode chapter text be written to the byte-mode output file
# below without explicit .encode() calls. Order matters: reload first.
reload(sys)
sys.setdefaultencoding('utf-8')
def get_page_str(page_num):
    """Return *page_num* as a string zero-padded to at least two digits.

    1 -> '01', 10 -> '10', 123 -> '123' (numbers with 2+ digits pass
    through unchanged) -- matches the site's chapter-file naming.
    """
    # '%02d' pads with a leading zero below 10 and leaves wider numbers
    # alone, replacing the manual if/concat version.
    return '%02d' % page_num
def get_huanzhu_url(page_num):
    """Return the chapter URL on my285.com for chapter *page_num*.

    Chapter files are named with a two-digit zero-padded number,
    e.g. .../ssj3/01.htm for chapter 1.
    """
    # Single format expression instead of concatenating through the
    # zero-padding helper; output is identical for positive page numbers.
    return 'http://www.my285.com/wuxia/hzlz/ssj3/%02d.htm' % page_num
# Global output file shared by every spider_page() call; closed at the end
# of the script. Opened 'wb+' — under Python 2, unicode written to a binary
# file is encoded with the process default encoding (set to utf-8 above).
data_save = open('data.txt', 'wb+')
def spider_page(page_num):
    """Fetch chapter *page_num* and append its text to ``data_save``.

    Returns True when the page was fetched and written, False on any
    urllib2 error (including error objects without .code/.reason).
    """
    url = get_huanzhu_url(page_num)
    print(url)
    # Send a browser-like User-Agent; some servers reject bare requests.
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    try:
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        # The site serves GBK-encoded HTML; decode before parsing.
        # (.decode works on both py2 str and py3 bytes, unlike unicode().)
        content = response.read().decode('gbk')
        content = content.replace('<br>', '')
        soup = BeautifulSoup(content, 'html.parser')
        # The chapter body lives in the <td> whose only attribute is
        # colspan. has_attr replaces the deprecated/removed has_key.
        cells = soup.find_all(
            lambda tag: tag.name == 'td' and len(tag.attrs) == 1
            and tag.has_attr('colspan'))
        # Only the first matching cell holds the chapter text; the old
        # index/break dance reduced to a slice. The old .replace() calls
        # stripping '<td colspan="2">' / '</td>' were no-ops: .string is
        # plain text and never contains markup.
        for cell in cells[:1]:
            text = cell.string
            # .string is None when the tag has child elements; skip
            # instead of crashing on NoneType.
            if text is not None:
                data_save.write(text)
        return True
    except urllib2.URLError as e:
        # Print whatever diagnostics the error object carries.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        # BUG FIX: the original returned None when the error had neither
        # attribute; the caller compared `res == False`, which None fails,
        # so the crawl loop never terminated. Always signal failure here.
        return False
# Crawl consecutively numbered chapter pages starting at 1 and stop at the
# first page that fails to download.
page_num = 0
while True:
    page_num = page_num + 1
    # BUG FIX: spider_page could return None (not False) on some error
    # paths, and `res == False` is False for None, so the loop never
    # ended. Treat any falsy result as failure.
    if not spider_page(page_num):
        break
    print(page_num)
data_save.close()