刚接触Python,爬虫试试,捯饬了一晚上,终于搞定了。
给自己鼓掌~
获取的是小说《雪鹰领主》第一章的内容。
今天先发上来,明天再加注释。
参考资料:http://cuiqingcai.com/990.html
#coding:utf-8
"""Download one chapter page of a novel and print its text.

The page at ``url`` is requested with a browser-like User-Agent header and
decoded as UTF-8.  Everything between ``<div id="content">`` and the next
``<script`` tag is extracted with a regular expression, ``<br/>`` tags are
replaced by newlines, and the result is printed.  If the request fails with
a URLError, the error's status code and/or reason are printed instead.
"""
import contextlib
import re
import urllib

try:  # Python 2
    import urllib2
    from urllib2 import Request, URLError, urlopen
except ImportError:  # Python 3 split urllib2 into urllib.request / urllib.error
    from urllib.error import URLError
    from urllib.request import Request, urlopen

page = 1  # not referenced anywhere else in this script
url = "http://www.biquge.tw/9_9080/5134179.html"
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}

# Non-greedy capture of everything after the opening content div up to the
# next "<script" tag; re.S lets "." match the newlines inside the chapter.
pattern = re.compile('<div id="content">(.*?)<script', re.S)

try:
    request = Request(url, headers=headers)
    # closing() releases the HTTP response even if read() or decode() raises.
    with contextlib.closing(urlopen(request)) as response:
        content = response.read().decode('utf-8')
except URLError as e:
    # Print whichever details this error carries: a status code, a reason,
    # or both.
    if hasattr(e, "code"):
        print(e.code)
    if hasattr(e, "reason"):
        print(e.reason)
else:
    # Only reached when the download succeeded.
    items = pattern.findall(content)
    for item in items:
        # The pattern is a literal tag, so a plain string replace suffices.
        newitem = item.replace('<br/>', "\n")
        print(newitem)