# 很久以前自学 Python 时写的爬虫，爬取 www.biqiuge.com 的《圣墟》；更换 get_page_start 的书号和少许正则即可爬取其他小说。
# 因为是那会自学写的爬虫，有点没有章法，请见谅。
import requests
from requests.exceptions import ConnectionError
import re
def get_page_start(book_url='http://www.biqiuge.com/book/4772/'):
    """Fetch the HTML of a book's chapter-index (table of contents) page.

    Args:
        book_url: URL of the book's index page. Defaults to the original
            hard-coded book (id 4772, 《圣墟》); pass another book's URL
            to scrape a different novel.

    Returns:
        The page HTML as text on HTTP 200, otherwise None.
    """
    try:
        response = requests.get(book_url)
        if response.status_code == 200:
            return response.text
    # Bug fix: the original wrote `except ConnectionError():`, which
    # evaluates to an *instance* and raises TypeError at catch time.
    # Catch the exception class itself.
    except ConnectionError:
        print('Error')
    return None
def parse_page_start(text):
    """Extract chapter (url, title) pairs from the index-page HTML.

    Only links appearing after the "正文卷" (main-text volume) marker are
    kept, which skips the "latest chapters" block at the top of the page.

    Args:
        text: HTML of the book's table-of-contents page.

    Returns:
        A list of (relative_url, chapter_title) tuples; an empty list when
        the marker or the chapter links are absent.
    """
    # Everything after the 正文卷 heading; re.S lets `.` span newlines.
    sections = re.compile("正文卷</dt>(.*)", re.S).findall(text)
    if not sections:
        return []
    # NOTE: the site really emits `href ="..."` with a space before `=`.
    patterns = re.compile(".*?<dd><a href =\"(.*?)\">(.*?)</a></dd>", re.S)
    # Bug fix: findall() needs a string; the original passed the *list*
    # returned by the first findall(), raising TypeError on every call.
    return patterns.findall(sections[0])
def get_page_final(result, page):
    """Download every chapter and save its cleaned text under d:/txt/.

    Args:
        result: list of (relative_url, chapter_title) tuples, as produced
            by parse_page_start().
        page: site root to prepend to each relative URL,
            e.g. 'http://www.biqiuge.com'.
    """
    for link, title in result:
        # Some chapter titles embed the site name; strip it so it does
        # not end up in the output filename.
        filename = title.replace('biqiuge.com', '')
        try:
            response = requests.get(page + link)
            print(page + link)
            if response.status_code == 200:
                txt = parse_page_final(response.text)
                txt = "".join(txt).replace(" ", "")
                txt = txt.replace("<br />", "\n")
                txt = txt.replace(";;;;;;;;", " ")
                # Drop the site's self-promotion boilerplate.
                txt = txt.replace("请记住本书首发域名:www.biqiuge.com。笔趣阁手机版阅读网址:m.biqiuge.com", "")
                path = "d:/txt/" + filename + ".txt"
                # Bug fix: the original never closed the file handle
                # (one leak per chapter); `with` guarantees closure.
                # utf-8 avoids UnicodeEncodeError on Windows, where the
                # default locale encoding may not cover all characters.
                with open(path, "w", encoding="utf-8") as f:
                    f.write(txt)
                print("------------------------------")
        # Bug fix: catch the exception class, not an instance
        # (`except ConnectionError():` raises TypeError at catch time).
        except ConnectionError:
            print('Error')
def parse_page_final(text):
    """Extract the raw chapter body from a single chapter page's HTML.

    Args:
        text: HTML of one chapter page.

    Returns:
        The inner HTML of the element whose class is "showtxt", or ""
        when no such element exists (the original raised IndexError on
        pages without it).
    """
    patterns = re.compile("class=\"showtxt\">(.*?)</div>", re.S)
    matches = patterns.findall(text)
    return matches[0] if matches else ""
# Run the scrape only when executed as a script, not on import.
if __name__ == "__main__":
    get_page_final(parse_page_start(get_page_start()), 'http://www.biqiuge.com')
# 结果截图: