import re
import sqlite3
import urllib.error
import urllib.request

import xlwt

from bs4 import BeautifulSoup
def main():
    """Entry point: scrape the listing pages, then persist the rows to XLS and SQLite."""
    base = 'http://www.quanshuwang.com/all/lastupdate_0_0_0_0_0_0_'
    books = getdata(base)
    # Write the same dataset to both storage backends.
    savedata(books, 'quanshuwang.xls')
    saveData2DB(books, "quanshuwang.db")
# Pre-compiled patterns applied to each book item's raw HTML in getdata().
findtlink = re.compile(r'<a href="(.*?)" target="')  # detail-page link
findtitle = re.compile(r'<h2>(.*?)</h2>')  # book title
findauthor = re.compile(r'<dd><p>作者:(.*?)</p></dd>')  # author (label is Chinese "作者:" = "author:")
findimg = re.compile(r'<img.*src="(.*?)"',re.S)  # cover-image URL; re.S lets . span newlines
def askurl(url):
    """Fetch *url* and return its body decoded as GBK (site encoding).

    On failure, prints the HTTP status code and/or error reason and
    returns an empty string instead of raising.
    """
    headers = {
        # Pretend to be a desktop browser so the site does not reject the request.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=headers)
    html = ""
    try:
        # Context manager closes the connection even on decode errors
        # (the original leaked the response object).
        with urllib.request.urlopen(request) as response:
            html = response.read().decode('gbk')
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
    return html
def getdata(baseurl, pages=5):
    """Scrape *pages* consecutive listing pages starting at *baseurl*.

    Returns a list of [link, title, author, img] rows. Each field is a
    plain string ('' when the pattern finds no match) so that
    savedata() can write it into an xls cell — re.findall returns a
    list, which xlwt's sheet.write cannot serialize.
    """
    datalist = []
    for page in range(pages):
        html = askurl(baseurl + str(page) + '.html')
        soup = BeautifulSoup(html, 'html.parser')
        # Each matching <div> holds one book entry.
        for item in soup.findAll('div', class_="yd-book-item yd-book-item-pull-left"):
            text = str(item)
            row = []
            for pattern in (findtlink, findtitle, findauthor, findimg):
                matches = re.findall(pattern, text)
                # First match only; fall back to '' so rows stay rectangular.
                row.append(matches[0] if matches else '')
            datalist.append(row)
    return datalist
def savedata(datalist, savepath):
    """Write the scraped rows to an xls workbook at *savepath*.

    Row 0 holds the (Chinese) column headings; each subsequent row is
    one [link, title, author, img] entry from *datalist*.
    """
    print('save...')
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet("sheet1", cell_overwrite_ok=True)
    col = ['小说链接', '标题', '作者', '图片']
    for j, heading in enumerate(col):
        sheet.write(0, j, heading)
    # Iterate the actual data instead of the hard-coded range(0, 175),
    # which raised IndexError for shorter lists and dropped extra rows.
    for i, data in enumerate(datalist):
        print("第%d条" % (i + 1))
        for j in range(4):
            sheet.write(i + 1, j, data[j])
    book.save(savepath)
# Run the scraper only when executed as a script, not on import.
if __name__ == '__main__':
    main()
# Blog-paste residue, commented out so the file remains valid Python:
# python爬虫入门项目(1):全书网  (article title: "Python crawler starter project (1): quanshuwang")
# 最新推荐文章于 2021-03-30 11:49:23 发布  (publish timestamp from the original blog page)