Python爬虫入门项目(1):全书网

from bs4 import BeautifulSoup
import re
import urllib.request
import xlwt
import sqlite3
def main():
    """Scrape the quanshuwang "last update" listing and persist the results.

    The scraped records are handed first to ``savedata`` (Excel file
    ``quanshuwang.xls``) and then to ``saveData2DB`` (``quanshuwang.db``).
    NOTE(review): ``saveData2DB`` is not defined in the visible part of this
    file -- confirm it exists elsewhere before running.
    """
    # Listing URLs are this prefix followed by "<page number>.html".
    listing_prefix = 'http://www.quanshuwang.com/all/lastupdate_0_0_0_0_0_0_'
    records = getdata(listing_prefix)

    savedata(records, 'quanshuwang.xls')
    saveData2DB(records, "quanshuwang.db")

# Pre-compiled patterns applied to the HTML of a single book entry.

# Target of an anchor whose href is immediately followed by a target attribute.
findtlink = re.compile(
    r'<a href="(.*?)" target="'
)
# Text inside the <h2> heading.
findtitle = re.compile(
    r'<h2>(.*?)</h2>'
)
# Author name following the "作者:" label.
findauthor = re.compile(
    r'<dd><p>作者:(.*?)</p></dd>'
)
# Image source; DOTALL lets the greedy ".*" span line breaks inside the tag.
findimg = re.compile(
    r'<img.*src="(.*?)"',
    flags=re.DOTALL,
)

def askurl(url):
    """Download *url* and return its body decoded as GBK text.

    The request carries a desktop-browser ``User-Agent`` header.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page source. An empty string is returned when the
        request fails with ``URLError``; in that case the error's ``code``
        and/or ``reason`` (whichever are present) are printed.
    """
    # Imported explicitly so the handler below does not depend on
    # ``urllib.request`` having loaded ``urllib.error`` as a side effect.
    from urllib.error import URLError

    headers = {
      "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36"
    }
    request = urllib.request.Request(url,headers=headers)
    html = ""
    try:
        # The context manager closes the response even if reading fails.
        with urllib.request.urlopen(request) as response:
            # errors='replace': one invalid byte must not abort the whole
            # page with an uncaught UnicodeDecodeError.
            html = response.read().decode('gbk', errors='replace')
    except URLError as e:
        if hasattr(e,'code'):
            print(e.code)
        if hasattr(e,'reason'):
            print(e.reason)
    return html

def getdata(baseurl):
    """Scrape listing pages 0 through 4 and collect the fields of each book.

    Each page URL is ``baseurl + "<n>.html"``. Every ``div`` carrying the
    class ``yd-book-item yd-book-item-pull-left`` yields one record.

    Args:
        baseurl: URL prefix of the listing pages.

    Returns:
        A list of records. Each record is a four-element list
        ``[links, titles, authors, images]`` whose elements are the lists
        returned by ``re.findall`` for the matching module-level pattern.
    """
    # Order matters: it fixes the column order of every record.
    field_patterns = (findtlink, findtitle, findauthor, findimg)
    datalist = []

    for page_no in range(5):
        page_html = askurl(baseurl + str(page_no) + '.html')
        soup = BeautifulSoup(page_html, 'html.parser')

        for node in soup.findAll('div', class_="yd-book-item yd-book-item-pull-left"):
            # The patterns work on the raw markup of the entry, not on the tag object.
            markup = str(node)
            datalist.append(
                [re.findall(pattern, markup) for pattern in field_patterns]
            )

    return datalist

def savedata(datalist,savepath):
    """Write the scraped records to an Excel workbook at *savepath*.

    Row 0 holds the four column headings; each record in *datalist* fills
    one following row, and a progress line is printed per record.

    Args:
        datalist: Sequence of records, each indexable at positions 0-3
            (link, title, author, image).
        savepath: Destination path handed to ``Workbook.save``.
    """
    print('save...')
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet("sheet1",cell_overwrite_ok=True)
    col = ['小说链接','标题','作者','图片']
    for i, heading in enumerate(col):
        sheet.write(0,i,heading)
    # Iterate over what was actually scraped instead of a fixed count of 175:
    # a short list no longer raises IndexError and a long one is not truncated.
    for row, data in enumerate(datalist, start=1):
        print("第%d条"%row)
        for j in range(len(col)):
            sheet.write(row,j,data[j])
    book.save(savepath)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

将图片保存到xls文件中

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值