豆瓣小说爬虫

爬过了起点的小说之后,试着来爬豆瓣的小说信息,简单多了。
信息包括:URL,书名,作者,出版公司,出版日期,评分,评论人数,书的页数,价格,ISBN,标签,短评,1-5星的占比。
1.找到起始的URL
https://book.douban.com/tag/小说?start=0&type=T
简单分析一下可以知道,start=0 表示第一页,下一页时,start的值+20(start=20)
2.解析网页
requests+re+pyquery,获取网页源代码进行正则匹配获取所需的每本书的URL

def get_url_list(url, html=None):
    """Return the list of book detail-page URLs on one listing page.

    Args:
        url: listing-page URL; fetched only when *html* is None.
        html: optional pre-fetched page source (lets one download be reused).

    Returns a fresh local list instead of appending to the module-level
    `bookurl` — the old code made URLs accumulate across pages, so every
    later page re-crawled all earlier books. The regex runs on the raw
    response text; the pyquery parse/str() round trip was a no-op that
    could re-normalize the markup under the regex.
    """
    if html is None:
        html = requests.get(url).text
    return re.compile(r'<a href="(.*?)" title=".*?" ').findall(html)

3.进入每本书的URL,解析页面后提取所需的信息(红色标注的)
(此处原有一张截图:书籍详情页中需要提取的信息以红色标注,图片在转载时丢失)
获取各书信息的代码就不分别展示了,去整体代码里看吧。
整体代码

import requests
import re
import pymysql
import time
from pyquery import PyQuery as pq

# Collect the detail-page URL of every book on one listing page.
def get_url_list(url, html=None):
    """Return the list of book detail-page URLs on one listing page.

    Args:
        url: listing-page URL; fetched only when *html* is None.
        html: optional pre-fetched page source (lets one download be reused).

    Returns a fresh local list instead of appending to the module-level
    `bookurl` — the old code made URLs accumulate across pages, so every
    later page re-crawled all earlier books. The regex runs on the raw
    response text; the pyquery parse/str() round trip was a no-op that
    could re-normalize the markup under the regex.
    """
    if html is None:
        html = requests.get(url).text
    return re.compile(r'<a href="(.*?)" title=".*?" ').findall(html)
# Extract the book title (1st field of the keywords meta tag).
def get_name(url, html=None):
    """Return the book title from a Douban book page.

    Args:
        url: book detail-page URL; fetched only when *html* is None.
        html: optional pre-fetched page source — the original code
            re-downloaded the same page once per field (12 requests per
            book); passing html lets callers fetch once and reuse it.
    """
    if html is None:
        html = requests.get(url).text
    # keywords meta content is "title,author,publisher,date,...".
    name = re.compile(r'<meta name="keywords" content="(.*?)"').findall(html)
    name = "".join(name).split(",")[0]
    print("书名:"+name)
    return name
# Extract the author (2nd field of the keywords meta tag).
def get_author(url, html=None):
    """Return the book's author; fetches *url* unless *html* is given.

    Accepts an optional pre-fetched *html* so one download can be shared
    by all the per-field extractors (the old code re-fetched per field).
    """
    if html is None:
        html = requests.get(url).text
    author = re.compile(r'<meta name="keywords" content="(.*?)"').findall(html)
    author = "".join(author).split(",")[1]
    print("作者:"+author)
    return author
# Extract the publisher (3rd field of the keywords meta tag).
def get_public_company(url, html=None):
    """Return the publisher; fetches *url* unless *html* is given.

    Accepts an optional pre-fetched *html* so one download can be shared
    by all the per-field extractors (the old code re-fetched per field).
    """
    if html is None:
        html = requests.get(url).text
    company = re.compile(r'<meta name="keywords" content="(.*?)"').findall(html)
    company = "".join(company).split(",")[2]
    print("出版公司:"+company)
    return company
# Extract the publication date (4th field of the keywords meta tag).
def get_public_date(url, html=None):
    """Return the publication date; fetches *url* unless *html* is given.

    Accepts an optional pre-fetched *html* so one download can be shared
    by all the per-field extractors (the old code re-fetched per field).
    """
    if html is None:
        html = requests.get(url).text
    date = re.compile(r'<meta name="keywords" content="(.*?)"').findall(html)
    date = "".join(date).split(",")[3]
    print("出版日期:"+date)
    return date
# Extract the average rating (v:average microformat span).
def get_rate(url, html=None):
    """Return the rating as a string, e.g. "8.8".

    Accepts an optional pre-fetched *html* so one download can be shared
    by all the per-field extractors. Raises IndexError when the page has
    no rating — the caller's per-book try/except skips such books.
    """
    if html is None:
        html = requests.get(url).text
    rate = re.compile(r'property="v:average">\s(.*?)\s</strong>').findall(html)[0]
    print("评分:"+rate)
    return rate
# Extract the number of ratings (v:votes microformat span).
def get_userCount(url, html=None):
    """Return the rating count as a string.

    Accepts an optional pre-fetched *html* so one download can be shared
    by all the per-field extractors. Raises IndexError when missing —
    handled by the caller's per-book try/except.
    """
    if html is None:
        html = requests.get(url).text
    Count = re.compile(r'<span property="v:votes">(.*?)</span>人评价</a>').findall(html)[0]
    print("评论人数:"+Count)
    return Count
# Extract the page count from the info block.
def get_pageNum(url, html=None):
    """Return the page count as a string.

    Accepts an optional pre-fetched *html* so one download can be shared
    by all the per-field extractors. Many books omit this field; the
    resulting IndexError is caught by the caller and the book skipped.
    """
    if html is None:
        html = requests.get(url).text
    pageNum = re.compile(r'<span class="pl">页数:</span>\s(.*?)<br/>').findall(html)[0]
    print("页数:"+pageNum)
    return pageNum
# Extract the list price, stripping any CJK unit text (e.g. "元").
def get_price(url, html=None):
    """Return the price as a string with CJK characters removed.

    Accepts an optional pre-fetched *html* so one download can be shared
    by all the per-field extractors. Raises IndexError when the page has
    no price — handled by the caller's per-book try/except.
    """
    if html is None:
        html = requests.get(url).text
    price = re.compile(r'<span class="pl">定价:</span>\s(.*?)<br/>').findall(html)[0]
    # Drop CJK characters (currency words such as 元) so only the number stays.
    price = re.sub('[\u4e00-\u9fa5]','',price)
    print("价格:"+price+"元")
    return price
# Extract the ISBN from the info block.
def get_ISBN(url, html=None):
    """Return the ISBN as a string.

    Accepts an optional pre-fetched *html* so one download can be shared
    by all the per-field extractors. Raises IndexError when missing —
    handled by the caller's per-book try/except.
    """
    if html is None:
        html = requests.get(url).text
    isbn = re.compile(r'<span class="pl">ISBN:</span>\s(.*?)<br/>').findall(html)[0]
    print("ISBN:"+isbn)
    return isbn
# Extract the book's tags and join them with commas.
def get_tag(url, html=None):
    """Return the tags as a single comma-separated string.

    Accepts an optional pre-fetched *html* so one download can be shared
    by all the per-field extractors. Returns "" when no tags are found.
    """
    if html is None:
        html = requests.get(url).text
    tag = re.compile(r'<a class="  tag" href="/tag/(.*?)">').findall(html)
    tag = ",".join(tag)
    print("标签:"+tag)
    return tag
# Extract the short reviews and join them with spaces.
def get_short(url, html=None):
    """Return all short-review texts concatenated with spaces.

    Accepts an optional pre-fetched *html* so one download can be shared
    by all the per-field extractors. Returns "" when no reviews exist.
    """
    if html is None:
        html = requests.get(url).text
    short = re.compile(r'<span class="short">(.*?)</span>').findall(html)
    short = " ".join(short)
    print("短评:"+short)
    return short
# Extract the 1-5 star percentage spans (page lists 5-star first).
def get_stars(url, html=None):
    """Return the rating_per percentages as a list of strings.

    The page renders them 5-star first, so index 0 is the 5-star share.
    Accepts an optional pre-fetched *html* so one download can be shared
    by all the per-field extractors.
    """
    if html is None:
        html = requests.get(url).text
    stars = re.compile(r'<span class="rating_per">(.*?)</span>').findall(html)
    return stars

# Persist one scraped book to MySQL.
def mysql(url,name,author,company,date,rate,count,pagenum,price,isbn,tag,short,stars1,stars2,stars3,stars4,stars5):
    """Insert one book row into book_DouBan and commit.

    Uses a parameterized query: the old %-formatted SQL broke on any
    value containing a quote and was SQL-injectable, and it relied on
    pymysql.escape_string, which was removed in PyMySQL 1.0. The driver
    now escapes every value itself.

    Relies on the module-level `cursor` and `db` created in __main__.
    """
    sql1 = (
        "insert into book_DouBan(url,name,author,public_company,public_date,"
        "rate,userCount,pageNum,price,ISBN,tag,short,"
        "stars1,stars2,stars3,stars4,stars5) "
        "values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    )
    cursor.execute(sql1, (url, name, author, company, date, rate, count,
                          pagenum, price, isbn, tag, short,
                          stars1, stars2, stars3, stars4, stars5))
    db.commit()
    print("存入数据库成功!")

if __name__ == '__main__':
    bookurl = []
    db = pymysql.connect(host='localhost', port=3306, user='root', password='123', db='spider', charset='utf8mb4')
    cursor = db.cursor()
    # Listing pages 0..100; each page shows 20 books (start = page*20).
    for page in range(0, 101):
        list_url = "https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=" + str(page * 20) + "&type=T"
        print(page)         # progress: which listing page we are on
        print()             # blank separator line
        # Reset BEFORE fetching the page: the old code reset inside the
        # per-book try, so a page whose books all failed left stale URLs
        # accumulating in the global list.
        bookurl = []
        # Distinct loop variable: the old code reused `url`, shadowing the
        # listing-page URL with each book URL.
        for book_url in get_url_list(list_url):
            try:
                print("URL:" + book_url)
                name = get_name(book_url)
                author = get_author(book_url)
                company = get_public_company(book_url)
                date = get_public_date(book_url)
                rate = get_rate(book_url)
                count = get_userCount(book_url)
                pagenum = get_pageNum(book_url)
                price = get_price(book_url)
                isbn = get_ISBN(book_url)
                tag = get_tag(book_url)
                short = get_short(book_url)
                stars = get_stars(book_url)
                # rating_per spans are rendered 5-star first on the page.
                stars5, stars4, stars3, stars2, stars1 = stars[:5]
                print("5星:" + stars5)
                print("4星:" + stars4)
                print("3星:" + stars3)
                print("2星:" + stars2)
                print("1星:" + stars1)
                mysql(book_url, name, author, company, date, rate, count, pagenum,
                      price, isbn, tag, short, stars1, stars2, stars3, stars4, stars5)
                print()
                time.sleep(3)       # throttle: be polite to Douban
            except Exception as exc:
                # Books missing a field (page count, price, ...) make the
                # extractors raise IndexError/ValueError; the old bare
                # `except: pass` silently hid every error (including
                # KeyboardInterrupt). Log which book was skipped and why.
                print("跳过 %s: %s" % (book_url, exc))

  • 1
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值