Python 学习笔记 1：豆瓣图书信息下载并保存至 CSV

最新推荐文章于 2024-04-21 15:05:18 发布

dengqueqian2147

最新推荐文章于 2024-04-21 15:05:18 发布

阅读量249

点赞数

文章标签： python

原文链接：http://www.cnblogs.com/yueyuecong/p/11491085.html

版权

待办：还需添加 IP 代理池（尚未实现）。

import requests
from bs4 import BeautifulSoup
import re #正则规范信息
import csv #保存至.csv
import random
import time #生成随机秒数，反爬
# Index page that lists every book tag on Douban.
aurl = 'https://book.douban.com/tag/'
# Browser-like User-Agent sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}
# CSV column order; these are also the keys of every book dict.
fieldnames = ['书名', '作者', '上市时间', '价格', '书籍评分', '评分人数']


def extract_price(text: str) -> str:
    """Return the first number found in *text*, as a string.

    Both decimal ("39.50元") and integer ("45元") prices are accepted, so the
    price column keeps a uniform numeric format.

    Raises:
        IndexError: if *text* contains no digits.
    """
    return re.findall(r'\d+(?:\.\d*)?', text)[0]


def extract_rating_count(text: str) -> str:
    """Return the first run of digits found in *text*, as a string.

    Raises:
        IndexError: if *text* contains no digits.
    """
    return re.findall(r'\d+', text)[0]


def build_book(title_text: str, pub_text: str, score_text: str, count_text: str) -> dict:
    """Build one CSV row from the raw text of a book entry.

    Args:
        title_text: text of the title link; all whitespace is removed.
        pub_text: the '/'-separated publication line. The first field is
            taken as the author, the second-to-last as the publication date
            and the last as the price.
        score_text: rating text, stored unchanged (may be empty).
        count_text: text containing the number of ratings.

    Returns:
        A dict keyed by the entries of ``fieldnames``.

    Raises:
        IndexError: if the publication line has fewer than two fields, or
            the price / rating count contains no digits.
    """
    parts = pub_text.split('/')
    return {
        '书名': ''.join(title_text.split()),
        '作者': parts[0].strip(),
        '上市时间': parts[-2].strip(),
        '价格': extract_price(parts[-1].strip()),
        '书籍评分': score_text,
        '评分人数': extract_rating_count(count_text.strip()),
    }


def parse_page(html: str) -> list:
    """Parse one tag listing page and return its book rows.

    Each entry is parsed on its own, so an entry without a rating cannot
    shift the scores of the entries that follow it.
    """
    page = BeautifulSoup(html, 'html.parser')
    books = []
    for info in page.select('#subject_list > ul > li > div.info'):
        title_tag = info.select_one('h2 > a')
        pub_tag = info.select_one('div.pub')
        score_tag = info.select_one('div.star.clearfix > span.rating_nums')
        count_tag = info.select_one('div.star.clearfix > span.pl')
        if title_tag is None or pub_tag is None or count_tag is None:
            print('skipped: incomplete book entry')
            continue
        title_text = title_tag.get_text()
        # An entry may have no rating element; record an empty score then.
        score_text = score_tag.get_text() if score_tag is not None else ''
        try:
            book = build_book(title_text, pub_tag.get_text(), score_text, count_tag.get_text())
        except IndexError as e:
            # Unparseable entry: report it and move on to the next book.
            print('IndexError:', e)
        else:
            books.append(book)
        finally:
            # Progress output: the cleaned title of every entry visited.
            print(''.join(title_text.split()))
    return books


def main():
    """Download three pages of every Douban book tag into fileName.csv."""
    res = requests.get(aurl, headers=headers, timeout=10)
    res.encoding = 'utf-8'
    index = BeautifulSoup(res.text, 'html.parser')
    # NOTE(review): the selector contains `tbody`; html.parser does not insert
    # that element, so this only matches if the served HTML has it - confirm.
    tag_links = index.select('#content > div > div.article > div:nth-child(2) > div > table > tbody > tr > td > a')

    # Open the file once so pages accumulate instead of overwriting each
    # other. newline='' is required by the csv module; utf-8-sig keeps the
    # Chinese text intact regardless of the platform's default encoding.
    with open(r'fileName.csv', 'w', newline='', encoding='utf-8-sig', errors='ignore') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames)
        writer.writeheader()
        for tag_link in tag_links:
            tag = tag_link.get_text().strip()
            url = 'https://book.douban.com/tag/%s' % tag
            # Three pages per tag (offsets 0, 20, 40); each page is saved as
            # soon as it has been downloaded.
            for start in range(0, 41, 20):
                params = {'start': start, 'type': 'T'}
                response = requests.get(url, headers=headers, params=params, timeout=10)
                response.encoding = 'utf-8'
                writer.writerows(parse_page(response.text))
                csvfile.flush()
                # Random pause between requests to reduce the chance of being
                # blocked; increase the factor if the site starts refusing.
                time.sleep(random.random() * 3)


if __name__ == '__main__':
    main()

转载于:https://www.cnblogs.com/yueyuecong/p/11491085.html