# requests + bs4: 使用 CSS 选择器 (select) 抓取豆瓣 / scraping Douban book listings with CSS selectors (select)

import requests
from requests.exceptions import RequestException
from lxml import etree
import time, json, csv,xlwt,xlrd
import pandas as pd
from xlutils.copy import copy
from bs4 import BeautifulSoup




# Step 1: fetch the raw HTML of one listing page
def get_one_page(url):
    """Download ``url`` and return its HTML text, or ``None`` on failure.

    Every successful response body is also appended to
    ``douban_dushu_tab.html`` in the working directory.

    :param url: absolute URL of the page to fetch.
    :return: the response text when the server answers with status 200;
        ``None`` for any other status code or when ``requests`` raises a
        ``RequestException`` (connection error, timeout, ...).
    """
    headers = {
        # The two literals are concatenated by the parser, so the leading
        # space before "Chrome" is needed to keep the UA tokens separated.
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko)'
                      ' Chrome/91.0.4472.114 Mobile Safari/537.36',
    }
    try:
        # Without a timeout a stalled connection would block the crawl forever;
        # a timeout surfaces as a RequestException subclass and is handled below.
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException:
        return None

    if response.status_code != 200:
        return None

    # Append mode: pages fetched in one run accumulate in the same dump file.
    with open('douban_dushu_tab.html', 'a', encoding='utf-8') as f:
        f.write(response.text)

    return response.text


# Step 2: parse the listing page with BeautifulSoup CSS selectors
def _first_text(nodes, default=' '):
    """Return the text of the first element in ``nodes`` or ``default`` if empty."""
    return nodes[0].text if nodes else default


def parse_one_page(html):
    """Extract one record per book from a listing page.

    :param html: page markup, normally the return value of ``get_one_page``.
        ``None`` or an empty string (i.e. a failed download) yields ``[]``
        without touching the parser.
    :return: list of ``[title, href, rate, evaluate, content]`` lists, one per
        ``.subject-item`` element that has an ``h2 a`` title link. ``rate``
        (``.rating_nums``), ``evaluate`` (first ``.pl``) and ``content``
        (first ``p``) fall back to a single space when the element is absent.
    """
    if not html:
        return []

    # BeautifulSoup takes the markup and the parser to use
    # (lxml, html.parser, xml, html5lib).
    soup = BeautifulSoup(markup=html, features='lxml')
    # select() evaluates a CSS selector and returns a list of matches.
    books = soup.select('.subject-item')
    time.sleep(3)  # pause kept from the original flow; it slows the crawl down

    result_lists = []
    for book in books:
        links = book.select('h2 a')
        if not links:
            # No title link: nothing meaningful to record for this item.
            continue
        link = links[0]

        # Drops every whitespace character so that the title and its subtitle
        # (separated by newlines/indentation in the markup) become one string.
        title = ''.join(link.text.split())
        href = link.get('href', ' ')
        rate = _first_text(book.select('.rating_nums'))
        evaluate = _first_text(book.select('.pl')).strip() or ' '
        content = _first_text(book.select('p'))

        result_lists.append([title, href, rate, evaluate, content])

    print(result_lists)

    return result_lists



# Step 3 (txt): append one record to the text output file
def write_to_file_txt(conent):
    """Append ``conent`` to ``douban_dushu_top1.txt`` as indented JSON text.

    The value is serialised with two-space indentation and non-ASCII
    characters are written as-is. Nothing (not even a newline) is written
    between successive records.

    :param conent: any JSON-serialisable value, e.g. a list or dict record.
    :return: None
    """
    with open('douban_dushu_top1.txt', mode='a', encoding='utf-8') as txt_file:
        serialized = json.dumps(conent, indent=2, ensure_ascii=False)
        txt_file.write(serialized)


# Step 3 (json): append one record to the JSON output file
def write_to_file_json(conent):
    """Append ``conent`` to ``douban_dushu_top1.json`` as indented JSON text.

    ``ensure_ascii=False`` keeps Chinese characters readable in the output
    instead of escaping them. Successive calls write their documents
    back-to-back with no separator.

    :param conent: any JSON-serialisable value, e.g. a list or dict record.
    :return: None
    """
    json_text = json.dumps(conent, indent=2, ensure_ascii=False)
    with open('douban_dushu_top1.json', mode='a', encoding='utf-8') as json_file:
        json_file.write(json_text)


# Step 3 (csv): append records to the CSV output file
def write_to_file_csv(conent):
    """Append book records to ``douban_dushu_top1.csv``.

    The header row is written only when the file is new or empty, so calling
    this once per crawled page produces a single header followed by all rows.

    :param conent: iterable of rows. A row is either a mapping keyed by the
        column names ``title, href, score, num, scrible`` or a plain sequence
        in that column order (the shape ``parse_one_page`` produces). Missing
        trailing values of a sequence row are left empty; extra values are
        dropped.
    :return: None
    """
    header = ['title', 'href', 'score', 'num', 'scrible']
    # DictWriter needs mappings; pair positional rows with the column names.
    rows = [
        row if hasattr(row, 'keys') else dict(zip(header, row))
        for row in conent
    ]
    with open('douban_dushu_top1.csv', 'a', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, header)
        # A file opened for appending is positioned at its end, so offset 0
        # means there is no content (and therefore no header) yet.
        if f.tell() == 0:
            writer.writeheader()
        writer.writerows(rows)

# Step 3 (xls): create the workbook and write the initial rows
def write_excel_xls_hotal(path, sheet_name, content):
    """Create a new xls workbook with one sheet and save it to ``path``.

    :param path: file name the workbook is saved under.
    :param sheet_name: name of the single sheet added to the workbook.
    :param content: sequence of rows, each a sequence of cell values; row
        ``i`` / item ``j`` is written to cell ``(i, j)``.
    :return: None
    """
    book = xlwt.Workbook(encoding='utf-8')
    table = book.add_sheet(sheet_name)
    # Cell coordinates are simply the positions inside ``content``.
    for row_idx, row in enumerate(content):
        for col_idx, cell in enumerate(row):
            table.write(row_idx, col_idx, cell)
    book.save(path)
    print("xls格式表格写入数据成功!")

# Append rows to an existing workbook
def write_excel_xls_append(path, content):
    """Append ``content`` below the existing rows of the workbook's first sheet.

    The workbook at ``path`` is opened with xlrd, copied into a writable
    workbook via ``xlutils.copy.copy``, extended, and saved back to ``path``.

    :param path: file name of an existing xls workbook.
    :param content: sequence of rows, each a sequence of cell values.
    :return: None
    """
    source_book = xlrd.open_workbook(path)
    first_sheet_name = source_book.sheet_names()[0]
    # Number of rows already present; new rows start right after them.
    rows_old = source_book.sheet_by_name(first_sheet_name).nrows

    writable_book = copy(source_book)  # xlrd object -> writable xlwt object
    target_sheet = writable_book.get_sheet(0)
    for row_idx, row in enumerate(content, start=rows_old):
        for col_idx, cell in enumerate(row):
            target_sheet.write(row_idx, col_idx, cell)

    writable_book.save(path)
    print("xls格式表格【追加】写入数据成功!")

# Step 4: pagination
def main(offset):
    """Fetch and parse one page of the Douban book listing for the tag "编程".

    :param offset: zero-based page index; the ``start`` query parameter is
        ``offset * 20``, which is the only part of the URL that changes
        between pages.
    :return: the list of records produced by ``parse_one_page``, or an empty
        list when the page could not be downloaded.
    """
    url = 'https://book.douban.com/tag/%E7%BC%96%E7%A8%8B?start={}&type=T'.format(offset * 20)
    print(url)
    html = get_one_page(url)
    if html is None:
        # get_one_page reports every failure as None; handing that to the
        # parser would abort the whole crawl, so this page just yields no rows.
        return []
    return parse_one_page(html)


if __name__ == '__main__':
    # Export as plain rows into an xls workbook: create the file with the
    # header row first, then append the records of each crawled page.
    xls_path = 'douban_dushu_tab.xls'
    xls_sheet = 'douban_dushu_tab'
    header_rows = [['title', 'href', 'rate', 'evaluate', 'content']]

    write_excel_xls_hotal(xls_path, xls_sheet, header_rows)
    for page in range(6):
        page_items = main(page)
        write_excel_xls_append(xls_path, page_items)
        time.sleep(2)

    # Alternative exports (disabled). Each one loops over pages the same way,
    # sleeping 2 seconds between pages:
    #
    #   JSON: for every page, call write_to_file_json(item) for each item
    #         (or pass the whole page list in one call).
    #   CSV:  for every page, call write_to_file_csv(items).
    #   TXT:  for every page, call write_to_file_txt(item) for each item.







  • 1
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值