"""requests + bs4: scraping Douban book listings with CSS selectors (select).

Crawls Douban book tag pages, extracts each book's title, link, rating,
review count and description, and stores the data in txt/json/csv/xls formats.
"""
import requests
from requests.exceptions import RequestException
from lxml import etree
import time, json, csv, xlwt, xlrd
import pandas as pd
from xlutils.copy import copy
from bs4 import BeautifulSoup
# Step 1: define a function that fetches the page HTML
def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/91.0.4472.114 Mobile Safari/537.36',
            # 'Cookie':'bid=W55k4D_fSXM; __utmz=30149280.1625041982.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __gads=ID=8362a35481680256-22421ee50bca00f1:T=1625041983:RT=1625041983:S=ALNI_MZ0vPA34VtqtmD29r6pJirZIU8xWQ; __utmz=81379588.1625042017.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _gid=GA1.2.2054347723.1625042225; _ga=GA1.2.100943019.1625041982; __utma=30149280.100943019.1625041982.1625041982.1625125287.2; __utmc=30149280; __utmt_douban=1; __utma=81379588.444136571.1625042017.1625042017.1625125287.2; __utmc=81379588; __utmt=1; ap_v=0,6.0; _pk_ref.100001.3ac3=%5B%22%22%2C%22%22%2C1625125288%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3Dz7cq9P4PInnWSErPqMmM4Lb6ZcQs4UjgaqSRLtPLfprPNKb3wDCbQYp3ZwLXM4BG%26wd%3D%26eqid%3Dd7960ea80001906f0000000560dc2c5b%22%5D; _pk_ses.100001.3ac3=*; __utmb=30149280.4.10.1625125287; __utmb=81379588.4.10.1625125287; _pk_id.100001.3ac3=6f14de07186baf65.1625042017.2.1625125324.1625042755.; dbcl2="207739408:CqN+9NKe/JI"'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # print(response.text)
            with open('douban_dushu_tab.html', 'a', encoding='utf-8') as f:
                f.write(response.text)
            return response.text
        else:
            return None
    except RequestException:
        return None
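# Optional: a minimal sketch of a more robust fetcher using requests.Session with
# automatic retries. This function and its Retry parameters are illustrative
# assumptions, not part of the original post (urllib3 ships with requests).
def get_one_page_with_retries(url):
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry
    session = requests.Session()
    # retry up to 3 times on transient server errors, with exponential backoff
    retries = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
    session.mount('https://', HTTPAdapter(max_retries=retries))
    response = session.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
    return response.text if response.status_code == 200 else None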
# Step 2: define a function that parses the page with CSS selectors
def parse_one_page(html):
    # BeautifulSoup takes two arguments: markup (the content to parse) and
    # features (the parser type: lxml, html.parser, xml, html5lib)
    soup = BeautifulSoup(markup=html, features='lxml')
    # XPath equivalents of the selectors used below:
    # //li[@class="subject-item"]//h2/a/text()
    # //li[@class="subject-item"]//span[@class="rating_nums"]/text()
    # select() is the CSS-selector method
    books = soup.select('.subject-item')
    # print(books)
    time.sleep(3)
    result_lists = []
    for book in books:
        # print(book.select('h2 a')[0].text)
        # collapse all whitespace so the subtitle is merged into the title
        title = ''.join(book.select('h2 a')[0].text.split())
        # print(title)
        href = book.select('h2 a')[0]['href']
        # print(href)
        rate = book.select('.rating_nums')
        evaluate = book.select('.pl')[0].text.strip()
        # print(evaluate)
        # print(title, rate)
        content = book.select('p')
        # some books lack a rating or a description; fall back to a blank field
        rate = rate[0].text if len(rate) > 0 else ' '
        content = content[0].text if len(content) > 0 else ' '
        result_list = [title, href, rate, evaluate, content]
        # print(result_list)
        result_lists.append(result_list)
    print(result_lists)
    return result_lists
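# A minimal XPath-based variant of the parser, using the two XPath expressions
# noted in the comments above. This is a sketch, not part of the original post:
# it only pairs titles with ratings, and zip() misaligns when a book has no
# rating; the BeautifulSoup version above handles that case explicitly.
def parse_one_page_xpath(html):
    tree = etree.HTML(html)
    titles = [t.strip() for t in tree.xpath('//li[@class="subject-item"]//h2/a/text()') if t.strip()]
    rates = tree.xpath('//li[@class="subject-item"]//span[@class="rating_nums"]/text()')
    return list(zip(titles, rates))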
# Step 3: write results to a txt file
def write_to_file_txt(content):
    with open('douban_dushu_top1.txt', 'a', encoding='utf-8') as f:
        # save as a plain list:
        # f.write(','.join(content))
        # f.write('\n' + '=' * 50 + '\n')
        # save as JSON text:
        f.write(json.dumps(content, indent=2, ensure_ascii=False))
# Step 3: write results to a json file
def write_to_file_json(content):
    with open('douban_dushu_top1.json', 'a', encoding='utf-8') as f:
        # ensure_ascii=False keeps Chinese characters readable in the output
        f.write(json.dumps(content, indent=2, ensure_ascii=False))
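# Note: appending json.dumps output on every call produces a file of several
# concatenated JSON documents, which json.load cannot parse back. A common
# alternative (a sketch, not from the original post) is JSON Lines: one record
# per line, each line independently parseable.
def write_to_file_jsonl(record):
    with open('douban_dushu_top1.jsonl', 'a', encoding='utf-8') as f:
        f.write(json.dumps(record, ensure_ascii=False) + '\n')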
# Step 3: write results to a csv file
def write_to_file_csv(content):
    '''
    :param content: list of [title, href, rate, evaluate, content] rows
    :return: None
    '''
    # writing via pandas adds an index column; it also accepts dict-like input
    '''df = pd.DataFrame(content)
    df.to_csv('douban_dushu_top1.csv', encoding='utf-8')
    '''
    # write rows as dicts
    header = ['title', 'href', 'score', 'num', 'scrible']
    with open('douban_dushu_top1.csv', 'a', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=header)
        writer.writeheader()
        # parse_one_page returns lists, so map each row onto the header first
        writer.writerows(dict(zip(header, row)) for row in content)
    # write rows as plain lists
    '''header = ['title', 'href', 'score', 'num', 'scrible']
    with open('douban_dushu_top.csv', 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, dialect='excel')
        writer.writerow(header)
        for item in content:
            writer.writerow(item)
    '''
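# Example call with a hypothetical sample row (the URL and values are
# placeholders, matching the [title, href, rate, evaluate, content] layout
# that parse_one_page returns):
# write_to_file_csv([['书名', 'https://book.douban.com/subject/xxxx/',
#                     '9.0', '(1000人评价)', '...']])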
# Step 3: write results to an excel (.xls) file
def write_excel_xls(path, sheet_name, content):
    index = len(content)  # number of rows to write
    # create a new workbook
    workbook = xlwt.Workbook(encoding='utf-8')
    # add a new sheet to the workbook
    sheet = workbook.add_sheet(sheet_name)
    # write the data cell by cell (row i, column j)
    for i in range(0, index):
        for j in range(0, len(content[i])):
            sheet.write(i, j, content[i][j])
    # save the workbook
    workbook.save(path)
    print("xls data written successfully!")
# append data to an existing xls file
def write_excel_xls_append(path, content):
    index = len(content)  # number of rows to write
    workbook = xlrd.open_workbook(path)  # open the workbook
    sheets = workbook.sheet_names()  # list all sheets in the workbook
    worksheet = workbook.sheet_by_name(sheets[0])  # get the first sheet
    rows_old = worksheet.nrows  # number of rows already present
    new_workbook = copy(workbook)  # convert the xlrd object into an xlwt one
    new_worksheet = new_workbook.get_sheet(0)  # first sheet of the converted workbook
    for i in range(0, index):
        for j in range(0, len(content[i])):
            new_worksheet.write(i + rows_old, j, content[i][j])  # start writing at row i + rows_old
    new_workbook.save(path)  # save the workbook
    print("xls data appended successfully!")
# Step 4: pagination
def main(offset):
    # each page follows the same pattern; only the offset in the start parameter changes
    url = 'https://book.douban.com/tag/%E7%BC%96%E7%A8%8B?start={}&type=T'.format(offset * 20)
    print(url)
    html = get_one_page(url)
    items = parse_one_page(html)
    # save to csv as dicts
    # write_to_file_csv(items)
    return items
if __name__ == '__main__':
    # write to excel as lists of rows
    book_name_xls = 'douban_dushu_tab.xls'
    sheet_name_xls = 'douban_dushu_tab'
    value_title = [['title', 'href', 'rate', 'evaluate', 'content'], ]
    write_excel_xls(book_name_xls, sheet_name_xls, value_title)
    for i in range(6):
        item = main(i)
        write_excel_xls_append(book_name_xls, item)
        time.sleep(2)
    # save as json
    '''for i in range(2):
        items = main(i)
        # write_to_file_json(items)  # save as one list
        for item in items:  # save record by record
            write_to_file_json(item)
        time.sleep(2)
    '''
    # save as csv
    '''for i in range(2):
        items = main(i)
        write_to_file_csv(items)  # save as lists or dicts
        time.sleep(2)
    '''
    # save as txt
    '''for i in range(2):
        items = main(i)
        for item in items:
            write_to_file_txt(item)  # save as lists or dicts
        time.sleep(2)
    '''