Scraping the latest literary classics data from Douban Read and saving it as a CSV file
import requests
import json
import csv
# Request URL. Note: this is a POST request; the data payload it sends is what actually retrieves the data
url="https://read.douban.com/j/kind/"
# Browser request headers
header={"accept":"application/json",
"Accept-Encoding":"gzip, deflate, br",
"Accept-Language":"zh-CN,zh;q=0.8",
"Connection":"keep-alive",
"Content-Length":"1858",
"content-type":"application/json",
"Cookie":‘’; _pk_ref.100001.a7dd=%5B%22%22%2C%22%22%2C1560518698%2C%22https%3A%2F%2Fbook.douban.com%2Fsubject_search%3Fsearch_text%3D9787559817518%22%5D; ap_v=0,6.0; viewed="33426127_33372274_25942191"; gr_user_id=9f63c97f-ea15-407e-ac6b-bd0e7ea432b3; _vwo_uuid_v2=D23C4C0A76D2A3B516584D2AD59F5C3CB|31b91bf71b6938a3e9a81ebfb55d2d57; __utma=30149280.762004359.1543222766.1560518709.1560523365.8; __utmb=30149280.0.10.1560523365; __utmc=30149280; __utmz=30149280.1560523365.8.7.utmcsr=book.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/latest; _ga=GA1.3.762004359.1543222766; _gid=GA1.3.1719648228.1560513336; _pk_id.100001.a7dd=f0fd060f8031ec8b.1560513337.2.1560523560.1560516232.; _pk_ses.100001.a7dd=*',
"Host":"read.douban.com",
"Origin":"https://read.douban.com",
"Referer":"https://read.douban.com/category/?sort=rating&kind=1",
"User-Agent":‘’,
"x-csrf-token":"null",
"x-requested-with":"XMLHttpRequest"}
# Function that fetches the JSON data for one page
def get_content(url, data, header):
    html = requests.post(url, data=json.dumps(data), headers=header)
    da = html.content.decode('utf8')
    data_json = json.loads(da)
    print(data_json)
    content = data_json
    return content
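# A more defensive variant of get_content() (my own sketch, not part of the original
# script): fail fast on HTTP errors and fall back to an empty result list when the body
# is not valid JSON, so a single bad page does not abort the whole crawl.
def get_content_safe(url, data, header):
    resp = requests.post(url, data=json.dumps(data), headers=header)
    resp.raise_for_status()          # raise immediately on a 4xx/5xx response
    try:
        return resp.json()           # same dict that get_content() returns
    except ValueError:               # body was not JSON (e.g. an error page)
        return {"list": []}          # shape matches what save() expects below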
# Function that writes one page of results into the CSV file
def save(content, a):
    base_url = 'https://read.douban.com'
    with open(a, 'a+', encoding='utf8', newline='') as fw:
        doc = csv.writer(fw)
        content_list = content['list']
        n = len(content_list)
        # handle the case where author / origAuthor is an empty list
        for i in range(0, n):
            if content_list[i]['author'] == [] and content_list[i]['origAuthor'] == []:
                author = "暂无信息"  # i.e. "no information available"
            elif content_list[i]['author'] == []:
                author = content_list[i]['origAuthor'][0]['name']
            else:
                author = content_list[i]['author'][0]['name']
            # title, author, cover URL, work URL joined onto base_url, abstract, word count + unit
            doc.writerow([content_list[i]['title'], author, content_list[i]['cover'], base_url + str(content_list[i]['url']), content_list[i]['abstract'], str(content_list[i]['wordCount']) + str(content_list[i]['wordCountUnit'])])
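# Optional helper (my addition, not in the original post): write a header row once before
# crawling so the CSV columns are labelled; the column order mirrors the writerow() above.
def write_csv_header(a):
    with open(a, 'w', encoding='utf8', newline='') as fw:
        csv.writer(fw).writerow(['title', 'author', 'cover', 'url', 'abstract', 'wordCount'])
# Call write_csv_header("小说.csv") once before the loop below if you want labelled columns.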
n=1
while n <= 146:
    # the data payload is the important part: these are the parameters sent with the POST request
    data={"sort":"new","page":n,"kind":131,"query":" query getFilterWorksList($works_ids: [ID!]) { worksList(worksIds: $works_ids) { title cover url isBundle url title author { name url } origAuthor { name url } translator { name url } abstract editorHighlight isOrigin kinds { name @skip(if: true) shortName @include(if: true) id } ... on WorksBase @include(if: true) { wordCount wordCountUnit } ... on WorksBase @include(if: false) { isEssay ... on EssayWorks { favorCount } isNew averageRating ratingCount url } ... on WorksBase @include(if: true) { isColumn isEssay onSaleTime ... on ColumnWorks { updateTime } } ... on WorksBase @include(if: true) { isColumn ... on ColumnWorks { isFinished } } ... on EssayWorks { essayActivityData { title uri tag { name color background icon2x icon3x iconSize { height } iconPosition { x y } } } } highlightTags { name } ... on WorksBase @include(if: false) { fixedPrice salesPrice isRebate } ... on EbookWorks { fixedPrice salesPrice isRebate } ... on WorksBase @include(if: true) { ... on EbookWorks { id isPurchased isInWishlist } } id isOrigin } } ","variables":{}}
    content = get_content(url, data, header)
    save(content, "小说.csv")
    n += 1
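# Quick sanity check after the crawl (my addition): read the finished CSV back and count
# how many rows were written; this should roughly equal 146 pages times the items per page.
with open("小说.csv", encoding='utf8', newline='') as fr:
    row_count = len(list(csv.reader(fr)))
print("rows written:", row_count)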
The scraped data looks like the screenshot below.
The main challenge in this crawl was sending parameters with a POST request: the tricky parts are working out the data payload that has to be sent, and the fact that the request URL is not the address shown in the browser's address bar.
I hope this write-up is helpful!