专栏集合(豆瓣时间)数据爬取:
- 引入 requests 包
- 添加 url、UA
- 创建 requests.get 请求获取响应的 ajax 数据
- 输入文件名,并将数据持久化
import json
import requests
import requests
if __name__ == '__main__':
    # Fetch a single Douban Time collection from the mobile AJAX endpoint and
    # persist the JSON payload to a file whose base name is typed by the user.
    #
    # Another endpoint variant, kept for reference:
    # url = 'https://m.douban.com/rexxar/api/v2/niffler/collection/all?for_mobile=1'
    url = 'https://m.douban.com/rexxar/api/v2/niffler/collection/18?for_mobile=1'
    # Send a desktop-Chrome User-Agent string with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
    }
    kw = input('enter a word:')
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url=url, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of saving an error payload as if it
    # were the collection data.
    response.raise_for_status()
    text_json = response.json()
    filename = kw + '.json'
    # The context manager guarantees the file is flushed and closed.
    with open(filename, 'w', encoding='utf-8') as fp:
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        json.dump(text_json, fp=fp, ensure_ascii=False)
    print(filename, '保存成功')
观察后发现,该页面的接口 URL 存在规律:只有 collection 后面的编号不同。因此可以遍历编号,爬取整个专栏的数据
import json
import requests
import requests
if __name__ == '__main__':
    # Sweep the collection ids of the Douban Time "columns" endpoint and save
    # each successful JSON response to '<id>.json' in the working directory.
    #
    # Sample URLs that were observed; only the collection id differs:
    # url = 'https://m.douban.com/rexxar/api/v2/niffler/collection/18?for_mobile=1'
    # url = 'https://m.douban.com/rexxar/api/v2/niffler/collection/26/columns?start=0&count=20&for_mobile=1'
    # url = 'https://m.douban.com/rexxar/api/v2/niffler/collection/24/columns?start=0&count=20&for_mobile=1'
    # url = 'https://m.douban.com/rexxar/api/v2/niffler/collection/31/columns?start=0&count=20&for_mobile=1'
    # url = 'https://m.douban.com/rexxar/api/v2/niffler/collection/7/columns?start=0&count=20&for_mobile=1'
    # url = 'https://m.douban.com/rexxar/api/v2/niffler/collection/58/columns?start=0&count=20&for_mobile=1'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
    }
    # Highest collection id among the sample URLs above. The upper bound is
    # inclusive so that id 58 itself is fetched (range(58) stopped at 57).
    max_collection_id = 58
    for i in range(max_collection_id + 1):
        url = 'https://m.douban.com/rexxar/api/v2/niffler/collection/%d/columns?start=0&count=20&for_mobile=1' % i
        print(url)
        try:
            # A timeout keeps one stalled request from blocking the sweep.
            response = requests.get(url=url, headers=headers, timeout=10)
        except requests.RequestException as exc:
            # One failed request should not abort the remaining ids.
            print(url, '请求失败:', exc)
            continue
        # Ids that do not answer with a success status are skipped.
        if not response.ok:
            continue
        text_json = response.json()
        filename = '%d.json' % i
        # The context manager closes each file instead of leaking one
        # handle per iteration.
        with open(filename, 'w', encoding='utf-8') as fp:
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            json.dump(text_json, fp=fp, ensure_ascii=False)
        print(filename, '保存成功')
后续考虑存到数据库