我们要爬取的是豆瓣电影上的所有电影信息
访问网址https://movie.douban.com/ 选择更多电影
跳转到https://movie.douban.com/explore#!type=movie&tag=热门&sort=recommend&page_limit=20&page_start=0
经分析获取信息的链接为
https://movie.douban.com/j/search_subjects?
type=视频类型&
tag=内容分类&
sort=视频排序规则&
page_limit=一次返回多少条数据&
page_start=起始偏移量(每点击一次“加载更多”,该参数按20递增:0、20、40……)
tag:
sort:
page_start:
链接为 https://movie.douban.com/j/search_subjects?type=movie&tag=恐怖&sort=recommend&page_limit=20&page_start=0
代码思路很清晰
1.首先构造链接
2.发送请求,获取响应
3.提取数据
4.保存
5.构造下一页的url页面,循环~
每一步对应相应的函数
代码:
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
import requests
import json
class DoubanSpider:
    """Crawl one Douban movie category page by page and save it as JSON lines.

    The spider formats ``url_temp`` with an increasing ``page_start`` offset,
    downloads each page, extracts the ``subjects`` list from the JSON reply
    and appends every entry to ``douban.txt``.
    """

    # Entries requested per page. Must match ``page_limit`` in ``url_temp``,
    # because ``run`` treats a shorter page as the last one.
    PAGE_LIMIT = 20
    # Seconds to wait for the server; without a timeout a stalled connection
    # would block the crawl forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # ``tag=%E6%81%90%E6%80%96`` is the percent-encoded tag "恐怖";
        # the trailing ``{}`` is filled with the page_start offset.
        self.url_temp = "https://movie.douban.com/j/search_subjects?type=movie&tag=%E6%81%90%E6%80%96&sort=recommend&page_limit=20&page_start={}"
        # Headers sent with every request.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"}

    def parse_url(self, url):
        """Send a GET request to *url* and return the decoded response body.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status.
            requests.Timeout: if no answer arrives within REQUEST_TIMEOUT.
        """
        print(url)
        response = requests.get(
            url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        print(response)
        # Fail loudly on an error status instead of handing an error page
        # to the JSON parser.
        response.raise_for_status()
        return response.content.decode()

    def get_content_list(self, json_str):
        """Parse *json_str* and return the list stored under ``"subjects"``.

        Raises:
            json.JSONDecodeError: if *json_str* is not valid JSON.
            KeyError: if the decoded object has no ``"subjects"`` key.
        """
        dict_ret = json.loads(json_str)
        return dict_ret["subjects"]

    def save_content_list(self, content_list):
        """Append every entry of *content_list* to ``douban.txt``, one JSON
        document per line."""
        with open("douban.txt", "a", encoding="utf-8") as f:
            for content in content_list:
                # ensure_ascii=False keeps Chinese text readable in the file.
                f.write(json.dumps(content, ensure_ascii=False))
                f.write("\n")
        print("保存成功")

    def run(self):
        """Crawl page after page until a page shorter than PAGE_LIMIT arrives."""
        num = 0
        while True:
            # 1. build the URL of the current page
            url = self.url_temp.format(num)
            # 2. send the request and read the response
            json_str = self.parse_url(url)
            # 3. extract the data
            content_list = self.get_content_list(json_str)
            # 4. save it
            self.save_content_list(content_list)
            # A short page means there is nothing left to load.
            if len(content_list) < self.PAGE_LIMIT:
                break
            # 5. move on to the next page
            num += self.PAGE_LIMIT
if __name__ == '__main__':
    # Script entry point: build a spider and crawl until the last page.
    DoubanSpider().run()
部分内容:
{"rate": "6.3", "cover_x": 960, "title": "寂静之地", "url": "https://movie.douban.com/subject/26997663/", "playable": true, "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2518852413.jpg", "id": "26997663", "cover_y": 1500, "is_new": false}
{"rate": "7.3", "cover_x": 2764, "title": "小丑回魂", "url": "https://movie.douban.com/subject/3604148/", "playable": false, "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2452537144.jpg", "id": "3604148", "cover_y": 4096, "is_new": false}
{"rate": "6.1", "cover_x": 950, "title": "昆池岩", "url": "https://movie.douban.com/subject/26945085/", "playable": false, "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2513360824.jpg", "id": "26945085", "cover_y": 1361, "is_new": false}
{"rate": "7.1", "cover_x": 2024, "title": "遗传厄运", "url": "https://movie.douban.com/subject/27621727/", "playable": false, "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2518865763.jpg", "id": "27621727", "cover_y": 3000, "is_new": false}
{"rate": "7.9", "cover_x": 694, "title": "解除好友2:暗网", "url": "https://movie.douban.com/subject/26725678/", "playable": false, "cover": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2525020357.jpg", "id": "26725678", "cover_y": 1000, "is_new": false}
{"rate": "6.8", "cover_x": 1327, "title": "电锯惊魂8:竖锯", "url": "https://movie.douban.com/subject/25788426/", "playable": false, "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2499761124.jpg", "id": "25788426", "cover_y": 2048, "is_new": false}
{"rate": "5.9", "cover_x": 2764, "title": "修女", "url": "https://movie.douban.com/subject/26825664/", "playable": false, "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2528357976.jpg", "id": "26825664", "cover_y": 4096, "is_new": false}
{"rate": "7.3", "cover_x": 2000, "title": "寄宿学校", "url": "https://movie.douban.com/subject/30201003/", "playable": false, "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2533613106.jpg", "id": "30201003", "cover_y": 3000, "is_new": false}
{"rate": "7.3", "cover_x": 1126, "title": "异形:契约", "url": "https://movie.douban.com/subject/11803087/", "playable": true, "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2459944375.jpg", "id": "11803087", "cover_y": 1600, "is_new": false}
{"rate": "6.5", "cover_x": 1382, "title": "安娜贝尔2:诞生", "url": "https://movie.douban.com/subject/26644205/", "playable": false, "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2465524814.jpg", "id": "26644205", "cover_y": 2048, "is_new": false}
{"rate": "7.1", "cover_x": 1976, "title": "忌日快乐", "url": "https://movie.douban.com/subject/27027913/", "playable": true, "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2510057340.jpg", "id": "27027913", "cover_y": 3000, "is_new": false}
{"rate": "7.0", "cover_x": 1328, "title": "阴风阵阵", "url": "https://movie.douban.com/subject/3095514/", "playable": false, "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2532759044.jpg", "id": "3095514", "cover_y": 2048, "is_new": false}
{"rate": "7.1", "cover_x": 663, "title": "鬼故事", "url": "https://movie.douban.com/subject/26721644/", "playable": false, "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2513009586.jpg", "id": "26721644", "cover_y": 1000, "is_new": false}
{"rate": "6.5", "cover_x": 2416, "title": "异星觉醒", "url": "https://movie.douban.com/subject/26718838/", "playable": true, "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2456571580.jpg", "id": "26718838", "cover_y": 3499, "is_new": false}
{"rate": "6.0", "cover_x": 3158, "title": "月光光心慌慌", "url": "https://movie.douban.com/subject/4017146/", "playable": false, "cover": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2519765769.jpg", "id": "4017146", "cover_y": 5000, "is_new": false}
{"rate": "6.9", "cover_x": 740, "title": "噩梦娃娃屋", "url": "https://movie.douban.com/subject/27012731/", "playable": false, "cover": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2514997577.jpg", "id": "27012731", "cover_y": 1000, "is_new": false}
{"rate": "6.1", "cover_x": 1293, "title": "潜伏4:锁命亡灵", "url": "https://movie.douban.com/subject/26794994/", "playable": false, "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2498335046.jpg", "id": "26794994", "cover_y": 2048, "is_new": false}
{"rate": "8.4", "cover_x": 1429, "title": "摄影机不要停!", "url": "https://movie.douban.com/subject/30234315/", "playable": false, "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2541824676.jpg", "id": "30234315", "cover_y": 2048, "is_new": false}
{"rate": "6.0", "cover_x": 1476, "title": "失眠", "url": "https://movie.douban.com/subject/26748223/", "playable": false, "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2497145485.jpg", "id": "26748223", "cover_y": 2067, "is_new": false}
{"rate": "6.5", "cover_x": 2007, "title": "报告老师!怪怪怪怪物!", "url": "https://movie.douban.com/subject/26720627/", "playable": false, "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2460047543.jpg", "id": "26720627", "cover_y": 2858, "is_new": false}
{"rate": "6.8", "cover_x": 841, "title": "诡怪疑云", "url": "https://movie.douban.com/subject/27188152/", "playable": false, "cover": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2532529059.jpg", "id": "27188152", "cover_y": 1181, "is_new": false}
{"rate": "6.5", "cover_x": 1434, "title": "红衣小女孩2", "url": "https://movie.douban.com/subject/26928198/", "playable": false, "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2447040173.jpg", "id": "26928198", "cover_y": 2048, "is_new": false}
{"rate": "7.3", "cover_x": 1415, "title": "马柔本宅秘事", "url": "https://movie.douban.com/subject/26961821/", "playable": true, "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2456908524.jpg", "id": "26961821", "cover_y": 2048, "is_new": false}
{"rate": "6.2", "cover_x": 1012, "title": "杰罗德游戏", "url": "https://movie.douban.com/subject/26602795/", "playable": false, "cover": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2499048409.jpg", "id": "26602795", "cover_y": 1500, "is_new": false}
{"rate": "6.5", "cover_x": 2150, "title": "生化危机:复仇", "url": "https://movie.douban.com/subject/26876342/", "playable": true, "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2464409926.jpg", "id": "26876342", "cover_y": 3041, "is_new": false}
{"rate": "6.0", "cover_x": 1895, "title": "真心话大冒险", "url": "https://movie.douban.com/subject/26998315/", "playable": true, "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2509632893.jpg", "id": "26998315", "cover_y": 3000, "is_new": false}
{"rate": "5.8", "cover_x": 976, "title": "鬼话怪谈·祥云寺", "url": "https://movie.douban.com/subject/26903992/", "playable": false, "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2434618486.jpg", "id": "26903992", "cover_y": 1280, "is_new": false}
{"rate": "6.1", "cover_x": 1011, "title": "黑森灵", "url": "https://movie.douban.com/subject/27078344/", "playable": false, "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2512008984.jpg", "id": "27078344", "cover_y": 1500, "is_new": false}
{"rate": "8.0", "cover_x": 706, "title": "狼屋", "url": "https://movie.douban.com/subject/26754734/", "playable": false, "cover": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2534473847.jpg", "id": "26754734", "cover_y": 1000, "is_new": false}
{"rate": "6.4", "cover_x": 793, "title": "黑夜吞噬世界", "url": "https://movie.douban.com/subject/30136177/", "playable": false, "cover": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2512382777.jpg", "id": "30136177", "cover_y": 1080, "is_new": false}
{"rate": "4.8", "cover_x": 882, "title": "午夜凶铃3(美版)", "url": "https://movie.douban.com/subject/1584991/", "playable": true, "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2376001912.jpg", "id": "1584991", "cover_y": 1377, "is_new": false}
{"rate": "5.1", "cover_x": 1500, "title": "活尸电梯", "url": "https://movie.douban.com/subject/27089293/", "playable": false, "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2528505860.jpg", "id": "27089293", "cover_y": 2143, "is_new": false}
{"rate": "6.3", "cover_x": 1920, "title": "使徒", "url": "https://movie.douban.com/subject/26908051/", "playable": false, "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2545835930.jpg", "id": "26908051", "cover_y": 2844, "is_new": false}
{"rate": "6.7", "cover_x": 1500, "title": "辣手保姆", "url": "https://movie.douban.com/subject/26704621/", "playable": false, "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2500890954.jpg", "id": "26704621", "cover_y": 2222, "is_new": false}
{"rate": "5.2", "cover_x": 2071, "title": "温彻斯特", "url": "https://movie.douban.com/subject/3387688/", "playable": false, "cover": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2512558317.jpg", "id": "3387688", "cover_y": 3000, "is_new": false}
{"rate": "5.4", "cover_x": 1383, "title": "人皮脸", "url": "https://movie.douban.com/subject/25958867/", "playable": false, "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2499167862.jpg", "id": "25958867", "cover_y": 2048, "is_new": false}
{"rate": "5.9", "cover_x": 1382, "title": "黑夜造访", "url": "https://movie.douban.com/subject/26815381/", "playable": false, "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2424852564.jpg", "id": "26815381", "cover_y": 2047, "is_new": false}
{"rate": "5.6", "cover_x": 1000, "title": "猖獗", "url": "https://movie.douban.com/subject/26975011/", "playable": false, "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2534697355.jpg", "id": "26975011", "cover_y": 1425, "is_new": false}
{"rate": "5.4", "cover_x": 690, "title": "救僵清道夫", "url": "https://movie.douban.com/subject/26759783/", "playable": true, "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2416595976.jpg", "id": "26759783", "cover_y": 1009, "is_new": false}
{"rate": "6.0", "cover_x": 1406, "title": "灵蚀", "url": "https://movie.douban.com/subject/27126455/", "playable": false, "cover": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2499576913.jpg", "id": "27126455", "cover_y": 2000, "is_new": false}
如果想同时获取不同类型的电影数据,我们可以修改url_temp将其变为一个列表
# Belongs inside DoubanSpider.__init__: one URL template per category tag.
# The trailing "{}" of each template is filled with the page_start offset.
self.url_temp_list = [
    "https://movie.douban.com/j/search_subjects?type=movie&tag="
    + category
    + "&sort=recommend&page_limit=20&page_start={}"
    for category in ("科幻", "喜剧", "恐怖")
]
def run(self):
    """Crawl every URL template in ``self.url_temp_list`` to its last page.

    Each category is paged independently: its offset starts at 0 and grows
    by one page until the server returns fewer entries than a full page.
    """
    page_limit = 20  # must match page_limit in the URL templates
    for url_temp in self.url_temp_list:
        # Every category gets its own offset and its own stop condition.
        # With one shared counter, a ``break`` inside the inner loop only
        # skipped the remaining categories and the outer loop never ended.
        num = 0
        while True:
            # 1. build the URL of the current page
            url = url_temp.format(num)
            # 2. send the request and read the response
            json_str = self.parse_url(url)
            # 3. extract the data
            content_list = self.get_content_list(json_str)
            # 4. save it
            self.save_content_list(content_list)
            # A short page means this category is exhausted.
            if len(content_list) < page_limit:
                break
            # 5. move on to the next page of this category
            num += page_limit
如果想在文件中同时保存每条数据对应的类型,可以用正则从链接中匹配出 tag 再添加为字段,或者
把 url_temp 改为由多个字典组成的序列,每个字典同时保存链接模板和对应的 tag
# Belongs inside DoubanSpider.__init__: a tuple with one dict per category,
# pairing the URL template ("url_temp") with the tag it requests ("tag").
self.url_temp = tuple(
    {
        "url_temp": "https://movie.douban.com/j/search_subjects?type=movie&tag="
        + category
        + "&sort=recommend&page_limit=20&page_start={}",
        "tag": category,
    }
    for category in ("科幻", "喜剧", "恐怖")
)
def save_content_list(self, content_list, tag):
    """Append every entry of *content_list* to ``douban.txt`` with its tag.

    Each entry is written as one JSON document per line, extended with a
    ``"tag"`` field set to *tag*. The dicts passed in are left untouched:
    the field is added to a copy, so saving has no side effect on the
    caller's data.
    """
    with open("douban.txt", "a", encoding="utf-8") as f:
        for content in content_list:
            # Tag a shallow copy rather than the caller's dict.
            tagged = dict(content, tag=tag)
            # ensure_ascii=False keeps Chinese text readable in the file.
            f.write(json.dumps(tagged, ensure_ascii=False))
            f.write("\n")
    print("保存成功")
遍历这些字典(字典本身不能调用 format,需要先取出其中的 url_temp 字符串再 format)
def run(self):
    """Crawl every category described in ``self.url_temp`` to its last page.

    ``self.url_temp`` is iterated as a sequence of dicts, each providing a
    ``"url_temp"`` template and the ``"tag"`` stored with its entries. Each
    category is paged independently until the server returns fewer entries
    than a full page.
    """
    page_limit = 20  # must match page_limit in the URL templates
    for url_temp in self.url_temp:
        # Every category gets its own offset and its own stop condition.
        # With one shared counter, a ``break`` inside the inner loop only
        # skipped the remaining categories and the outer loop never ended.
        num = 0
        while True:
            # 1. build the URL of the current page
            url = url_temp["url_temp"].format(num)
            # 2. send the request and read the response
            json_str = self.parse_url(url)
            # 3. extract the data
            content_list = self.get_content_list(json_str)
            # 4. save it together with the category tag
            self.save_content_list(content_list, url_temp["tag"])
            # A short page means this category is exhausted.
            if len(content_list) < page_limit:
                break
            # 5. move on to the next page of this category
            num += page_limit