如果爬取豆瓣电影时,IP被封,就连接其他wifi爬,比如手机热点。或者可以使用付费代理IP。
爬取豆瓣电影中:华语、欧美、韩国、日本电影每个标签下按评价排序的全部电影。需要如下信息:
(1)每个电影的电影名、导演、编剧、主演、类型、国家、上映日期、片长,电影评分,以及每个星级评分的百分比数据。
(2)每个电影热门点评中的前100个评分及其评分人。
(3)进入每个评分人的主页,爬取其看过的电影信息,以及对电影的评分。(少于300部则全部爬取,多于300部则仅爬取前300个)
用Scrapy框架写
items.py文件
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class MovieInfo(scrapy.Item):
    """One movie's detail-page record: title, credits, metadata and rating stats."""
    name = scrapy.Field()       # movie title
    director = scrapy.Field()   # director name
    adaptor = scrapy.Field()    # screenwriter(s)
    rolename = scrapy.Field()   # starring actors
    type = scrapy.Field()       # genre list
    country = scrapy.Field()    # production country/region
    date = scrapy.Field()       # release date(s)
    length = scrapy.Field()     # runtime
    grade = scrapy.Field()      # overall rating
    startrate = scrapy.Field()  # per-star rating percentages
    url = scrapy.Field()        # detail-page URL
class MovieContent(scrapy.Item):
    """One hot comment on a movie: page title, commenter name, and their rating."""
    title = scrapy.Field()   # movie page title
    people = scrapy.Field()  # commenter's display name
    grade = scrapy.Field()   # star-rating title text, or 'nothing' if unrated
class MovieSelf(scrapy.Item):
    """A commenter's profile data: the movies they have watched and their grades."""
    user = scrapy.Field()       # profile page title (user identity)
    movie = scrapy.Field()      # titles of movies the user has watched
    selfgrade = scrapy.Field()  # user's own grade for each movie
douban.py文件
# -*- coding: utf-8 -*-
import scrapy
import json
from NewDouBan.items import MovieInfo,MovieContent,MovieSelf
from urllib import urlencode
import time
class NewDouBan(scrapy.Spider):
    """Crawl Douban movies: detail pages, hot comments, and each commenter's watched list.

    Flow: listing JSON endpoint -> parse -> per-movie detail page (parse_item)
    -> comment page (parse_item2) -> commenter profile (parse_item3).
    """
    name = 'douban'
    allowed_domains = ['douban.com']
    # Query for the rank-sorted movie listing JSON endpoint.
    data = {
        'type': 'movie',
        'tag': '华语',
        'sort': 'rank',
        'page_limit': 20,
        'page_start': 0
    }
    start_urls = ['https://movie.douban.com/j/search_subjects?' + urlencode(data)]

    def parse(self, response):
        """Parse the listing endpoint; follow every movie and the next page."""
        subjects = json.loads(response.text)["subjects"]
        for subject in subjects:
            yield scrapy.Request(subject['url'], callback=self.parse_item)
        # The endpoint answers an empty "subjects" list past the last page, so
        # keep paging while results come back (original only fetched page 0).
        if subjects:
            next_start = response.meta.get('page_start', 0) + self.data['page_limit']
            query = dict(self.data, page_start=next_start)
            yield scrapy.Request(
                'https://movie.douban.com/j/search_subjects?' + urlencode(query),
                meta={'page_start': next_start},
                callback=self.parse)

    def parse_item(self, response):
        """Extract one movie's detail page into a MovieInfo item, then follow its comments."""
        info = "//div[@id='info']"
        item = MovieInfo()
        # extract_first('') avoids IndexError when a node is missing.
        item['name'] = response.xpath("//h1/span[@property]/text()").extract_first('')
        item['rolename'] = response.xpath(info + "//span[@class]/a[@rel='v:starring']/text()").extract()
        item['director'] = response.xpath(info + "//span[@class='attrs']/a[@rel]/text()").extract_first('')
        item['type'] = response.xpath(info + "//span[@property='v:genre']/text()").extract()
        # NOTE(review): country has no property-tagged span; taking the text node
        # after the "制片国家/地区:" label — verify against the live page layout.
        item['country'] = response.xpath(
            info + u"//span[text()='制片国家/地区:']/following-sibling::text()[1]").extract_first('').strip()
        item['date'] = response.xpath(info + "//span[@property='v:initialReleaseDate']/text()").extract()
        item['length'] = response.xpath(info + "//span[@property='v:runtime']/text()").extract()
        item['grade'] = response.xpath("//strong/text()").extract()
        raw_rates = response.xpath("//div[@class='ratings-on-weight']/div/span/text()").extract()
        # Strip layout whitespace from the per-star percentage strings.
        item['startrate'] = [r.replace("\n", "").replace(" ", "") for r in raw_rates]
        item['url'] = response.url
        yield item  # was commented out: MovieInfo never reached the pipeline
        yield scrapy.Request(item['url'] + "comments?status=P", callback=self.parse_item2)

    def parse_item2(self, response):
        """Extract hot comments (one MovieContent per comment) and follow each commenter."""
        title = response.xpath("//title/text()").extract_first('')
        # Iterate per comment-item instead of once over the whole #comments
        # container, which only ever produced the first comment.
        for comment in response.xpath("//div[@id='comments']//div[@class='comment-item']"):
            item2 = MovieContent()
            item2['title'] = title
            item2['people'] = comment.xpath(".//span[@class='comment-info']//a/text()").extract_first('')
            grade = comment.xpath(".//span[@class='comment-info']//span[2]/@title").extract_first('')
            # An unrated comment carries a date ("2018-01-01") in this slot.
            item2['grade'] = 'nothing' if grade.find("-") > 0 else grade
            yield item2  # was commented out: comments never reached the pipeline
        # Follow each commenter's profile straight from the extracted hrefs
        # (replaces the fragile str(list)/split/replace round-trip).
        for user_url in response.xpath("//div[@id='comments']//div[@class='avatar']/a/@href").extract():
            yield scrapy.Request(user_url, callback=self.parse_item3)

    def parse_item3(self, response):
        """Extract a commenter's watched-movies list into a MovieSelf item."""
        item3 = MovieSelf()
        item3['user'] = response.xpath("//title/text()").extract()
        # NOTE(review): per-movie grades are not extracted yet; placeholder kept
        # until the profile-page rating markup is confirmed.
        item3['selfgrade'] = 'nothing'
        item3['movie'] = response.xpath("//div[@id='movie']//div[@class='obssin'][2]/ul/li/a/@title").extract()
        yield item3
pipelines.py 文件(管道文件)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
class NewdoubanPipeline(object):
    """Route each item kind to its own JSON-lines file.

    Item kind is detected by a distinguishing field: MovieInfo carries 'type',
    MovieContent carries 'people', everything else is MovieSelf.
    """

    def __init__(self):
        # Per-file line counters (were class attributes; instance state is safer).
        self.count = 0  # MovieInfo lines written
        self.c2 = 0     # MovieContent lines written
        self.c = 0      # MovieSelf lines written
        self.filename = open("douban.json", "w")          # movie details
        self.filename2 = open("doubancomment.json", "w")  # comments
        self.filename3 = open("doubankanguo.json", "w")   # users' watched movies

    def process_item(self, item, spider):
        """Serialize the item to the file matching its kind; return it unchanged."""
        record = dict(item)
        # Fixed: movie info ('type') and comments ('people') were swapped,
        # sending comments to douban.json and movie details to doubancomment.json.
        if 'type' in record:
            self.count += 1
            text = str(self.count) + json.dumps(record, ensure_ascii=False) + '\n'
            self.filename.write(text.encode("utf-8"))
        elif 'people' in record:
            self.c2 += 1
            text = str(self.c2) + json.dumps(record, ensure_ascii=False) + '\n'
            self.filename2.write(text.encode("utf-8"))
        else:
            self.c += 1
            text = str(self.c) + json.dumps(record, ensure_ascii=False) + '\n'
            self.filename3.write(text.encode("utf-8"))
        return item

    def close_spider(self, spider):
        """Close all three output files (filename3 was previously leaked)."""
        self.filename.close()
        self.filename2.close()
        self.filename3.close()