如果爬取豆瓣电影时,IP被封,就连接其他wifi爬,比如手机热点。或者可以使用付费代理IP。
爬取豆瓣电影中:华语、欧美、韩国、日本电影每个标签下按评价排序的全部电影。需要如下信息:
(1)每个电影的电影名、导演、编剧、主演、类型、国家、上映日期、片长,电影评分,以及每个星级评分的百分比数据。
(2)每个电影热门点评中的前100个评分及其评分人。
(3)进入每个评分人的主页,爬取其看过的电影信息,以及对电影的评分。(少于300部则全部爬取,多于300部则仅爬取前300个)
用Scrapy框架写
items.py文件
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class MovieInfo(scrapy.Item):
    """One movie's detail-page record: title, credits, metadata and rating stats."""
    name = scrapy.Field()       # movie title
    director = scrapy.Field()   # director name
    adaptor = scrapy.Field()    # screenwriter(s)
    rolename = scrapy.Field()   # starring actors
    type = scrapy.Field()       # genre list
    country = scrapy.Field()    # production country/region
    date = scrapy.Field()       # release date(s)
    length = scrapy.Field()     # runtime
    grade = scrapy.Field()      # overall rating
    startrate = scrapy.Field()  # per-star rating percentages
    url = scrapy.Field()        # detail-page URL
class MovieContent(scrapy.Item):
    """One hot comment on a movie: page title, commenter name, and their rating."""
    title = scrapy.Field()   # movie page title
    people = scrapy.Field()  # commenter's display name
    grade = scrapy.Field()   # star-rating title text, or 'nothing' if unrated
class MovieSelf(scrapy.Item):
    """A commenter's profile data: the movies they have watched and their grades."""
    user = scrapy.Field()       # profile page title (user identity)
    movie = scrapy.Field()      # titles of movies the user has watched
    selfgrade = scrapy.Field()  # user's own grade for each movie
douban.py文件
# -*- coding: utf-8 -*-
import scrapy
import json
from NewDouBan.items import MovieInfo,MovieContent,MovieSelf
from urllib import urlencode
import time
class NewDouBan(scrapy.Spider):
    """Crawl Douban movies: detail pages, hot comments, and each commenter's watched list.

    Flow: listing JSON endpoint -> parse -> per-movie detail page (parse_item)
    -> comment page (parse_item2) -> commenter profile (parse_item3).
    """
    name = 'douban'
    allowed_domains = ['douban.com']
    # Query for the rank-sorted movie listing JSON endpoint.
    data = {
        'type': 'movie',
        'tag': '华语',
        'sort': 'rank',
        'page_limit': 20,
        'page_start': 0
    }
    start_urls = ['https://movie.douban.com/j/search_subjects?' + urlencode(data)]

    def parse(self, response):
        """Parse the listing endpoint; follow every movie and the next page."""
        subjects = json.loads(response.text)["subjects"]
        for subject in subjects:
            yield scrapy.Request(subject['url'], callback=self.parse_item)
        # The endpoint answers an empty "subjects" list past the last page, so
        # keep paging while results come back (original only fetched page 0).
        if subjects:
            next_start = response.meta.get('page_start', 0) + self.data['page_limit']
            query = dict(self.data, page_start=next_start)
            yield scrapy.Request(
                'https://movie.douban.com/j/search_subjects?' + urlencode(query),
                meta={'page_start': next_start},
                callback=self.parse)

    def parse_item(self, response):
        """Extract one movie's detail page into a MovieInfo item, then follow its comments."""
        info = "//div[@id='info']"
        item = MovieInfo()
        # extract_first('') avoids IndexError when a node is missing.
        item['name'] = response.xpath("//h1/span[@property]/text()").extract_first('')
        item['rolename'] = response.xpath(info + "//span[@class]/a[@rel='v:starring']/text()").extract()
        item['director'] = response.xpath(info + "//span[@class='attrs']/a[@rel]/text()").extract_first('')
        item['type'] = response.xpath(info + "//span[@property='v:genre']/text()").extract()
        # NOTE(review): country has no property-tagged span; taking the text node
        # after the "制片国家/地区:" label — verify against the live page layout.
        item['country'] = response.xpath(
            info + u"//span[text()='制片国家/地区:']/following-sibling::text()[1]").extract_first('').strip()
        item['date'] = response.xpath(info + "//span[@property='v:initialReleaseDate']/text()").extract()
        item['length'] = response.xpath(info + "//span[@property='v:runtime']/text()").extract()
        item['grade'] = response.xpath("//strong/text()").extract()
        raw_rates = response.xpath("//div[@class='ratings-on-weight']/div/span/text()").extract()
        # Strip layout whitespace from the per-star percentage strings.
        item['startrate'] = [r.replace("\n", "").replace(" ", "") for r in raw_rates]
        item['url'] = response.url
        yield item  # was commented out: MovieInfo never reached the pipeline
        yield scrapy.Request(item['url'] + "comments?status=P", callback=self.parse_item2)

    def parse_item2(self, response):
        """Extract hot comments (one MovieContent per comment) and follow each commenter."""
        title = response.xpath("//title/text()").extract_first('')
        # Iterate per comment-item instead of once over the whole #comments
        # container, which only ever produced the first comment.
        for comment in response.xpath("//div[@id='comments']//div[@class='comment-item']"):
            item2 = MovieContent()
            item2['title'] = title
            item2['people'] = comment.xpath(".//span[@class='comment-info']//a/text()").extract_first('')
            grade = comment.xpath(".//span[@class='comment-info']//span[2]/@title").extract_first('')
            # An unrated comment carries a date ("2018-01-01") in this slot.
            item2['grade'] = 'nothing' if grade.find("-") > 0 else grade
            yield item2  # was commented out: comments never reached the pipeline
        # Follow each commenter's profile straight from the extracted hrefs
        # (replaces the fragile str(list)/split/replace round-trip).
        for user_url in response.xpath("//div[@id='comments']//div[@class='avatar']/a/@href").extract():
            yield scrapy.Request(user_url, callback=self.parse_item3)

    def parse_item3(self, response):
        """Extract a commenter's watched-movies list into a MovieSelf item."""
        item3 = MovieSelf()
        item3['user'] = response.xpath("//title/text()").extract()
        # NOTE(review): per-movie grades are not extracted yet; placeholder kept
        # until the profile-page rating markup is confirmed.
        item3['selfgrade'] = 'nothing'
        item3['movie'] = response.xpath("//div[@id='movie']//div[@class='obssin'][2]/ul/li/a/@title").extract()
        yield item3
pipelines.py 文件(管道文件)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
class NewdoubanPipeline(object):
    """Route each item kind to its own JSON-lines file.

    Item kind is detected by a distinguishing field: MovieInfo carries 'type',
    MovieContent carries 'people', everything else is MovieSelf.
    """

    def __init__(self):
        # Per-file line counters (were class attributes; instance state is safer).
        self.count = 0  # MovieInfo lines written
        self.c2 = 0     # MovieContent lines written
        self.c = 0      # MovieSelf lines written
        self.filename = open("douban.json", "w")          # movie details
        self.filename2 = open("doubancomment.json", "w")  # comments
        self.filename3 = open("doubankanguo.json", "w")   # users' watched movies

    def process_item(self, item, spider):
        """Serialize the item to the file matching its kind; return it unchanged."""
        record = dict(item)
        # Fixed: movie info ('type') and comments ('people') were swapped,
        # sending comments to douban.json and movie details to doubancomment.json.
        if 'type' in record:
            self.count += 1
            text = str(self.count) + json.dumps(record, ensure_ascii=False) + '\n'
            self.filename.write(text.encode("utf-8"))
        elif 'people' in record:
            self.c2 += 1
            text = str(self.c2) + json.dumps(record, ensure_ascii=False) + '\n'
            self.filename2.write(text.encode("utf-8"))
        else:
            self.c += 1
            text = str(self.c) + json.dumps(record, ensure_ascii=False) + '\n'
            self.filename3.write(text.encode("utf-8"))
        return item

    def close_spider(self, spider):
        """Close all three output files (filename3 was previously leaked)."""
        self.filename.close()
        self.filename2.close()
        self.filename3.close()