之前爬取电影信息的时候,将电影短评的url一并存起来了。
因此爬取电影短评的时候只需将数据库中存在的url 放入start_urls中就好了。
spider.py
# -*- coding: utf-8 -*-
from scrapy.selector import Selector
from scrapy.spiders import Spider
from scrapy.http import Request ,FormRequest
from comments.items import CommentsItem
import scrapy
from scrapy import log
import MySQLdb
class CommentSpider(Spider):
    """Spider that crawls Douban movie short-comment pages.

    The comment-page URLs were saved to the ``doubanmovie`` table
    (column ``comment_url``) while the movie details were crawled,
    so ``start_urls`` is loaded from the database instead of being
    hard-coded.
    """
    name = "comments"
    # allowed_domains = ["movie.douban.com"]

    db = MySQLdb.connect("localhost", "root", "123456", "python")
    cursor = db.cursor()
    # The comment-page links were already stored when the movie info
    # was crawled; read them back to seed start_urls.
    cursor.execute("select comment_url from doubanmovie")
    # fetchall() returns a tuple of 1-tuples, e.g. (('http://...',), ...).
    # Scrapy needs start_urls to be a list of url STRINGS; assigning the
    # raw rows raises "Request url must be str or unicode, got tuple".
    # Unpack the single column of every row.
    start_urls = [row[0] for row in cursor.fetchall()]

    def parse(self, response):
        """Extract reviewer name and rating from one comments page,
        emit one CommentsItem per comment, then follow the 'next'
        pagination link."""
        sel = Selector(text=response.body)
        page_url = response.url
        # Keep everything up to and including the 'comments' segment,
        # e.g. https://movie.douban.com/subject/1292052/comments
        start_index = page_url.find('comments')
        base_url = page_url[0:start_index + 8]
        # The digits left in the base url are the Douban movie id.
        # ''.join over a generator works on both Python 2 and 3;
        # the original filter(str.isdigit, URL) returns a lazy filter
        # object on Python 3 instead of a string.
        movie_id = ''.join(ch for ch in base_url if ch.isdigit())
        comments = sel.xpath('//*[@class="comment-info"]')
        for comment in comments:
            item = CommentsItem()
            item['ID'] = movie_id
            item['user_name'] = comment.xpath('a/text()').extract()
            item['user_score'] = comment.xpath('span[1]/@title').extract()
            yield item
        # The 'next' anchor holds a relative query string such as
        # '?start=20&...'; join it onto the comments base url.
        for next_href in sel.xpath("//*[@class='next']/@href").extract():
            yield Request(base_url + next_href, callback=self.parse)
运行的时候出错,
显示 TypeError("Request url must be str or unicode, got %s:" % type(url).__name__)。
后来发现是因为从数据库中取数据时,cursor.fetchall() 返回的 data 是由元组组成的元组(每行一个单元素元组)。
直接写 start_urls = data 不合适:在 Scrapy 中 start_urls 应该是一个 list,且其中的元素必须是 url 字符串。
于是添加了一些代码:
temp = list(data)
start_urls = []