# 使用pyspider爬取豆瓣评论 (Crawl Douban movie comments with pyspider)
# -*- encoding: utf-8 -*-
# Created on 2018-12-04 20:30:39
# Project: douban
from pyspider.libs.base_handler import *
import MySQLdb
import sys
# Python 2 idiom: reload the sys module, then set the interpreter-wide
# default string encoding to UTF-8.
# NOTE(review): presumably needed so the Chinese text scraped below converts
# implicitly between str and unicode -- confirm. `reload` is not a builtin
# on Python 3, so these two lines only run on Python 2.
reload(sys)
sys.setdefaultencoding('utf8')
class Handler(BaseHandler):
    """pyspider handler that stores Douban Top 250 short comments in MySQL.

    Flow: ``on_start`` queues the ten Top 250 list pages, ``index_page``
    queues ten comment pages for every movie linked from a list page, and
    ``detail_page`` writes each comment found on a comment page to the
    ``video1`` table.
    """

    crawl_config = {
    }

    # Browser-like User-Agent sent with every request issued by this handler.
    _HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/63.0'}

    # NOTE(review): database credentials are hard-coded in source; consider
    # loading them from configuration that is kept out of version control.
    _DB_SETTINGS = {
        'host': '127.0.0.1',
        'user': 'root',
        'passwd': 'zgx675050748',
        'db': 'python',
        'charset': 'utf8',
    }

    # Placeholders are filled in by the driver, which escapes the values, so
    # quotes inside a comment can neither break nor alter the statement.
    _INSERT_SQL = "INSERT INTO video1(电影名,评论) VALUES (%s, %s)"

    @every(minutes=24 * 60)
    def on_start(self):
        """Queue the Top 250 list pages (offsets 0, 25, ..., 225)."""
        for offset in range(0, 250, 25):
            self.crawl(
                'https://movie.douban.com/top250?start=' + str(offset) + '&filter=',
                callback=self.index_page,
                validate_cert=False,
                # Pass a copy so the shared class-level dict is never mutated.
                headers=dict(self._HEADERS),
            )

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue comment pages (offsets 0, 20, ..., 180) for each movie link.

        Args:
            response: pyspider response for one Top 250 list page.
        """
        for link in response.doc('.item .pic a').items():
            href = link.attr.href
            if not href:
                # An anchor without href cannot be turned into a comments URL.
                continue
            for offset in range(0, 200, 20):
                self.crawl(
                    href + 'comments?start=' + str(offset) + '&limit=20&sort=new_score&status=P',
                    callback=self.detail_page,
                    headers=dict(self._HEADERS),
                    validate_cert=False,
                )

    @config(priority=2)
    def detail_page(self, response):
        """Insert the short comments of one comment page into ``video1``.

        The movie title is the text of the last ``#content h1`` element, and
        every ``.short`` element with non-empty text becomes one row. Titles
        and comments are printed as they are found. When the page yields no
        title or no comments, nothing is written and no database connection
        is opened.

        Args:
            response: pyspider response for one comments page.

        Raises:
            Whatever MySQLdb raises when connecting or inserting; the
            connection is closed before the exception propagates.
        """
        # `.text` is read as a plain attribute of each iterated element,
        # exactly as the original code did.
        title = None
        for heading in response.doc('#content h1'):
            print(heading.text)
            title = heading.text

        rows = []
        for comment in response.doc('.short'):
            print(comment.text)
            # Skip rows that would have no title or an empty comment body.
            if title is not None and comment.text:
                rows.append((title, comment.text))

        if not rows:
            return

        db = MySQLdb.connect(**self._DB_SETTINGS)
        try:
            cursor = db.cursor()
            cursor.executemany(self._INSERT_SQL, rows)
            # Single commit: the page's comments are stored together.
            db.commit()
        finally:
            # Always release the connection, even when an insert fails.
            db.close()