scrapy 豆瓣短评 数据分析 + 中文情感分析 + 可视化 (一)
一、scrapy 爬取 豆瓣短评
本次爬取的是《哪吒之魔童降世》的短评。由于目标页面是静态网页，爬取起来还是蛮简单的。
1、开始地址
https://movie.douban.com/subject/26794435/comments?status=P
爬取的内容
item设置为
class DoubanscrapyItem(scrapy.Item):
    """One Douban short-comment record scraped by DoubanSpider."""

    # Target MySQL table name; read by MysqlPipeline when building the INSERT.
    table = 'douban'

    name = scrapy.Field()         # commenter's user name
    grade = scrapy.Field()        # star-rating title text
    content = scrapy.Field()      # comment body
    time = scrapy.Field()         # comment timestamp
    support_num = scrapy.Field()  # number of "useful" votes
爬取
spider 为
class DoubanSpider(scrapy.Spider):
    """Crawl short comments of Douban subject 26794435 page by page."""

    name = 'douban'
    allowed_domains = ['movie.douban.com']
    start_urls = ['https://movie.douban.com/subject/26794435/comments?status=P']

    def __init__(self, *args, **kwargs):
        # BUG FIX: call Spider.__init__ so scrapy's own initialisation
        # (name/kwargs handling) still runs.
        super().__init__(*args, **kwargs)
        self.start_url = 'https://movie.douban.com/subject/26794435/comments?status=P'
        # Template for follow-up pages; {next} is filled with the relative
        # href taken from the paginator.
        self.next_url = 'https://movie.douban.com/subject/26794435/comments{next}'

    def start_requests(self):
        yield scrapy.Request(self.start_url, callback=self.get_parse)

    def get_parse(self, response):
        """Yield one item per comment on the page, then follow the next page."""
        for comment in response.xpath('//*[@class="comment-item"]'):
            item = DoubanscrapyItem()
            item["name"] = comment.xpath(".//@title").extract_first()
            item["grade"] = comment.xpath('.//*[@class="comment-info"]// span[2]/@title').extract_first()
            item["time"] = comment.xpath('.//*[@class="comment-info"]//*[@class="comment-time "]/@title').extract_first()
            item["content"] = comment.xpath('.//*[@class="short"]/text()').extract_first()
            item["support_num"] = comment.xpath('.//*[@class="votes"]/text()').extract_first()
            yield item
        # BUG FIX: the next-page link was previously extracted inside the
        # per-comment loop, which scheduled the same request once for every
        # comment on the page (the dupefilter masked it). Extract it once
        # per response instead.
        next_page = response.xpath('//*[@id="paginator"]//*[@class="next"]/@href').extract_first()
        if next_page is not None:
            # renamed from `next` to avoid shadowing the builtin
            next_page_url = self.next_url.format(next=next_page)
            yield scrapy.Request(next_page_url, callback=self.get_parse)
数据库的存储
class MysqlPipeline():
    """Item pipeline that inserts each scraped item into a MySQL table."""

    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        # Alternate constructor: pull connection parameters from settings.py.
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
            port=crawler.settings.get('MYSQL_PORT'),
        )

    def open_spider(self, spider):
        # BUG FIX: PyMySQL 1.0+ rejects positional connection arguments
        # (and the old positional order was host/user/password/db, which is
        # easy to get wrong) — pass everything by keyword.
        self.db = pymysql.connect(host=self.host, user=self.user,
                                  password=self.password,
                                  database=self.database,
                                  charset='utf8', port=self.port)
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        self.cursor.close()  # release the cursor before the connection
        self.db.close()

    def process_item(self, item, spider):
        """Insert `item` into the table named by `item.table`; return the item."""
        data = dict(item)
        keys = ', '.join(data.keys())
        placeholders = ', '.join(['%s'] * len(data))
        # Table/column names come from the Item class definition, not from
        # user input, so %-interpolating them here is safe; the values
        # themselves go through the driver's parameterization.
        sql = 'insert into %s (%s) values (%s)' % (item.table, keys, placeholders)
        try:
            self.cursor.execute(sql, tuple(data.values()))
            self.db.commit()
        except pymysql.MySQLError:
            # Roll back so one bad row doesn't wedge the connection.
            self.db.rollback()
            raise
        return item
爬取的数据如下