注意:仅供参考,要是用来交作业的话,部分内容得改一下。
db.py:
import json
from urllib.parse import parse_qs, urlparse

import scrapy

from ..items import DoubanItem
class DbSpider(scrapy.Spider):
    """Crawl the first pages of the Douban Top-250 list and yield one
    record per movie: name, director line, score and one-line quote."""

    name = "db"
    # allowed_domains entries must be bare domain names. The original value
    # "https://movie.douban.com" (with scheme) never matches any request, so
    # OffsiteMiddleware would drop every follow-up page request.
    allowed_domains = ["movie.douban.com"]
    start_urls = ["https://movie.douban.com/top250"]
    # Number of pages to crawl: start=0, 25, 50 (25 movies per page).
    max_pages = 3

    def parse(self, response):
        """Extract every movie on this page, then queue the next page."""
        for node in response.xpath('//div[@class = "info"]'):
            # .get() returns None when the xpath matches nothing (a few
            # movies have no quote), so guard before calling .strip().
            director = node.xpath('.//div[@class = "bd"]/p/text()').get()
            item = {
                'movie_name': node.xpath('.//div[@class = "hd"]/a/span/text()').get(),
                'director': director.strip() if director else None,
                'score': node.xpath('.//span[@class = "rating_num"]/text()').get(),
                'description': node.xpath('.//p[@class = "quote"]/span/text()').get(),
            }
            yield item

        # The original used a local page counter that reset to 0 on every
        # parse() call, so it always requested start=25 and pagination
        # stalled after the duplicate filter kicked in. Derive the current
        # offset from the response URL instead.
        query = parse_qs(urlparse(response.url).query)
        current_start = int(query.get('start', ['0'])[0])
        next_start = current_start + 25
        if next_start < self.max_pages * 25:
            page_url = "https://movie.douban.com/top250?start={}&filter=".format(next_start)
            yield scrapy.Request(page_url, callback=self.parse)
# https://movie.douban.com/top250?start=25&filter=
# https://movie.douban.com/top250?start=50&filter=
# https://movie.douban.com/top250?start=75&filter=
items.py:
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class DoubanItem(scrapy.Item):
    """Item schema for one Douban Top-250 movie."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    movie_name = scrapy.Field()
    director = scrapy.Field()
    score = scrapy.Field()
    # The spider yields the quote under the key 'description' and the MySQL
    # import script also reads 'description' — assigning that key to an item
    # with only `desc` would raise KeyError. Add the matching field and keep
    # the old `desc` name for backward compatibility.
    description = scrapy.Field()
    desc = scrapy.Field()
# https://www.douban.com/mdrama/rank?t=1&p=1
# https://www.douban.com/mdrama/rank?t=1&p=2
# https://www.douban.com/mdrama/rank?t=1&p=3
main.py:
import os.path
import sys
from scrapy.cmdline import execute

# Make the project directory importable, then launch the "db" spider
# exactly as `scrapy crawl db` would from the command line.
project_dir = os.path.dirname(os.path.abspath(__file__))
# print(project_dir)
sys.path.append(project_dir)
execute(["scrapy", "crawl", "db"])
pipelines.py:
import pymysql
import json
class DoubanPipeline:
    """Append every scraped item to maoer1.json, one JSON object per line."""

    def open_spider(self, spider):
        # Single output file for the whole crawl; closed in close_spider.
        self.f = open('maoer1.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # ensure_ascii=False keeps the Chinese text human-readable on disk.
        self.f.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.f.close()
转存至MySQL中:
import mysql.connector
import json

# Copy the crawl results from maoer1.json into the `spider10` table.
conn = mysql.connector.connect(
    host="127.0.0.1",
    user="root",
    password="010208",  # NOTE(review): credentials belong in config/env, not source
    database="spider",
    port=3306,
    charset="utf8",
)
cursor = conn.cursor()
try:
    # Parameterized statement, hoisted out of the loop.
    sql = "INSERT INTO spider10 (description,movie_name,director,score) VALUES (%s,%s,%s,%s)"
    # DoubanPipeline writes one JSON object PER LINE (JSON Lines), not a
    # single JSON array — json.load() on the whole file would raise
    # "Extra data". Parse line by line instead. encoding='utf-8' matches
    # how the pipeline wrote the file.
    with open('maoer1.json', 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue  # tolerate blank lines
            entry = json.loads(line)
            cursor.execute(sql, (
                entry.get('description', ''),  # '' when the field is absent
                entry.get('movie_name', ''),
                entry.get('director', ''),
                entry.get('score', ''),
            ))
    conn.commit()  # one commit for the whole batch
finally:
    # Always release the cursor and connection, even if an insert fails.
    cursor.close()
    conn.close()
可视化:
import json
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from wordcloud import WordCloud
import jieba

# Load the JSON-Lines file written by DoubanPipeline: one movie per line.
with open('./maoer1.json', 'r', encoding='utf-8') as file:
    movies = [json.loads(line) for line in file if line.strip()]
df = pd.DataFrame(movies)

# matplotlib cannot render CJK glyphs with its default font, so register a
# bundled Chinese font and use it for every label.
font_path = './NotoSerifSC-Black.ttf'
font_prop = FontProperties(fname=font_path)
plt.rcParams['font.family'] = font_prop.get_name()

# Horizontal bar chart of the scores, best-ranked movie at the top.
plt.figure(figsize=(10, 6))
plt.barh(df['movie_name'], df['score'].astype(float), color='skyblue')
plt.xlabel('评分', fontproperties=font_prop)
plt.ylabel('电影名称', fontproperties=font_prop)
plt.title('电影评分柱状图', fontproperties=font_prop)
plt.yticks(fontproperties=font_prop)
plt.gca().invert_yaxis()  # rank 1 on top instead of at the bottom
plt.show()

# Word cloud built from the one-line quotes. Some movies have no quote
# (the spider yields None), and ' '.join() raises TypeError on non-string
# elements — drop missing values first.
text = ' '.join(df['description'].dropna())
wordlist = jieba.cut(text, cut_all=False)  # segment Chinese into words
wl_space_split = " ".join(wordlist)
wordcloud = WordCloud(
    font_path=font_path,
    width=800,
    height=400,
    background_color='white'
).generate(wl_space_split)
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('电影描述词云', fontproperties=font_prop)
plt.show()
搞定,这三篇结合在一起就是我的期末作业了(我早就考了,成绩还是可以滴),感兴趣的朋友可以看看,多多指教。其实我也知道有更简单的方法,但是,懒癌犯了,不想写。心血来潮写了这么多,就这样吧。
关于此篇文章,如果有疑问,欢迎随时交流学习,只要我在线,肯定回。