此篇文章承接(猫眼电影-爬取)。
将电影数据储存到MySQL中后,发现评论人数和票房的数据当中存在汉字,后期不好分析,所以需要将汉字转化为数值。
保险起见,我先将films表的结构和数据复制成了一个新表films_copy,然后在新表中新增了2列:people和box_price。
将数据转化为便于分析的数据,代码如下:
import pymysql
# Multiplier applied to amounts whose text contains '美元'
# (presumably a USD -> CNY exchange rate; value kept from the original script).
_USD_RATE = 6.8
# Magnitude suffixes recognised in each column, checked in this order.
_PEOPLE_UNITS = (('万', 10000),)
_BOX_OFFICE_UNITS = (('万', 10000), ('亿', 100000000))


def _convert_amount(raw, units, usd_rate=None):
    """Turn text such as '1.2万' into an integer; return *raw* unchanged otherwise.

    Args:
        raw: Column value as read from the database.
        units: Sequence of (suffix, factor) pairs; the first suffix found wins.
        usd_rate: When given, amounts containing '美元' are also multiplied by it.

    Returns:
        The scaled integer, or *raw* itself when it is not a string, carries
        none of the suffixes, or its numeric part cannot be parsed.
    """
    if not isinstance(raw, str):
        # e.g. a NULL column arrives as None; leave it for the caller.
        return raw
    for suffix, factor in units:
        if suffix in raw:
            break
    else:
        return raw
    number = raw.replace(suffix, '')
    rate = 1
    if usd_rate is not None and '美元' in raw:
        number = number.replace('美元', '')
        rate = usd_rate
    try:
        # round() rather than int(): a float product can land just below the
        # intended integer, and truncation would then lose 1.
        return round(float(number) * factor * rate)
    except (ValueError, OverflowError):
        return raw


def _fetch_rows():
    """Read (score_hum, box_office) for every row of films_copy.

    Returns an empty tuple (after printing the error) when the query fails.
    """
    # NOTE(review): connection settings and credentials are hard-coded.
    db = pymysql.connect(host='localhost', user='root', passwd='password', db='maoyan', port=3306)
    try:
        cursor = db.cursor()
        cursor.execute("SELECT score_hum,box_office FROM films_copy")
        return cursor.fetchall()
    except pymysql.MySQLError as exc:
        print("something wrong", exc)
        return ()
    finally:
        db.close()


def get_data(rows=None):
    """Build the (converted, original) pairs for the two text columns.

    Args:
        rows: Optional iterable of (score_hum, box_office) pairs. When omitted,
            the rows are read from the films_copy table.

    Returns:
        A tuple ``(data1, data2)``. ``data1`` holds ``(people, score_hum)``
        pairs and ``data2`` holds ``(box_price, box_office)`` pairs, one per
        input row. Values that carry no recognised unit are passed through
        unchanged.
    """
    if rows is None:
        rows = _fetch_rows()
    data1 = []
    data2 = []
    for row in rows:
        score_hum, box_office = row[0], row[1]
        data1.append((_convert_amount(score_hum, _PEOPLE_UNITS), score_hum))
        data2.append((_convert_amount(box_office, _BOX_OFFICE_UNITS, _USD_RATE), box_office))
    return data1, data2
def change_hum(data1):
    """Write the converted review counts into the ``people`` column of films_copy.

    Args:
        data1: Sequence of ``(people, score_hum)`` pairs; every row whose
            ``score_hum`` equals the second item gets ``people`` set to the
            first item.

    Each pair is committed on its own. A failing update is rolled back and
    reported, and the remaining pairs are still processed.
    """
    if not data1:
        # Nothing to write; do not open a connection.
        return
    # Placeholders let the driver quote the values instead of pasting them
    # into the statement text.
    sql1 = "UPDATE films_copy SET people = %s WHERE score_hum = %s"
    # NOTE(review): connection settings and credentials are hard-coded.
    db = pymysql.connect(host='localhost', user='root', passwd='password', db='maoyan', port=3306)
    try:
        cursor = db.cursor()
        for people, score_hum in data1:
            print(people, score_hum)
            try:
                # execute() returns the affected-row count.
                if cursor.execute(sql1, (people, score_hum)):
                    print('Successful')
                db.commit()
            except pymysql.MySQLError as exc:
                db.rollback()
                print('Falied', exc)
    finally:
        db.close()
def change_prices(data2):
    """Write the converted box-office values into the ``box_price`` column of films_copy.

    Args:
        data2: Sequence of ``(box_price, box_office)`` pairs; every row whose
            ``box_office`` equals the second item gets ``box_price`` set to
            the first item.

    Each pair is committed on its own. A failing update is rolled back and
    reported, and the remaining pairs are still processed.
    """
    if not data2:
        # Nothing to write; do not open a connection.
        return
    # Placeholders let the driver quote the values instead of pasting them
    # into the statement text.
    sql2 = "UPDATE films_copy SET box_price = %s WHERE box_office = %s"
    # NOTE(review): connection settings and credentials are hard-coded.
    db = pymysql.connect(host='localhost', user='root', passwd='password', db='maoyan', port=3306)
    try:
        cursor = db.cursor()
        for box_price, box_office in data2:
            try:
                # execute() returns the affected-row count.
                if cursor.execute(sql2, (box_price, box_office)):
                    print('Successful')
                db.commit()
            except pymysql.MySQLError as exc:
                db.rollback()
                print('Falied', exc)
    finally:
        db.close()
def main():
    """Convert both text columns and write the numeric values back."""
    # Call get_data() once: each call queries and converts the whole table.
    data1, data2 = get_data()
    change_hum(data1)
    change_prices(data2)
# Compare the module's __name__ variable, not the string literal '__name__',
# so that main() actually runs when the file is executed as a script.
if __name__ == '__main__':
    main()
现在开始分析数据:(按照公众号作者的代码操作,一直没有出现作者文章中的效果,所以找解决方法找了很久)
1,2018年电影评分TOP10
from pyecharts import Bar
import pandas as pd
import numpy as np
import pymysql
co