# Requirement: crawl the Douban Top 250 with requests, clean the data, and
# later build bar charts, line charts, etc.
# Fields scraped per movie: Chinese/English title, release year, rating,
# vote count, detail-page link, poster link, info line and director name.
from bs4 import BeautifulSoup
import requests
import re
import openpyxl
import time


def get_data():
    """Crawl all 10 pages of the Douban Top 250 and append one row per movie to data.xlsx."""
    # Create the workbook once with a header row before crawling starts.
    wb = openpyxl.Workbook()
    sheet = wb.active
    sheet.append(['中文标题', '英文标题', '发行时间', '电影评分', '评价人数',
                  '影片详细链接', '图片链接', '影片概况', '导演名称'])
    wb.save('data.xlsx')
    Headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; an-ES; rv:1.9.0.20) Gecko/9663-05-06 15:44:33 Firefox/3.8'
    }
    for m in range(0, 10):
        print(f'正在爬取第{m+1}页内容')
        # Each page lists 25 movies; `start` is the offset of the first one.
        nus = m * 25
        URL = f"https://movie.douban.com/top250?start={nus}&filter="
        response = requests.get(url=URL, headers=Headers)
        if response.status_code != 200:
            print(f"爬取失败,状态码是{response.status_code}")
        else:
            response.encoding = 'utf-8'
            soup = BeautifulSoup(response.text, 'html.parser')
            label_numbers = soup.select('#content>div>div.article>ol>li')
            for label_number in range(1, len(label_numbers) + 1):
                # Common selector prefix for the n-th movie entry on the page.
                a1234 = f'#content > div > div.article > ol > li:nth-child({label_number}) '
                # Chinese title (first <span> of the heading link).
                label1 = soup.select(a1234 + '> div > div.info > div.hd > a > span:nth-child(1)')
                # English title (second <span>); drop the leading "/" separator.
                # Guard: some entries may lack a second title span.
                label2 = soup.select(a1234 + '> div > div.info > div.hd > a > span:nth-child(2)')
                english_label = label2[0].text.replace("/", "").strip() if label2 else ''
                # The first <p> mixes director, cast, year, region and genres;
                # only the 4-digit year can be cut out reliably.
                label4 = soup.select(a1234 + '> div > div.info > div.bd > p:nth-child(1)')
                years = re.findall(r'\d{4}', label4[0].text)
                start_time = years[0] if years else ''  # guard against entries with no year
                # Rating.
                label5 = soup.select(a1234 + '> div > div.info > div.bd > div > span.rating_num')
                # Vote count, e.g. "123456人评价" -> "123456".
                # BUGFIX: str.rstrip('人评价') strips a *set* of trailing characters,
                # not the literal suffix — use an anchored regex substitution instead.
                label6 = soup.select(a1234 + '> div > div.info > div.bd > div > span:nth-child(4)')
                man_number = re.sub(r'人评价$', '', label6[0].text)
                # Detail-page link.
                label7 = soup.select(a1234 + '> div > div.info > div.hd > a')
                # Poster image link.
                label8 = soup.select(a1234 + '> div > div.pic > a > img')
                # Full info line — same node as label4, so reuse it instead of
                # running the identical selector a second time.
                label10 = label4
                # Extract the director's Chinese name from the info line.
                pattern = r'导演: ([\u4e00-\u9fa5]+)'
                directors = re.findall(pattern, label10[0].text)
                # '空' is the placeholder the cleaning step later uses to drop rows.
                director = directors[0] if directors else '空'
                # Persist this movie's row.
                write_excel(label1[0].text, english_label, start_time,
                            label5[0].text, man_number, label7[0]['href'],
                            label8[0]['src'], label10[0].text, director)
        # Be polite to the server between page requests.
        time.sleep(1)


def write_excel(a, b, c, d, e, f, g, h, i):
    """Append one row of movie fields to data.xlsx and save immediately.

    Reopening the workbook per row is slow, but it persists every row even
    if the crawl is interrupted partway through.
    """
    wb = openpyxl.load_workbook('data.xlsx')
    sheet = wb.active
    sheet.append([a, b, c, d, e, f, g, h, i])
    wb.save('data.xlsx')


if __name__ == '__main__':
    get_data()
import openpyxl
import pandas as pd
import matplotlib.pyplot as plt

# --- Clean the raw crawl output (data.xlsx) into clean.xlsx ---
wb = openpyxl.load_workbook('data.xlsx')
worksheet = wb.active
max_row = worksheet.max_row  # real number of rows in the sheet


def get_max_column_with_data(worksheet):
    """Return the index of the right-most column containing any data."""
    max_column = 0
    for col in worksheet.columns:
        if any(cell.value for cell in col):
            max_column = col[0].column
    return max_column


max_column = get_max_column_with_data(worksheet)

# Column D holds the ratings; drop the header cell, then parse to float.
dates = [cell.value for cell in worksheet['D']]
dates.pop(0)  # positional drop of the '电影评分' header (clearer than remove-by-value)
dotes = [float(value) for value in dates]

# Collect the rows whose director was recorded as the placeholder '空',
# then delete them bottom-up so deletions don't shift the remaining rows.
rows_to_delete = []
# BUGFIX: the scan was hard-coded to max_row=251; use the sheet's real extent.
for row in worksheet.iter_rows(min_row=1, max_row=max_row, min_col=1, max_col=9):
    for cell in row:
        if cell.value == '空':
            rows_to_delete.append(cell.row)
            break
for row_number in reversed(rows_to_delete):
    worksheet.delete_rows(row_number)
wb.save('clean.xlsx')
print('经过爬行策略知,当‘导演’为则填入‘空’,删除excel中有‘空’的横行')
print('保存为clean.xlsx')

# Quick sanity report on the cleaned data.
df1 = pd.read_excel("clean.xlsx", sheet_name='Sheet', usecols=[0, 2, 3, 4])
print(' 数据的最大值最小值平均数方差记录数等')
print(df1.describe())
print('数据的空值的数量')
print(df1.isnull().sum())


class MyClass:
    """Draws the five charts for the cleaned Top-250 data (clean.xlsx)."""

    def __init__(self):
        self.value = 0

    def turtle1(self):
        """Bar chart 1: ratings of the leading movies."""
        plt.rcParams['font.sans-serif'] = 'SimHei'
        df = pd.read_excel('clean.xlsx', sheet_name='Sheet')
        # NOTE(review): the title says "top ten" but head(11) keeps 11 rows —
        # confirm whether 10 was intended before changing the chart.
        mq = df.head(11)
        x = mq['中文标题']
        y = mq['电影评分']
        plt.figure(figsize=(10, 3), dpi=150)
        plt.title('top250前十电影评分202209446')
        plt.xlabel('中文标题')
        plt.ylabel('电影评分')
        plt.bar(x, y, width=0.6)
        for a, b in zip(x, y):
            plt.text(x=a, y=b, s=b, color='black', fontsize=8, ha='center')
        plt.xticks(range(len(x))[::2])  # label every other bar to avoid overlap
        plt.legend(['top250评分202209446'])
        plt.show()

    def turtle2(self):
        """Bar chart 2: hand-counted number of works per director."""
        plt.rcParams['font.sans-serif'] = 'SimHei'
        x = ['陈凯歌', '宫崎骏', '克里斯托夫', '加布里尔']
        y = [1, 8, 1, 1]
        plt.figure(figsize=(10, 3), dpi=150)
        plt.title('202209446导演和作品数量')
        plt.xlabel('导演')
        plt.ylabel('数量')
        plt.bar(x, y, width=0.6)
        for a, b in zip(x, y):
            plt.text(x=a, y=b, s=b, color='black', fontsize=6, ha='center')
        plt.xticks(range(len(x)))
        plt.legend(['导演—数量202209446'])
        plt.show()

    def turtle3(self):
        """Trend (stacked-area) chart of the top-10 ratings with a data table."""
        plt.rcParams['font.sans-serif'] = 'SimHei'
        df = pd.read_excel('clean.xlsx', sheet_name='Sheet')
        mq = df.head(10)
        x = mq['中文标题']
        y = mq['电影评分']
        plt.figure(figsize=(5, 3), dpi=150)
        plt.title('202209446前十的电影评分')
        plt.stackplot(x, y)
        plt.xticks([])  # titles go into the table instead of the axis
        cellText = mq[['中文标题']].T.values.tolist()
        rowLabels = ['中文标题']
        colLabels = mq['电影评分'].values.tolist()
        plt.table(cellText=cellText, rowLabels=rowLabels, colLabels=colLabels)
        plt.show()

    def turtle4(self):
        """Pie chart: rating share of the top-10 movies."""
        plt.rcParams['font.sans-serif'] = 'SimHei'
        df = pd.read_excel('clean.xlsx', sheet_name='Sheet')
        plt.figure(figsize=(5, 3), dpi=150)
        mq = df.head(10)
        plt.pie(mq['电影评分'], labels=mq['中文标题'])
        plt.title('202209446gxj前十名对应评分占比情况')
        plt.show()

    def turtle5(self):
        """Scatter plot: ratings of the first eight movies."""
        plt.rcParams['font.sans-serif'] = 'SimHei'
        df = pd.read_excel('clean.xlsx', sheet_name='Sheet')
        mq = df.head(8)
        x = mq['中文标题']
        y = mq['电影评分']
        plt.figure(figsize=(5, 3), dpi=150)
        plt.xlabel('guoxignjia202209446')
        plt.ylabel('电影评分')
        plt.xticks(fontsize=6)
        plt.scatter(x, y)
        for a, b in zip(x, y):
            plt.text(x=a, y=b, s=b, color='black', fontsize=8, ha='center')
        plt.show()


if __name__ == '__main__':
    my_instance = MyClass()
    my_instance.turtle1()
    my_instance.turtle2()
    my_instance.turtle3()
    my_instance.turtle4()
    my_instance.turtle5()
# Reposted from: 刘二爱喝胡辣汤 (original author attribution)