一、数据爬取
# -*- coding: utf-8 -*-
import pandas as pd
import requests
from bs4 import BeautifulSoup
def get_url_content(url):
    """Fetch one Baidu hot-search movie board page and parse its entries.

    Parameters
    ----------
    url : str
        Full board URL (top.baidu.com movie board with category/country tag).

    Returns
    -------
    list[tuple[str, str, str, str, str, str]]
        One (rank, hot_index, title, genre, actors, synopsis) tuple per
        movie card, with all spaces stripped from each field.

    Raises
    ------
    requests.HTTPError
        If the server responds with a non-2xx status.
    """
    # timeout: a hung connection must not stall the whole crawl loop.
    r = requests.get(url, timeout=10)
    # Fail loudly on HTTP errors instead of silently parsing an error page.
    r.raise_for_status()
    soup_list = BeautifulSoup(r.text, 'lxml').find_all(class_='category-wrap_iQLoo')
    all_content = []
    # enumerate(start=1): the rank badge class is numbered from 1
    # (index_1Ew5p c-index-bg1, c-index-bg2, ...).
    for pos, card in enumerate(soup_list, start=1):
        index = card.find(class_=f'index_1Ew5p c-index-bg{pos}').get_text().replace(' ', '')
        number = card.find(class_='hot-index_1Bl1a').get_text().replace(' ', '')
        name = card.find(class_='c-single-text-ellipsis').get_text().replace(' ', '')
        # The two intro_1l0wp spans are genre then actor list — assumed from
        # the original indexing; TODO confirm against the live page markup.
        type_actor_list = card.find_all(class_='intro_1l0wp')
        type_content = type_actor_list[0].get_text().replace(' ', '')
        actor = type_actor_list[1].get_text().replace(' ', '')
        desc = card.find(class_='c-single-text-ellipsis desc_3CTjT').get_text().replace(' ', '')
        all_content.append((index, number, name, type_content, actor, desc))
    return all_content
# Genre and region filter values understood by the Baidu movie board URL.
type_list = [
    '全部类型', '爱情', '喜剧', '动作', '剧情',
    '科幻', '恐怖', '动画', '惊悚', '犯罪',
]
region_list = [
    '全部地区', '中国大陆', '中国香港',
    '中国台湾', '欧美', '日本', '韩国',
]
# Accumulator for the rows scraped from every genre/region combination.
all_content_df = pd.DataFrame()
for type_ in type_list:
for region in region_list:
base_url = 'https://top.baidu.com/board?platform=pc&tab=movie&tag={"category":{' + type_ + '},"country":{' + region + '}}'
per_url_content = get_url_content(base_url)
per_content_df = pd.DataFrame(per_url_content)
per_content_df.columns = ['排名', '热搜指数', '电影名', '类型', '演员', '简介']
per_content_df['整体类