前言
有个小需求,要生成一个excel,自定义表头那种,忽然想起原来写的扒拉豆瓣电影的脚本。贴上,一看就懂。今天试了一下,发现原来的脚本需要加上header,设置User-Agent,否则读出来页面是空。
用的是 pandas。除了安装代码开头导入的三个模块(requests、beautifulsoup4、pandas),还需要 install openpyxl(pandas 写 xlsx 文件必需)、lxml(我的代码用它做解析器,必需)。
代码
import requests as rq
from bs4 import BeautifulSoup
import pandas as pd
# HTTP headers passed to every request made by get_data().
# A desktop-browser User-Agent is supplied; per the author's note the site
# returns an empty page when it is missing.
header = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/80.0.3987.132 Safari/537.36"
    ),
}
def get_data(url):
    """Download one listing page and record every movie found on it.

    The page is fetched with the module-level ``header`` dict, parsed with
    BeautifulSoup's ``lxml`` parser, and each ``<div class="item">`` element
    contributes one entry to each of the module-level lists ``numbers``,
    ``names``, ``rating_nums`` and ``imgUrls``. The extracted values are
    also echoed to stdout.

    Args:
        url: Address of the listing page to scrape.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status code.
        requests.exceptions.Timeout: If the server does not respond within
            the timeout.
        AttributeError: If an item lacks one of the expected child elements.
    """
    print(url)
    # Without a timeout a stalled connection would block the script forever.
    response = rq.get(url, headers=header, timeout=10)
    # Fail loudly on an error status instead of parsing an error page and
    # silently producing no rows.
    response.raise_for_status()
    html = response.content.decode("utf-8")
    soup = BeautifulSoup(html, "lxml")

    for item in soup.find_all("div", "item"):
        # Extract all fields before appending so that a malformed item cannot
        # leave the four lists with different lengths.
        name = item.find('span', {'class': 'title'}).text
        rating_num = item.find('span', {'class': 'rating_num'}).text
        number = item.find('em').text
        imgUrl = item.find("img").attrs['src']

        print("----------->")
        print("number:" + number)
        numbers.append(number)
        print("name:" + name)
        names.append(name)
        print("rating_num:" + rating_num)
        rating_nums.append(rating_num)
        print("imgUrl:" + imgUrl)
        imgUrls.append(imgUrl)
        print("----------->")
# Accumulators filled by get_data(); index i of each list describes the same
# movie, so they must stay the same length.
numbers = []
names = []
rating_nums = []
imgUrls = []

# Request ten pages, advancing the "start" query parameter by 25 each time
# (start = 0, 25, ..., 225).
for num in range(10):
    get_data("https://movie.douban.com/top250?start={}&filter=".format(num * 25))

# Assemble the table; the keys become the custom column headers in the sheet.
data_df = pd.DataFrame()
data_df["序列"] = numbers
data_df["名称"] = names
data_df["评分"] = rating_nums
data_df["海报"] = imgUrls

# Using ExcelWriter as a context manager saves and closes the workbook on
# exit; the explicit writer.save() call was removed in pandas 2.0.
with pd.ExcelWriter('movie_top250.xlsx') as writer:
    # float_format controls the precision used for floating-point cells.
    data_df.to_excel(writer, sheet_name='page_1', float_format='%.5f')