python爬虫7篇实例,分七个文章进行发布;第四篇:爬取豆瓣电影。
- 先上完整代码: 代码后面进行一步步解析。
import requests
from bs4 import BeautifulSoup
# Douban Top-250 is paginated 25 movies per page; `start` is the page offset.
# Build the 10 page URLs with a comprehension instead of a manual append loop.
urls = [
    f'https://movie.douban.com/top250?start={i}&filter='
    for i in range(0, 250, 25)
]

# Browser-like request headers so the server does not reject the scraper.
# NOTE: fixed typo 'gizp' -> 'gzip' so the Accept-Encoding value is valid.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    'Accept-Encoding': 'gzip,deflate',
    'Accept': '*/*',
}
def parse_single_html(html):
    """Extract the 25 movie entries from one Top-250 list page.

    html: a requests Response whose ``.text`` holds the page HTML.
    Returns a list of dicts with keys rank / title / rating_num /
    rating_star / comments (all values are strings).
    """
    soup = BeautifulSoup(html.text, 'html.parser')
    items = (soup.find('div', class_='article')
                 .find('ol', class_='grid_view')
                 .find_all('div', class_='item'))
    movies = []
    for item in items:
        hd_div = item.find('div', class_='info').find('div', class_='hd')
        star_spans = (item.find('div', class_='bd')
                          .find('div', class_='star')
                          .find_all('span'))
        # span[0] carries a class like "rating5-t"; stripping the wrapper
        # text leaves just the star count digit.
        star_class = star_spans[0]['class'][0]
        movies.append({
            "rank": item.find('div', class_='pic').find('em').get_text(),
            "title": hd_div.find('span', class_='title').get_text(),
            "rating_num": star_spans[1].get_text(),
            "rating_star": star_class.replace('rating', '').replace('-t', ''),
            "comments": star_spans[3].get_text().replace('人评价', ''),
        })
    return movies
# Accumulate the movie dicts from all 10 pages.
data = []
for url in urls:  # iterate the urls directly instead of indexing range(10)
    r = requests.get(url, headers=headers)
    if r.status_code != 200:
        # Skip pages that failed to download instead of parsing an error page.
        print('error!')
        continue
    # Parse each response exactly once (the original parsed twice: once for
    # the debug print and once for collection).
    page_data = parse_single_html(r)
    print(page_data)
    print('*' * 100)
    data.extend(page_data)
print(data)

import pandas as pd

# Write every scraped record to an Excel workbook.
df = pd.DataFrame(data)
df.to_excel('电影数据.xlsx')
1、引入包、构建链接列表,并创建请求头(headers)
- 这里的链接也需要自己构建,所以需要有个 list 构建链接的地址信息
import requests
from bs4 import BeautifulSoup
# Douban Top-250 is paginated 25 movies per page; `start` is the page offset.
# Build the 10 page URLs with a comprehension instead of a manual append loop.
urls = [
    f'https://movie.douban.com/top250?start={i}&filter='
    for i in range(0, 250, 25)
]

# Browser-like request headers so the server does not reject the scraper.
# NOTE: fixed typo 'gizp' -> 'gzip' so the Accept-Encoding value is valid.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    'Accept-Encoding': 'gzip,deflate',
    'Accept': '*/*',
}
2、解析单个网页,获取电影的数据
- 先根据获得的网页数据,根据网页信息,找到 article_items 信息
- 再循环 article_items 信息,得到电影的 排名、文章标题、评论数、等级等信息
def parse_single_html(html):
    """Extract the 25 movie entries from one Top-250 list page.

    html: a requests Response whose ``.text`` holds the page HTML.
    Returns a list of dicts with keys rank / title / rating_num /
    rating_star / comments (all values are strings).
    """
    soup = BeautifulSoup(html.text, 'html.parser')
    items = (soup.find('div', class_='article')
                 .find('ol', class_='grid_view')
                 .find_all('div', class_='item'))
    movies = []
    for item in items:
        hd_div = item.find('div', class_='info').find('div', class_='hd')
        star_spans = (item.find('div', class_='bd')
                          .find('div', class_='star')
                          .find_all('span'))
        # span[0] carries a class like "rating5-t"; stripping the wrapper
        # text leaves just the star count digit.
        star_class = star_spans[0]['class'][0]
        movies.append({
            "rank": item.find('div', class_='pic').find('em').get_text(),
            "title": hd_div.find('span', class_='title').get_text(),
            "rating_num": star_spans[1].get_text(),
            "rating_star": star_class.replace('rating', '').replace('-t', ''),
            "comments": star_spans[3].get_text().replace('人评价', ''),
        })
    return movies
3、循环网址,得到每个网址下的电影信息。
- 将第一步的网址循环,然后获得每个网页信息
# Accumulate the movie dicts from all 10 pages.
data = []
for url in urls:  # iterate the urls directly instead of indexing range(10)
    r = requests.get(url, headers=headers)
    if r.status_code != 200:
        # Skip pages that failed to download instead of parsing an error page.
        print('error!')
        continue
    # Parse each response exactly once (the original parsed twice: once for
    # the debug print and once for collection).
    page_data = parse_single_html(r)
    print(page_data)
    print('*' * 100)
    data.extend(page_data)
4、将爬取到的电影数据写入 Excel 表格。
- 将获得的数据写入表格里面
import pandas as pd

# Persist the scraped records to an Excel workbook.
df = pd.DataFrame(data)
df.to_excel('电影数据.xlsx')