Python 爬虫目录
1、Python3 爬取前程无忧招聘网 lxml+xpath
2、Python3 Mysql保存爬取的数据 正则
3、Python3 用requests 库 和 bs4 库 最新爬豆瓣电影Top250
4、Python Scrapy 爬取 前程无忧招聘网
5、持续更新…
Python3 用requests 库 和 bs4库 最新爬豆瓣电影Top250
用xlwt 库存储数据
爬取网站:https://movie.douban.com/top250?start=0&filter=
首先安装requests 和 BeautifulSoup库
本人用PyCharm 编写代码
具体代码和解释已在代码注释中描述
import requests
from bs4 import BeautifulSoup
import xlwt
def main():
    """Crawl the first page of Douban Movie Top250 and save it to an .xls file."""
    base_url = 'https://movie.douban.com/top250?start=0&filter='
    movies = get_data(base_url)          # scrape and parse one result page
    save_path = "豆瓣电影top250.xls"      # output workbook name
    saveData(movies, save_path, base_url)
# 获取网站请求
def get_response(url, timeout=10):
    """Download *url* and return the page body as text.

    Sends browser-like headers (User-Agent/Host/Referer) so douban.com
    serves the page instead of rejecting the crawler.

    Args:
        url: Page URL to fetch.
        timeout: Seconds to wait for the server (new optional parameter;
            the original had no timeout and could hang forever).

    Returns:
        The HTML text on HTTP 200, or ``None`` on a non-200 status or any
        request failure.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
        'Host': 'movie.douban.com',
        'Referer': 'https://movie.douban.com/top250?start=75&filter=',
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            return response.text
    except requests.RequestException as e:
        # RequestException is the base class: also covers timeouts and
        # invalid-URL errors, not just ConnectionError as before.
        print('Error', e.args)
    return None  # explicit: non-200 response or request failure
# 解析
def get_data(url):
    """Download one Top250 page and parse it into a list of movie dicts.

    Each dict carries the keys: ``movieName``, ``movie_actor``,
    ``movie_style``, ``movie_ratting``, ``movie_people``, ``movie_inq``,
    ``href``.

    Args:
        url: URL of one Top250 result page.

    Returns:
        A list of movie dicts; an empty list if the page download failed.
    """
    movies = []  # movies collected from this one page
    html = get_response(url)
    if html is None:
        # Bug fix: get_response returns None on failure; the original fed
        # None straight into BeautifulSoup and crashed with a TypeError.
        return movies
    soup = BeautifulSoup(html, 'html.parser')
    # class="bd": the first match on the page is not a movie entry, skip it
    actors_info = soup.find_all('div', attrs={'class': 'bd'})[1:]
    main_infos = soup.find_all('div', attrs={'class': 'hd'})
    # Walk both lists in lockstep: one "hd" (title block) per "bd" (detail block)
    for bd, hd in zip(actors_info, main_infos):
        movie = {}
        link = hd.select('a')[0]
        href = link['href']  # movie detail-page link
        # stripped_strings yields every non-tag descendant string with
        # surrounding whitespace removed
        inf = list(bd.stripped_strings)
        infs = list(hd.stripped_strings)
        del infs[-1]  # drop the last extracted string (not part of the title)
        movie['movieName'] = ''.join(infs)  # join title fragments
        movie['movie_actor'] = inf[0]    # director / cast line
        movie['movie_style'] = inf[1]    # year / region / genre line
        movie['movie_ratting'] = inf[2]  # rating score
        movie['movie_people'] = inf[3]   # number of raters
        # Bug fix: some entries have no one-line quote; guard against
        # IndexError instead of crashing mid-page.
        movie['movie_inq'] = inf[4] if len(inf) > 4 else ''
        movie['href'] = href
        movies.append(movie)
    return movies
# 保留数据至列表
def saveData(Movie_data, savepath, baseurl):
    """Write the scraped movie records into an .xls workbook.

    Args:
        Movie_data: List of movie dicts as produced by ``get_data``.
        savepath: Output path of the workbook.
        baseurl: Kept for backward compatibility with existing callers;
            no longer used — the data is no longer re-downloaded here.
    """
    print('保存中…………………………')
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('豆瓣电影top250', cell_overwrite_ok=True)
    col = ('影片名', '影片导演', '影片类型', '影片评分', '影片评价人数', '影片简介', '影片链接')
    for i, header in enumerate(col):
        sheet.write(0, i, header)  # header row
    # Bug fix: the original ignored the Movie_data argument and called
    # get_data(baseurl) again, downloading the same page a second time.
    keys = ('movieName', 'movie_actor', 'movie_style', 'movie_ratting',
            'movie_people', 'movie_inq', 'href')
    for row, item in enumerate(Movie_data, start=1):
        for colno, key in enumerate(keys):
            sheet.write(row, colno, item[key])
    # Bug fix: save to the savepath argument instead of a hard-coded name
    # (the existing caller passes the same string, so output is unchanged).
    book.save(savepath)
# Script entry point: crawl, save, then report completion.
if __name__ == '__main__':
    main()
    print('保存成功')
# 以上代码只爬取一页信息,爬取全部 10 页(共 250 条)的代码还没写,想学了多进程再开始编写,下面是生成 10 页网址的格式
# 生成 10 页的网址
# num = 0
# for i in range(0, 10):   # Top250 共 10 页、每页 25 条;原来的 range(0, 11) 会多生成一页
#     num = i * 25
#     url = 'https://movie.douban.com/top250?start=' + str(num) + '&filter='   # 注意 start 后面要有 '='
#     print(url)
我用了xlwt 库进行了数据的存储,效果如下:
本文未采用正则表达式,适合想学习用 bs4 提取数据的读者。我也是初学者,分享一下所学的东西,欢迎交流