刚开始学爬虫,跟着哔哩哔哩(bilibili)上的课程《Python爬虫编程基础5天速成(2021全新合集)Python入门+数据分析》学习,自己简单敲了一遍:
import urllib.request
import urllib.error
import re
from bs4 import BeautifulSoup
import xlwt
# Listing URL of the Douban Top 250 chart; a numeric start offset is appended to it.
baseurl = 'https://movie.douban.com/top250?start='

# Pre-compiled regular expressions, one per field extracted from a movie entry's HTML.
# Each pattern has exactly one capture group, so findall() yields plain strings.
findLink = re.compile(pattern=r'<a href="(.*?)">')  # detail-page link
findImgSrc = re.compile(
    pattern=r'<img .*src="(.*?)"',
    flags=re.DOTALL,  # let "." also match newlines inside the tag
)  # poster image URL
findTitle = re.compile(pattern=r'<span class="title">(.*?)</span>')  # title(s)
findRating = re.compile(
    pattern=r'<span class="rating_num" property="v:average">(.*?)</span>'
)  # average rating
findJudge = re.compile(pattern=r'<span>(\d*)人评价</span>')  # number of ratings
findInq = re.compile(pattern=r'<span class="inq">(.*?)</span>')  # one-line summary
findBd = re.compile(
    pattern=r'<p class="">(.*?)</p>',
    flags=re.DOTALL,  # the paragraph spans several lines
)  # related-info paragraph
# 1. Fetch a web page
def askHtml(url, timeout=10):
    """Download *url* and return its body decoded as UTF-8 text.

    The request is sent with a desktop-browser User-Agent header.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on a blocking network operation before
            giving up, so a stalled connection cannot hang the run.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'}
    request = urllib.request.Request(url=url, headers=head)
    # The context manager closes the response (and its connection) even if
    # reading or decoding raises.
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.read().decode('utf-8')
# 2. Parse the data
# Matches a <br/> tag together with any whitespace that follows it.
_BR_TAG = re.compile(r'<br(\s+)?/>(\s+)?')


def _first_match(pattern, text, default=' '):
    """Return the first match of *pattern* in *text*, or *default* if there is none."""
    found = pattern.findall(text)
    return found[0] if found else default


def _parse_item(item_html):
    """Extract the eight output fields from the HTML of one movie entry.

    A field that cannot be found is filled with a blank placeholder instead
    of raising IndexError.
    """
    titles = findTitle.findall(item_html)
    if len(titles) == 2:
        ctitle = titles[0]                    # Chinese title
        otitle = titles[1].replace('/', '')   # foreign title, separator removed
    else:
        ctitle = titles[0] if titles else ' '
        otitle = ' '                          # leave the foreign title blank

    # One-line summary, with the full stop removed.
    inq = _first_match(findInq, item_html).replace('。', '')

    # Related info: turn <br/> tags and slashes into spaces, then trim the ends.
    bd = _first_match(findBd, item_html, default='')
    bd = _BR_TAG.sub(' ', bd).replace('/', ' ').strip()

    return [
        _first_match(findLink, item_html),    # detail-page link
        _first_match(findImgSrc, item_html),  # image link
        ctitle,
        otitle,
        _first_match(findRating, item_html),  # rating
        _first_match(findJudge, item_html),   # number of ratings
        inq,
        bd,
    ]


def get_Datalist(pages=10):
    """Fetch the listing pages and return one row of fields per movie entry.

    Args:
        pages: Number of listing pages to fetch. Page ``i`` is requested with
            start offset ``i * 25``.

    Returns:
        A list of rows. Each row is a list of eight strings in this order:
        detail link, image link, Chinese title, foreign title, rating,
        number of ratings, one-line summary, related info.
    """
    dataList = []
    for i in range(pages):
        html = askHtml(baseurl + str(i * 25))
        soup = BeautifulSoup(html, 'html.parser')
        for item in soup.find_all('div', class_='item'):
            # The regexes operate on the raw markup of the entry.
            dataList.append(_parse_item(str(item)))
    return dataList
def saveData(dataList, savepath='豆瓣电影.xls'):
    """Write the scraped rows to an .xls workbook.

    Row 0 of the sheet holds the column headings; each entry of *dataList*
    is written on the following rows. Every row that is present is written,
    however many there are.

    Args:
        dataList: Sequence of rows, each a sequence of cell values in the
            order of the column headings. Values beyond the last heading
            are ignored.
        savepath: Path of the workbook file to create.
    """
    workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)
    worksheet = workbook.add_sheet('sheet1', cell_overwrite_ok=True)
    col = ('电影详情链接', '图片链接', '影片中文名', '外文名', '评分', '评价数', '概况', '相关信息')
    for j, heading in enumerate(col):
        worksheet.write(0, j, heading)  # column headings
    # Iterate over the rows actually present instead of assuming 250 of them.
    for i, row in enumerate(dataList, start=1):
        for j, value in enumerate(row[:len(col)]):
            worksheet.write(i, j, value)
    workbook.save(savepath)
if __name__ == '__main__':
    # Run the scrape and write the spreadsheet only when executed as a
    # script, so importing this module triggers no network or file I/O.
    dataList = get_Datalist()
    saveData(dataList)