这2天突然想学python,就来一波吧!!
主要用于学习!!!
搞了什么
爬取豆瓣Top250电影
准备
- 需要你学习几个相关的库
from bs4 import BeautifulSoup # 网页解析,获取数据
import re # 正则表达式,进行文字匹配
import urllib.request, urllib.error # 定制url,获取网页数据
import xlwt # 进行Excel操作
其中最主要的还是正则表达式
看看要哪几步操作?
def main():
    """Crawl the Douban Top 250 movie list and export it to an Excel file."""
    # Output workbook, written next to the current working directory.
    outputPath = ".\\豆瓣电影Top250.xls"
    # Listing URL without the offset; getData appends the start index.
    listUrl = "https://movie.douban.com/top250?start="
    # Fetch and parse every page, then persist the collected rows.
    movies = getData(listUrl)
    saveData(movies, outputPath)


if __name__ == "__main__":
    main()
抓取网页
伪造header
def askHtml(url):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or ``None`` when the request fails with a
        ``urllib.error.URLError``; in that case the error's ``code`` and/or
        ``reason`` (whichever are present) are printed.
    """
    # Send a browser-like User-Agent instead of urllib's default one
    # (presumably so the site serves the page as it would to a browser).
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36"
    }
    req = urllib.request.Request(url=url, headers=headers)
    try:
        # The context manager closes the response even if read/decode raises.
        with urllib.request.urlopen(req) as response:
            return response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # HTTPError carries a status code; plain URLError only has a reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    # Make the failure result explicit for callers.
    return None
解析页面
<li>
<div class="item">
<div class="pic">
<em class="">1</em>
<a href="https://movie.douban.com/subject/1292052/">
<img width="100" alt="肖申克的救赎" src="https://img3.doubanio.com/view/photo/s_ratio_poster/public/p480747492.webp" class="">
</a>
</div>
<div class="info">
<div class="hd">
<a href="https://movie.douban.com/subject/1292052/" class="">
<span class="title">肖申克的救赎</span>
<span class="title"> / The Shawshank Redemption</span>
<span class="other"> / 月黑高飞(港) / 刺激1995(台)</span>
</a>
<span class="playable">[可播放]</span>
</div>
<div class="bd">
<p class="">
导演: 弗兰克·德拉邦特 Frank Darabont 主演: 蒂姆·罗宾斯 Tim Robbins /...<br>
1994 / 美国 / 犯罪 剧情
</p>
<div class="star">
<span class="rating5-t"></span>
<span class="rating_num" property="v:average">9.7</span>
<span property="v:best" content="10.0"></span>
<span>2146145人评价</span>
</div>
<p class="quote">
<span class="inq">希望让人自由。</span>
</p>
</div>
</div>
</div>
</li>
你要先分析页面的元素特征,再进一步用正则表达式匹配内容~
正则表达式匹配
# 正则表达式
# Pre-compiled patterns for the fields of one movie entry.
# Text captures are non-greedy so a match stops at the first closing
# delimiter, even when several tags end up on the same line.
findLink = re.compile(r'<a href="(.*?)">')  # detail-page link
# [^>]*? keeps the search inside a single <img ...> tag; re.S lets "." in
# the capture also match newlines.
findImgSrc = re.compile(r'<img[^>]*?src="(.*?)"', re.S)  # poster image URL
findTitle = re.compile(r'<span class="title">(.*?)</span>')  # title span(s)
findRead = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')  # rating
findJudge = re.compile(r'<span>(\d*)人评价</span>')  # number of ratings
findInq = re.compile(r'<span class="inq">(.*?)</span>')  # one-line quote
findContext = re.compile(r'<p class="">(.*?)</p>', re.S)  # director/cast/year paragraph
# 捉取网页
def _firstMatch(pattern, text, default=""):
    """Return the first match of *pattern* in *text*, or *default* if none."""
    found = re.findall(pattern, text)
    return found[0] if found else default


def _parseItem(item):
    """Extract the eight exported fields from one movie's HTML fragment."""
    titles = re.findall(findTitle, item)
    # Always store a string: the first title, or "" when none was found.
    name = titles[0] if titles else ""
    # The second title span is prefixed with "/"; " " is the placeholder
    # used when an entry has no second title.
    altName = titles[1].replace("/", "") if len(titles) > 1 else " "

    # The quote is optional; drop the full-width period when it is present.
    inq = _firstMatch(findInq, item).replace("。", "")

    context = _firstMatch(findContext, item)
    # Replace <br/> (plus trailing whitespace) and "/" separators by spaces.
    context = re.sub(r'<br(\s+)?/>(\s+)?', " ", context)
    context = context.replace("/", " ")

    return [
        name,
        altName,
        _firstMatch(findLink, item),
        _firstMatch(findImgSrc, item),
        inq,
        _firstMatch(findJudge, item),
        context,
        _firstMatch(findRead, item),
    ]


def getData(baseUrl):
    """Fetch the ten listing pages and parse every movie entry on them.

    Args:
        baseUrl: Listing URL ending in ``start=``; the offsets 0, 25, ...,
            225 are appended to address each page.

    Returns:
        A list with one 8-item list per movie, ordered as: title, second
        title, detail link, image URL, quote, number of ratings, info
        paragraph, rating. Fields that cannot be found are empty strings
        (``" "`` for a missing second title).
    """
    dataList = []
    for page in range(10):
        html = askHtml(baseUrl + str(page * 25))
        if not html:
            # askHtml already reported the error; skip pages that failed.
            continue
        soup = BeautifulSoup(html, "html.parser")
        # Each movie is rendered as <div class="item">...</div>.
        for item in soup.find_all('div', class_="item"):
            data = _parseItem(str(item))
            print(data)
            dataList.append(data)
    return dataList
保存数据
将数据保存到Excel中
# 保存数据
def saveData(dataList, savePath):
    """Write the collected movie rows to an Excel workbook at *savePath*.

    Args:
        dataList: Sequence of rows; each row is a sequence of cell values
            in the same order as the header columns.
        savePath: Destination path of the ``.xls`` file.

    Row 0 holds the column headers; data rows follow from row 1. Only as
    many rows as *dataList* actually contains are written, and at most one
    cell per header column for each row.
    """
    print("save..")
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet('豆瓣电影Top250', cell_overwrite_ok=True)
    col = ('名称', '英文名称', '电影详情链接', '图片链接', '概述', '评分数', '相关信息', '评分')
    for j, header in enumerate(col):
        sheet.write(0, j, header)
    # Iterate over what was really collected instead of assuming 250 rows
    # of 8 fields, so a partial crawl no longer raises IndexError.
    for i, data in enumerate(dataList, start=1):
        for j, value in enumerate(data[:len(col)]):
            sheet.write(i, j, value)
    book.save(savePath)
    print("save over .. .")