# 简单的爬虫代码:爬取数据并将其保存为表格
import re
import urllib
import urllib.error
import urllib.request

import requests
import xlwt
from bs4 import BeautifulSoup
def main():
    """Scrape the Douban Top 250 listing and store the rows in an .xls file."""
    savepath = "豆瓣电影Top250.xls"
    rows = getData("https://movie.douban.com/top250?start=")
    saveData(rows, savepath)
# Regular expressions applied to the HTML of a single movie entry.
# Every capture is non-greedy so that a match stops at the first closing
# delimiter, even when several tags sit on the same line.
findLink = re.compile(r'<a href="(.*?)">')  # href of an <a> tag
findImgSrc = re.compile(r'<img.*?src="(.*?)"', re.S)  # src of an <img> tag (re.S lets "." cross newlines)
findTitle = re.compile(r'<span class="title">(.*?)</span>')  # movie title
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')  # rating score
findJudge = re.compile(r'<span>(\d*)人评价</span>')  # number of people who rated
findInq = re.compile(r'<span class="inq">(.*?)</span>')  # one-line quote
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)  # director / cast paragraph
def getData(baseurl, pages=10):
    """Download the listing pages and return one row of fields per entry.

    Args:
        baseurl: URL prefix; the start offset (page index * 25) is appended.
        pages: Number of listing pages to request. The default of 10 covers
            250 entries at 25 entries per page.

    Returns:
        A list of rows. Each row is a list of ten strings: link, image URL,
        title, rating, number of ratings, quote, detail text, year, country
        and genre. Quote, year and country fall back to a single space when
        they cannot be located.

    Raises:
        IndexError: If an entry lacks the link, image, title, rating,
            rating count or detail paragraph.
    """
    dataList = []
    for page in range(pages):
        html = askUrl(baseurl + str(page * 25))
        soup = BeautifulSoup(html, "html.parser")
        # Each <div class="item"> is handled as a standalone HTML fragment.
        for entry in soup.find_all('div', class_="item"):
            dataList.append(_parseItem(str(entry)))
    return dataList


def _parseItem(item):
    """Extract the ten output fields from the HTML text of one entry."""
    row = [
        findLink.findall(item)[0],
        findImgSrc.findall(item)[0],
        findTitle.findall(item)[0],
        findRating.findall(item)[0],
        findJudge.findall(item)[0],
    ]

    quotes = findInq.findall(item)
    # Not every entry has a quote; a single space keeps the column count fixed.
    row.append(quotes[0].replace("。", "") if quotes else " ")

    detail = findBd.findall(item)[0]
    detail = re.sub(r'<br(\s+)?/>(\s+)?', " ", detail)  # drop <br/> tags
    detail = detail.strip().replace('/', " ")  # slashes act as separators
    row.append(detail)
    row.extend(_splitDetails(detail))
    return row


def _splitDetails(detail):
    """Pick (year, country, genre) out of the whitespace-separated detail text."""
    blank = " "
    tokens = detail.split()
    if not tokens:
        return blank, blank, blank

    # The year is taken to be the last token made up only of digits.
    yearAt = None
    for pos, token in enumerate(tokens):
        if token.isdigit():
            yearAt = pos

    if yearAt is None:
        # Without a year anchor the neighbouring fields cannot be located.
        year = country = blank
    else:
        year = tokens[yearAt]
        # The token right after the year is used as the country.
        country = tokens[yearAt + 1] if yearAt + 1 < len(tokens) else blank

    # The final token is used as the genre.
    return year, country, tokens[-1]
def askUrl(url):
    """Request *url* and return the response body decoded as UTF-8.

    Args:
        url: Address to request.

    Returns:
        The decoded body, or an empty string when the request raises
        ``urllib.error.URLError`` (the error's code and/or reason is printed).
    """
    head = {
        "user-agent": ""  # fill in your own User-Agent string
    }
    html = ""
    request = urllib.request.Request(url, headers=head)
    try:
        # The context manager closes the response even if read/decode fails.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # Report whichever details the error object exposes.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
def saveData(dataList, savepath, maxRows=250):
    """Write the scraped rows to an .xls workbook.

    Args:
        dataList: List of rows; the first ten fields of each row are written.
        savepath: Destination path of the workbook.
        maxRows: Upper bound on the number of data rows written.
    """
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)  # create the workbook
    sheet = book.add_sheet('豆瓣电影Top', cell_overwrite_ok=True)  # create the worksheet
    col = ("豆瓣链接", "图片链接", "影片名", "评分", "评分人数", "简评", "演员表", "年份", "国家/地区", "类型")

    # Row 0 holds the column headings.
    for j, heading in enumerate(col):
        sheet.write(0, j, heading)

    # Slicing (rather than indexing 0..249) tolerates a result list that is
    # shorter than maxRows, e.g. when some pages could not be fetched.
    for i, row in enumerate(dataList[:maxRows], start=1):
        for j, value in enumerate(row[:len(col)]):
            sheet.write(i, j, value)

    book.save(savepath)
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()
# 有些地方的写法比较笨重,欢迎指正。