# Code from my recent study of scraping the Douban Movie Top 250 (not yet polished; refining it gradually).
from bs4 import BeautifulSoup
import re
import urllib.request,urllib.error
import xlwt
import sqlite3
def main():
    """Entry point: crawl all Top-250 list pages and save them to Excel."""
    top250_url = "https://movie.douban.com/top250?start="
    movies = getData(top250_url)
    out_dir = "C:\\python\\1"
    saveData(movies, out_dir)
# Pre-compiled patterns used by getData() to extract one field each from the
# stringified <div class="item"> snippet of a Top-250 list page.
findLink=re.compile(r'<a href="(.*?)">')  # movie detail-page URL
findimgsrc=re.compile(r'img.*src="(.*?)"',re.S)  # poster image URL (re.S: src may follow a newline)
findtitle=re.compile(r'<span class="title">(.*)</span>')  # title spans (Chinese, and sometimes a second/foreign one)
findrating=re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')  # average score
findjudge=re.compile(r'<span>(\d*)人评价</span>')  # number of ratings ("...人评价" = "people rated")
findinq=re.compile(r'<span class="inq">(.*)</span>')  # one-line tagline/summary
findbd=re.compile(r'<p class="">(.*?)</p>',re.S)  # director/cast/year block (spans multiple lines)
def askUrl(url):
    """Fetch *url* and return its body decoded as UTF-8.

    Returns an empty string on failure; HTTP status code and/or reason are
    printed for diagnosis instead of raising.
    """
    # Send a real browser User-Agent. The original code shipped a Chinese
    # placeholder instruction here, which would be sent literally and makes
    # Douban reject the request as a bot.
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/120.0.0.0 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")  # Douban pages are UTF-8
    except urllib.error.URLError as e:
        # HTTPError carries .code; plain URLError only .reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
def getData(baseurl):
    """Scrape all 10 Top-250 list pages and return one row per movie.

    Each row is a list of 8 strings:
    [detail link, image URL, Chinese title, other title, rating,
     vote count, one-line summary, director/cast/year info].
    """
    datalist = []
    for page in range(0, 10):  # 10 pages x 25 movies
        url = baseurl + str(page * 25)
        html = askUrl(url)
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all("div", class_="item"):
            data = []
            item = str(item)  # regexes below operate on the raw HTML snippet
            data.append(re.findall(findLink, item)[0])
            data.append(re.findall(findimgsrc, item)[0])
            title = re.findall(findtitle, item)
            if len(title) == 2:
                data.append(title[0])                       # Chinese title
                data.append(title[1].replace("/", ""))      # foreign title, '/' separator stripped
            else:
                data.append(title[0])
                data.append(" ")  # placeholder keeps every row at 8 columns
            data.append(re.findall(findrating, item)[0])
            data.append(re.findall(findjudge, item)[0])
            inq = re.findall(findinq, item)
            if inq:
                data.append(inq[0].replace("。", ""))  # drop trailing full stop
            else:
                data.append(" ")
            bd = re.findall(findbd, item)[0]
            bd = re.sub(r'<br(\s+)?/>(\s+)?', " ", bd)
            # BUG FIX: the original assigned this result to a typo variable
            # 'ba', so the '/' separators were never actually removed.
            bd = re.sub('/', " ", bd)
            data.append(bd.strip())
            datalist.append(data)
    return datalist
def saveData(datalist, savepath):
    """Write scraped movie rows into '豆瓣电影Top250.xls' (one row per movie).

    NOTE(review): *savepath* is accepted but never used — the workbook is
    always saved under a fixed name in the working directory; confirm
    whether it should honour savepath.
    """
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet("豆瓣电影Top250", cell_overwrite_ok=True)
    col = ("电影详情链接", "图片链接", "影片中文名", "影片外文名", "评分", "评价人数", "概况", "相关信息")
    for j, header in enumerate(col):
        sheet.write(0, j, header)
    # Iterate over however many rows were actually scraped rather than a
    # hard-coded 250, so a partial crawl no longer raises IndexError.
    for i, data in enumerate(datalist):
        print("第%d条" % (i + 1))  # progress indicator ("item #N")
        for j in range(8):
            sheet.write(i + 1, j, data[j])
    book.save("豆瓣电影Top250.xls")
# Script entry point: run the crawl, then report completion.
if __name__=="__main__":
    main()
    print("爬取完毕!")  # "Scraping finished!"