Python:利用爬虫爬取豆瓣电影Top250
import re
import urllib
import urllib.error
import urllib.request

import openpyxl
from bs4 import BeautifulSoup
def get_url(url):
    """Download *url* and return its body decoded as UTF-8.

    The request is sent with a desktop-browser User-Agent header.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response text, or an empty string when the request
        fails with an HTTPError or URLError (the failure details are
        printed to stdout).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=headers)
    try:
        # The context manager closes the response even if read/decode fails.
        with urllib.request.urlopen(request) as response:
            return response.read().decode('utf-8')
    except urllib.error.HTTPError as e:
        # HTTPError is a subclass of URLError, so it must be caught first.
        print(e.code, e.headers, e.reason)
    except urllib.error.URLError as e:
        print(e.reason)
    # Previously this path hit `return html` with `html` unbound and raised
    # UnboundLocalError; an empty document lets the caller carry on.
    return ""
# Patterns applied to the serialized HTML of a single film entry. They are
# compiled once here instead of being looked up again for every entry.
_RATING_RE = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')
_VOTES_RE = re.compile(r'<span>(.*?)</span>')
_QUOTE_RE = re.compile(r'<span class="inq">(.*)</span>')


def _first_match(pattern, text, default=""):
    """Return the first captured group of *pattern* in *text*, else *default*."""
    found = pattern.findall(text)
    return found[0] if found else default


def _clean_film_bd(text):
    """Normalize the text of a film entry's info paragraph."""
    text = text.strip().replace("\n", "")
    if "\xa0" in text:
        # NOTE(review): the source's indentation was lost; removing plain
        # spaces is assumed to belong to this branch -- confirm.
        text = text.replace("\xa0", "").replace(" ", "")
    return text.replace("/", " ").replace("...", " ")


def _parse_item(item):
    """Turn one ``<div class="item">`` tag into a six-field row."""
    film_url = item.find("a").get("href")
    film_title = item.find("span", attrs={"class": "title"}).get_text()
    film_bd = _clean_film_bd(item.find("p", attrs={"class": ""}).get_text())
    # Serialize the tag once; all three patterns scan the same markup.
    markup = str(item)
    return [
        film_url,
        film_title,
        film_bd,
        _first_match(_RATING_RE, markup),
        _first_match(_VOTES_RE, markup),
        _first_match(_QUOTE_RE, markup, '此电影没有任何评价'),
    ]


def base_url(baseUrl, pages=10, page_size=25):
    """Scrape consecutive listing pages and collect one row per film entry.

    Each page URL is ``baseUrl`` followed by the start offset
    (``0, page_size, 2 * page_size, ...``); every URL is printed before it
    is fetched with ``get_url``.

    Args:
        baseUrl: URL prefix that the start offset is appended to.
        pages: Number of pages to fetch (default 10).
        page_size: Offset step between pages (default 25).

    Returns:
        A list of rows ``[url, title, info, rating, votes, quote]``. The
        rating and votes fall back to ``""`` and the quote falls back to a
        fixed placeholder when the markup lacks them, so one unusual entry
        no longer aborts the crawl with an IndexError.
    """
    data = []
    for page in range(pages):
        url = baseUrl + str(page * page_size)
        print(url)
        soup = BeautifulSoup(get_url(url), 'html.parser')
        data.extend(
            _parse_item(item)
            for item in soup.find_all("div", attrs={"class": "item"})
        )
    return data
def save_data(data, path="/home/aistudio/external-libraries/豆瓣电影Top250.xlsx"):
    """Write the scraped rows to an .xlsx workbook.

    Args:
        data: Iterable of rows; the first six fields of each row are
            written in the order of the header columns.
        path: Destination file; defaults to the location the script has
            always written to.

    All rows present in ``data`` are written. The previous fixed loop over
    250 indices raised IndexError whenever fewer rows had been scraped.
    """
    wb = openpyxl.Workbook()
    # A new Workbook already holds one empty sheet. Reuse it rather than
    # adding a second one, so the data is on the first sheet the user sees.
    sheet = wb.active
    sheet.title = "豆瓣电影Top250"
    sheet.append(("电影链接网址", "电影名字", "电影演员", "豆瓣评分", "评价人数", "电影概要"))
    for row in data:
        # append() fills the next free row, i.e. row 2 onwards after the header.
        sheet.append(list(row[:6]))
    wb.save(path)
def main():
    """Announce the crawl, scrape the Top250 listing and save the rows."""
    print("开始爬取......")
    listing_prefix = "https://movie.douban.com/top250?start="
    # The scraped rows go straight to the spreadsheet writer.
    save_data(base_url(listing_prefix))
# Run the crawl at module level: this executes whenever the file is run or
# imported. The success message is reached only if main() did not raise.
main()
print("爬取成功")
开始爬取......
https://movie.douban.com/top250?start=0
https://movie.douban.com/top250?start=25
https://movie.douban.com/top250?start=50
https://movie.douban.com/top250?start=75
https://movie.douban.com/top250?start=100
https://movie.douban.com/top250?start=125
https://movie.douban.com/top250?start=150
https://movie.douban.com/top250?start=175
https://movie.douban.com/top250?start=200
https://movie.douban.com/top250?start=225
爬取成功