利用Python爬虫爬取豆瓣电影Top250

Python:利用爬虫爬取豆瓣电影Top250

import re
import urllib
import urllib.error
import urllib.request

import openpyxl
from bs4 import BeautifulSoup
def get_url(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    The request is sent with a browser-like ``User-Agent`` header.

    Args:
        url: Address to request.

    Returns:
        The decoded response text. If the request fails with an
        ``HTTPError`` or ``URLError``, the error details are printed and an
        empty string is returned so the caller always receives a ``str``.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=headers)
    try:
        # The context manager closes the response even if read/decode raises.
        with urllib.request.urlopen(request) as response:
            return response.read().decode('utf-8')
    except urllib.error.HTTPError as e:
        # HTTPError is a subclass of URLError, so it has to be caught first.
        print(e.code, e.headers, e.reason)
    except urllib.error.URLError as e:
        print(e.reason)
    # Failure path: return a defined value instead of referencing a variable
    # that was never bound (which would raise UnboundLocalError).
    return ""
def base_url(baseUrl):
    """Download ten listing pages and extract one record per ``div.item``.

    Page URLs are built by appending ``0, 25, ..., 225`` to *baseUrl*; each
    URL is printed before it is requested through ``get_url``.

    Args:
        baseUrl: URL prefix that ends right before the numeric offset.

    Returns:
        A list of six-element lists:
        ``[link, title, description, rating, first <span> text, quote]``.
    """
    rows = []
    for page in range(10):  # ten requests, offset grows by 25 each time
        page_url = baseUrl + str(page * 25)
        print(page_url)  # show the URL of every request
        soup = BeautifulSoup(get_url(page_url), 'html.parser')
        for entry in soup.find_all("div", attrs={"class":"item"}):
            link = entry.find("a").get("href")
            title = entry.find("span", attrs={"class":"title"}).get_text()
            # Text of the first <p class=""> element: trim it, then delete
            # newlines, non-breaking spaces and ordinary spaces.
            info = entry.find("p", attrs={"class":""}).get_text()
            info = info.strip().replace("\n", "").replace("\xa0", "").replace(" ", "")
            # Slashes and ellipses each become a single space.
            info = info.replace("/", " ").replace("...", " ")
            # The remaining fields are pulled out of the raw markup by regex.
            markup = str(entry)
            grade = re.findall(r'<span class="rating_num" property="v:average">(.*?)</span>', markup)[0]
            votes = re.findall(r'<span>(.*?)</span>', markup)[0]
            quotes = re.findall(r'<span class="inq">(.*)</span>', markup)
            # Entries without an "inq" span get a placeholder text.
            quote = quotes[0] if quotes else '此电影没有任何评价'
            rows.append([link, title, info, grade, votes, quote])
    return rows
# Save the scraped rows into an Excel workbook.
def save_data(data, path="/home/aistudio/external-libraries/豆瓣电影Top250.xlsx"):
    """Write *data* to an ``.xlsx`` file with a header row.

    Args:
        data: Sequence of records; the first six values of each record are
            written to columns 1-6 of consecutive rows below the header.
        path: Destination file. Defaults to the location the script
            originally used.
    """
    wb = openpyxl.Workbook()
    # NOTE: create_sheet adds a new sheet next to the workbook's initial
    # default sheet, which is left empty.
    sheet = wb.create_sheet("豆瓣电影Top250")
    sheet.append(("电影链接网址", "电影名字",  "电影演员", "豆瓣评分", "评价人数", "电影概要"))
    # Iterate over the rows actually present rather than assuming exactly
    # 250, so a partial crawl no longer raises IndexError.
    for record in data:
        sheet.append(list(record[:6]))
    wb.save(path)
def main():
    """Announce the start, scrape the Top250 listing and save the result."""
    print("开始爬取......")
    listing_prefix = "https://movie.douban.com/top250?start="
    save_data(base_url(listing_prefix))


# Run the crawl when the file is executed, then report completion.
main()

print("爬取成功")
开始爬取......
https://movie.douban.com/top250?start=0
https://movie.douban.com/top250?start=25
https://movie.douban.com/top250?start=50
https://movie.douban.com/top250?start=75
https://movie.douban.com/top250?start=100
https://movie.douban.com/top250?start=125
https://movie.douban.com/top250?start=150
https://movie.douban.com/top250?start=175
https://movie.douban.com/top250?start=200
https://movie.douban.com/top250?start=225
爬取成功

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值