Python爬虫【豆瓣电影Top250】【利用正则表达式】

不知道有没有人看,先不具体讲思路了,直接上代码:

#爬取豆瓣Top250电影信息
import requests
import re
import xlwt

def getHtml(url, timeout=10):
    """Download a page and return its decoded text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole crawl.

    Returns:
        The response body as a string, or None when the request fails
        (network error, timeout, or an HTTP 4xx/5xx status).
    """
    head = {"user-agent": "Mozilla/5.0"}
    try:
        r = requests.get(url, headers=head, timeout=timeout)
        # raise_for_status is a method; it must be called to turn
        # HTTP error statuses into exceptions.
        r.raise_for_status()
        # Use the encoding detected from the body rather than the header guess.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException as err:
        print("Failed to fetch {}: {}".format(url, err))
        return None

def getData(datas, html):
    """Extract movie records from one list page and append them to datas.

    Each appended record is a list of nine strings:
    [name, link, score, rater count, director, actor, year, area, kind].

    Fields are located with independent regular expressions and paired up
    by position, so extraction stops at the first record for which any
    field is missing; records appended before that point are kept.

    Args:
        datas: List that receives the extracted records (mutated in place).
        html: Page source as a string. None or an empty string is ignored.
    """
    if not html:
        # A failed download yields no text; there is nothing to parse.
        return

    # Raw strings keep the regex escapes (\d, \.) out of Python's own
    # string-escape processing.
    name_re = re.compile(r'<span class="title">([^&<]+)</span>')
    link_re = re.compile(r'<a href="([^"]+)" class="">')
    score_re = re.compile(
        r' <span class="rating_num" property="v:average">(\d+\.\d)</span>')
    raters_re = re.compile(r'<span>(\d+)人评价</span>')
    member_re = re.compile(r'.+<br>')
    info_re = re.compile(r'\d{4}.+;[\u4e00-\u9fa5]+.+[\u4e00-\u9fa5]+')
    member_sep = re.compile('[::&<]')
    info_sep = re.compile('[:;&]')

    raw_names = name_re.findall(html)
    raw_links = link_re.findall(html)
    raw_scores = score_re.findall(html)
    raw_raters = raters_re.findall(html)
    raw_members = member_re.findall(html)
    raw_infos = info_re.findall(html)

    try:
        for i, name in enumerate(raw_names):
            # Split each composite line once and pick the pieces by position.
            member_parts = member_sep.split(raw_members[i])
            info_parts = info_sep.split(raw_infos[i])
            datas.append([
                name,
                raw_links[i],
                raw_scores[i],
                raw_raters[i],
                member_parts[1].strip(),    # text after the first label
                member_parts[-2].strip(),   # text before the trailing <br>
                info_parts[0],
                info_parts[4],
                info_parts[-1],
            ])
    except IndexError:
        # One of the field lists is shorter than the title list, or a line
        # did not split into the expected number of pieces.
        print("Could not extract every field for record {}; "
              "skipping the rest of this page.".format(i + 1))

def saveData(datas, path):
    """Write the scraped records to an .xls workbook.

    The sheet has a header row followed by one row per record; the first
    column holds the 1-based rank (the record's position in datas) and the
    remaining columns hold the record's fields in order.

    Args:
        datas: List of records, each a sequence of up to nine field values.
        path: Destination file path for the workbook.
    """
    book = xlwt.Workbook()
    sheet = book.add_sheet("豆瓣Top250")

    # Widen the columns that hold long text (name, link, people, area).
    for col, width in ((1, 6000), (2, 10000), (5, 6000), (6, 6000), (8, 6000)):
        sheet.col(col).width = width

    headers = ["排名", "电影名", "链接", "评分", "评分人数",
               "导演", "主演", "年份", "地区", "类型"]
    for col, title in enumerate(headers):
        sheet.write(0, col, title)

    # Iterate over the records actually collected instead of assuming 250,
    # so a partial crawl still produces a file rather than an IndexError.
    field_count = len(headers) - 1
    for rank, record in enumerate(datas, start=1):
        sheet.write(rank, 0, rank)
        for col, value in enumerate(record[:field_count], start=1):
            sheet.write(rank, col, value)

    book.save(path)
    
    

if __name__ == "__main__":
    # The list is paginated 25 movies per page via the "start" offset.
    start_url = "https://movie.douban.com/top250?start="
    path = "C:/Users/Young/Desktop/豆瓣Top250.xls"
    datas = []
    for i in range(10):
        url = start_url + str(i * 25)
        html = getHtml(url)
        if not html:
            # getHtml yields no text when the download fails; skip the page
            # instead of handing a non-string to the parser.
            print("Skipping page {}: no content received.".format(i + 1))
            continue
        getData(datas, html)
    if datas:
        saveData(datas, path)
        print("豆瓣Top250电影榜单已制作完成!")
    else:
        print("No movie data was collected; nothing was saved.")

将爬取的结果打印至表格中,结果如下:

[图片:爬取结果导出到 Excel 表格后的截图]

  • 5
    点赞
  • 32
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 6
    评论
评论 6
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

IC_Young

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值