Python爬虫

猫眼电影榜单的爬取

from bs4 import BeautifulSoup
import re  # 正则表达式,进行文字匹配
import urllib.request, urllib.error  # 制定url ,获取网页数据
import xlwt  # 进行excel操作


def main():
    """Scrape the Maoyan board and store the result in an .xls workbook."""
    board_url = "https://maoyan.com/board/4?offset="
    output_path = "猫眼电影.xls"
    movies = getdata(board_url)
    savedate(movies, output_path)


# Patterns used to pull the individual fields out of one board entry's HTML.
# Every capture is non-greedy (and the class suffix is limited to non-quote
# characters) so a match can never run past its own closing tag when several
# tags share a line.
rank = re.compile(r'<i class="board-index board-index-[^"]*">(.*?)</i>')  # ranking number
title = re.compile(r'<img alt="(.*?)" class="board-img"')  # movie title (poster alt text)
actor = re.compile(r'<p class="star">(.*?)</p>', re.S)  # cast line; re.S because it spans lines
time = re.compile(r'<p class="releasetime">(.*?)</p>')  # release date line
rate1 = re.compile(r'<i class="integer">(.*?)</i>')  # integer part of the score
rate2 = re.compile(r'<i class="fraction">(.*?)</i>')  # fractional part of the score

def _fetch_html(url):
    """Download *url* with a browser-like User-Agent and return it decoded as UTF-8."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363"}
    request = urllib.request.Request(url=url, headers=headers)
    # The context manager closes the connection even if reading or decoding fails.
    with urllib.request.urlopen(request) as response:
        return response.read().decode("utf_8")


def _first_match(pattern, text):
    """Return the first match of *pattern* in *text*, or "" when there is none."""
    found = pattern.findall(text)
    return found[0] if found else ""


def _parse_movie(item_html):
    """Convert the HTML of one <dd> entry into [rank, title, actors, release time, score]."""
    # Spaces and newlines are removed from the cast line, which spans several lines.
    actors = _first_match(actor, item_html).replace(" ", "").replace("\n", "")
    # Both halves of the score must come from this entry; searching the whole
    # page would give every movie the fraction of the first one.
    score = _first_match(rate1, item_html) + _first_match(rate2, item_html)
    return [
        _first_match(rank, item_html),
        _first_match(title, item_html),
        actors,
        _first_match(time, item_html),
        score,
    ]


def getdata(baseurl):
    """Scrape ten board pages and return one row per movie.

    Args:
        baseurl: Board URL ending in ``offset=``; the offsets 0, 10, ..., 90
            are appended to it to address the individual pages.

    Returns:
        A list of rows ``[rank, title, actors, release time, score]`` in page
        order. All values are strings; a field that cannot be found in an
        entry is stored as an empty string.

    Raises:
        urllib.error.URLError: If a page cannot be fetched.
    """
    movies = []
    for page in range(10):
        html = _fetch_html(baseurl + str(page * 10))
        soup = BeautifulSoup(html, "html.parser")
        # Each <dd> element is parsed as one movie entry.
        for item in soup.find_all("dd"):
            movies.append(_parse_movie(str(item)))
    return movies


def savedate(list, savepath):
    """Write the scraped rows to an .xls workbook.

    Args:
        list: Sequence of rows; the first five values of each row are written
            under the header columns. (The parameter name shadows the builtin
            but is kept so existing callers continue to work.)
        savepath: Destination path handed to the workbook's ``save`` method.
    """
    book = xlwt.Workbook(encoding="utf-8")  # create the workbook
    sheet = book.add_sheet("猫眼电影", cell_overwrite_ok=True)  # create the worksheet
    col = ('排名', '片名', '演员', '发布时间', '评分')

    # Header row, written exactly once.
    for j, heading in enumerate(col):
        sheet.write(0, j, heading)

    # One sheet row per scraped row, for however many rows were actually
    # collected (a fixed count of 100 fails when the scrape returns fewer).
    for i, data in enumerate(list, start=1):
        for j, value in enumerate(data[:len(col)]):
            sheet.write(i, j, value)

    book.save(savepath)  # write the file to disk



# Run the scraper only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
  • 2
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值