爬取猫眼top100并存入csv文件中

使用python爬虫爬取猫眼电影top100

import requests
import re
import csv

# Crawl the Maoyan Top-100 movie chart and save it to a CSV file.
# List pages: https://www.maoyan.com/board/4?offset=0,10,...,90 (10 movies per
# page); each movie's detail page supplies genre, director, runtime and income.

HEADERS = {
  'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3877.400 QQBrowser/10.8.4533.400',
  'cookie': '_lxsdk_cuid=17d0f62debfc8-09e4fac62d84aa-3354417a-1fa400-17d0f62dec0c8; uuid_n_v=v1; uuid=582D8EA067CD11EC87FF41B58677579835E32572E21D4849B2F79FE6371BAFB4; _csrf=3be8f03db07ebdc616f0ab10704dca9c93c34a68200c6c288766bfb601fa768c; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1640689219; _lxsdk=582D8EA067CD11EC87FF41B58677579835E32572E21D4849B2F79FE6371BAFB4; uid=747632158; uid.sig=GeqQIL9qRi6J9Wrytom7Z5gTRMQ; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1640695325; __mta=256733492.1636640677618.1640695323364.1640695325938.25; _lxsdk_s=17e00f5674b-4f9-d75-f66%7C%7C30'
}

BASE_URL = 'https://www.maoyan.com'

# Shown when Maoyan redirects us to its verification page.
BLOCK_MSG = 'Sorry, you have encountered the anti-crawl verification mechanism of maoyan eye website! Please hold down the Ctrl key to click on the above url for verification, re-run the program!'

# One <dd> per movie on a list page, capturing:
# (rank, detail href, title, stars, release time, score integer, score fraction)
LIST_RE = re.compile(
  r'<dd>.*?board-index.*?>(\d+)<.*?href="(.*?)".*?title="(.*?)"'
  r'.*?star">[\s]*(.*?)[\n][\s]*</p>.*?'
  r'releasetime">(.*?)</p>.*?integer">(.*?)</i>.*?'
  r'fraction">(.*?)</i>.*?</dd>', re.S)

# Detail-page patterns, compiled once at module level instead of on every
# movie (the original recompiled all four inside the inner loop).
TYPE_RE = re.compile(r'<a.*?class="text-link.*?target="_blank">(.*?)</a>', re.S)
DURATION_RE = re.compile(r'<ul>.*?class="ellipsis">.*?class="ellipsis">(.*?)</li>', re.S)
DIRECTOR_RE = re.compile(r'导演.*?<a.*?target="_blank".*?class="name">(.*?)</a>', re.S)
INCOME_RE = re.compile(r'film-mbox.*?film-mbox-item.*?"mbox-name ".*?"mbox-name ">(.*?)</div>', re.S)


def parse_detail(html_data):
  """Parse one movie detail page.

  Returns a 4-tuple ``(film_type, director, duration, income)``:
    film_type -- all genre tags concatenated, last tag first (matches the
                 original prepend-accumulation order)
    director  -- first director's name, stripped; '' when not found
                 (the original raised IndexError on a non-matching page)
    duration  -- last 5 characters of the runtime line, e.g. '125分钟';
                 '' when not found
    income    -- tuple of cumulative box-office strings (may be empty)
  """
  genres = TYPE_RE.findall(html_data)
  film_type = ''.join(reversed(genres))

  durations = DURATION_RE.findall(html_data)
  duration = durations[0].strip()[-5:] if durations else ''

  directors = DIRECTOR_RE.findall(html_data)
  director = directors[0].strip() if directors else ''

  income = tuple(INCOME_RE.findall(html_data))
  return film_type, director, duration, income


def build_row(item, film_type, director, duration, income):
  """Assemble one CSV row from a list-page match plus detail-page fields."""
  return [
    item[0],            # rank
    item[2],            # title
    film_type,
    director,
    item[3][3:],        # drop the '主演:' prefix (3 chars)
    item[4][5:],        # drop the '上映时间:' prefix (5 chars)
    duration,
    item[5] + item[6],  # score = integer part + fraction part
  ] + list(income)


def write_csv(rows):
  """Write collected rows to ./猫眼top100.csv (utf-8-sig so Excel opens it)."""
  with open('./猫眼top100.csv', 'w', newline='', encoding='utf-8-sig') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['排序number', '片名title', '类型type', '导演director', '主演actors',
                     '上映时间date', '片长duration', '评分rating', '累计收入income(万元)'])
    writer.writerows(rows)


def main():
  """Crawl all ten list pages, enrich each movie from its detail page, save CSV."""
  rows = []
  blocked = False
  for page in range(10):
    # Page N lives at /board/4?offset=10*N.
    url = BASE_URL + '/board/4?offset=' + str(page * 10)
    resp = requests.get(url=url, headers=HEADERS)
    # A redirect away from the requested URL means we hit the verification page.
    if resp.url != url:
      print('\n')
      print(resp.url)
      print('\n' + BLOCK_MSG)
      break

    for item in LIST_RE.findall(resp.text):
      detail_url = BASE_URL + item[1]
      detail_resp = requests.get(url=detail_url, headers=HEADERS)
      if detail_resp.url != detail_url:
        print(detail_resp.url)
        print(BLOCK_MSG)
        blocked = True
        break
      film_type, director, duration, income = parse_detail(detail_resp.text)
      rows.append(build_row(item, film_type, director, duration, income))

    if blocked:
      # Bug fix: the original only broke out of the inner loop here and kept
      # requesting further pages after being blocked; stop crawling entirely
      # (partial results are still written below, as before).
      break

  write_csv(rows)
  print('\n\n' + '程序运行完毕,猫眼top100.csv文件存放在Python默认文件夹内。')


if __name__ == "__main__":
  main()

如果爬不下来,可以尝试更换一下cookie,然后再进行爬取

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值