爬取猫眼top100并存入csv文件中

使用python爬虫爬取猫眼电影top100

import requests
import re
import csv

# Crawl the Maoyan Top-100 movie chart and save it to a CSV file.
# List pages: https://www.maoyan.com/board/4?offset=0,10,...,90 (10 movies per
# page); each movie's detail page supplies genre, director, runtime and income.

HEADERS = {
  'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3877.400 QQBrowser/10.8.4533.400',
  'cookie': '_lxsdk_cuid=17d0f62debfc8-09e4fac62d84aa-3354417a-1fa400-17d0f62dec0c8; uuid_n_v=v1; uuid=582D8EA067CD11EC87FF41B58677579835E32572E21D4849B2F79FE6371BAFB4; _csrf=3be8f03db07ebdc616f0ab10704dca9c93c34a68200c6c288766bfb601fa768c; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1640689219; _lxsdk=582D8EA067CD11EC87FF41B58677579835E32572E21D4849B2F79FE6371BAFB4; uid=747632158; uid.sig=GeqQIL9qRi6J9Wrytom7Z5gTRMQ; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1640695325; __mta=256733492.1636640677618.1640695323364.1640695325938.25; _lxsdk_s=17e00f5674b-4f9-d75-f66%7C%7C30'
}

BASE_URL = 'https://www.maoyan.com'

# Shown when Maoyan redirects us to its verification page.
BLOCK_MSG = 'Sorry, you have encountered the anti-crawl verification mechanism of maoyan eye website! Please hold down the Ctrl key to click on the above url for verification, re-run the program!'

# One <dd> per movie on a list page, capturing:
# (rank, detail href, title, stars, release time, score integer, score fraction)
LIST_RE = re.compile(
  r'<dd>.*?board-index.*?>(\d+)<.*?href="(.*?)".*?title="(.*?)"'
  r'.*?star">[\s]*(.*?)[\n][\s]*</p>.*?'
  r'releasetime">(.*?)</p>.*?integer">(.*?)</i>.*?'
  r'fraction">(.*?)</i>.*?</dd>', re.S)

# Detail-page patterns, compiled once at module level instead of on every
# movie (the original recompiled all four inside the inner loop).
TYPE_RE = re.compile(r'<a.*?class="text-link.*?target="_blank">(.*?)</a>', re.S)
DURATION_RE = re.compile(r'<ul>.*?class="ellipsis">.*?class="ellipsis">(.*?)</li>', re.S)
DIRECTOR_RE = re.compile(r'导演.*?<a.*?target="_blank".*?class="name">(.*?)</a>', re.S)
INCOME_RE = re.compile(r'film-mbox.*?film-mbox-item.*?"mbox-name ".*?"mbox-name ">(.*?)</div>', re.S)


def parse_detail(html_data):
  """Parse one movie detail page.

  Returns a 4-tuple ``(film_type, director, duration, income)``:
    film_type -- all genre tags concatenated, last tag first (matches the
                 original prepend-accumulation order)
    director  -- first director's name, stripped; '' when not found
                 (the original raised IndexError on a non-matching page)
    duration  -- last 5 characters of the runtime line, e.g. '125分钟';
                 '' when not found
    income    -- tuple of cumulative box-office strings (may be empty)
  """
  genres = TYPE_RE.findall(html_data)
  film_type = ''.join(reversed(genres))

  durations = DURATION_RE.findall(html_data)
  duration = durations[0].strip()[-5:] if durations else ''

  directors = DIRECTOR_RE.findall(html_data)
  director = directors[0].strip() if directors else ''

  income = tuple(INCOME_RE.findall(html_data))
  return film_type, director, duration, income


def build_row(item, film_type, director, duration, income):
  """Assemble one CSV row from a list-page match plus detail-page fields."""
  return [
    item[0],            # rank
    item[2],            # title
    film_type,
    director,
    item[3][3:],        # drop the '主演:' prefix (3 chars)
    item[4][5:],        # drop the '上映时间:' prefix (5 chars)
    duration,
    item[5] + item[6],  # score = integer part + fraction part
  ] + list(income)


def write_csv(rows):
  """Write collected rows to ./猫眼top100.csv (utf-8-sig so Excel opens it)."""
  with open('./猫眼top100.csv', 'w', newline='', encoding='utf-8-sig') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['排序number', '片名title', '类型type', '导演director', '主演actors',
                     '上映时间date', '片长duration', '评分rating', '累计收入income(万元)'])
    writer.writerows(rows)


def main():
  """Crawl all ten list pages, enrich each movie from its detail page, save CSV."""
  rows = []
  blocked = False
  for page in range(10):
    # Page N lives at /board/4?offset=10*N.
    url = BASE_URL + '/board/4?offset=' + str(page * 10)
    resp = requests.get(url=url, headers=HEADERS)
    # A redirect away from the requested URL means we hit the verification page.
    if resp.url != url:
      print('\n')
      print(resp.url)
      print('\n' + BLOCK_MSG)
      break

    for item in LIST_RE.findall(resp.text):
      detail_url = BASE_URL + item[1]
      detail_resp = requests.get(url=detail_url, headers=HEADERS)
      if detail_resp.url != detail_url:
        print(detail_resp.url)
        print(BLOCK_MSG)
        blocked = True
        break
      film_type, director, duration, income = parse_detail(detail_resp.text)
      rows.append(build_row(item, film_type, director, duration, income))

    if blocked:
      # Bug fix: the original only broke out of the inner loop here and kept
      # requesting further pages after being blocked; stop crawling entirely
      # (partial results are still written below, as before).
      break

  write_csv(rows)
  print('\n\n' + '程序运行完毕,猫眼top100.csv文件存放在Python默认文件夹内。')


if __name__ == "__main__":
  main()

如果爬不下来,可以尝试更换一下cookie,然后再进行爬取

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值