Scraping Douban Movie Data

When scraping Douban, it's best to go through proxies; otherwise, if your IP gets banned, don't say you weren't warned. (A short proxy health-check sketch follows the main listing below.)

Straight to the code:

# -*- coding: utf-8 -*-

import requests, json, time, re, csv, random
from lxml import etree

"""
__Author__: luoshen
__Date: 2020-06-19
"""


class Spider(object):
    header = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 '
                      'Safari/537.36'
    }
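    # Note: a single fixed User-Agent is easy to fingerprint; rotating a few
    # UA strings here (like the proxies below) would make bans less likely.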

    # Fill in your own proxy endpoints here; they are intentionally left blank.
    proxy = [
        {"proxy": "", "proxy_scheme": "http"},
        {"proxy": "", "proxy_scheme": "http"},
        {"proxy": "", "proxy_scheme": "http"},
        {"proxy": "", "proxy_scheme": "http"},
        {"proxy": "", "proxy_scheme": "https"},
        {"proxy": "", "proxy_scheme": "http"},
        {"proxy": "", "proxy_scheme": "http"},
        {"proxy": "", "proxy_scheme": "http"},
        {"proxy": "", "proxy_scheme": "http"}
    ]

    scraped_urls = set()  # detail-page URLs already collected, to avoid duplicates

    csv_header = ['title', 'runtime', 'ReleaseDate', 'attrs', 'stars', 'commits', 'type', 'actors', 'state', 'language']
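    # Note: the column order above must match the order in which
    # parse_movie_page() appends fields to each row.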

    # def __init__(self):
    #     # Uncomment to write the CSV header once before crawling.
    #     with open('douban.csv', 'w', newline='', encoding='utf-8') as f:
    #         f_csv = csv.writer(f)
    #         f_csv.writerow(self.csv_header)
    def get_content(self, url, retries=3):
        # Fetch a page through a randomly chosen proxy; retry a few times
        # and return None once the retries are exhausted (the original
        # recursed without a return statement or a recursion limit).
        time.sleep(1)  # throttle requests a little
        if retries <= 0:
            return None
        try:
            item = random.choice(self.proxy)
            proxy = {
                item['proxy_scheme']: item['proxy']
            }
            req = requests.get(url, headers=self.header, proxies=proxy, timeout=2)
            if req.status_code == 200:
                return req.text
            return self.get_content(url, retries - 1)
        except requests.RequestException:
            return self.get_content(url, retries - 1)

    def json_to_dict(self, content):
        content_dict = json.loads(content)
        return content_dict

    def extract_movie_urls(self, content_dict):
        # Collect detail-page URLs from the search API response, skipping
        # (and remembering) any URL that has already been scraped.
        urllist = list()
        for item in content_dict.get('data', []):
            if item['url'] not in self.scraped_urls:
                self.scraped_urls.add(item['url'])
                urllist.append(item['url'])
        return urllist

    def parse_movie_page(self, content):
        # Extract one CSV row of fields from a movie detail page.
        fields = list()
        html = etree.HTML(content)
        for item in html.xpath('//*[@id="content"]'):
            title = item.xpath('./h1/span[1]/text()')  # title
            fields.append(''.join(title))
        for item in html.xpath('//div[@id="info"]'):
            runtime = item.xpath('./span[@property="v:runtime"]/text()')  # runtime
            release_date = item.xpath('./span[@property="v:initialReleaseDate"]/text()')  # release date
            attrs = item.xpath('./span[1]/span[@class="attrs"]/a/text()')  # directors
            fields.append(''.join(runtime))
            fields.append(','.join(release_date))
            fields.append(''.join(attrs))
        stars = re.findall('property="v:average">(.*?)</strong>', content)  # rating
        state = re.findall('<span class="pl">制片国家/地区:</span> (.*?)<br/>', content)  # country/region
        actors = re.findall('rel="v:starring">(.*?)</a>', content)  # starring
        genres = re.findall('<span property="v:genre">(.*?)</span>', content)  # genre
        language = re.findall('<span class="pl">语言:</span> (.*?)<br/>', content)  # language
        commits = re.findall(r'>(全部 \d+ 条)</a>', content)  # comment count
        fields.append(''.join(stars))
        fields.append(commits[0] if commits else '')  # guard against a missing match
        fields.append(''.join(genres))
        fields.append(','.join(actors))
        fields.append(','.join(state))
        fields.append(','.join(language))
        print('----------------------------------------')
        for item in fields:
            print(item)
        return fields

    def write_to_csv(self, items):
        if items is None:
            return
        # newline='' avoids blank lines on Windows; utf-8 keeps Chinese intact.
        with open('douban.csv', 'a', newline='', encoding='utf-8') as f:
            csv.writer(f).writerow(items)
            print(items[0], 'written to csv')

    def run(self):
        # The search API pages in steps of 20; crawl the first 100 results.
        for i in range(0, 100, 20):
            url = 'https://movie.douban.com/j/new_search_subjects?sort=T&range=0,' \
                  '10&tags=%E7%94%B5%E5%BD%B1&start={}'.format(i)
            content = self.get_content(url)
            if content is None:
                continue
            content_dict = self.json_to_dict(content)
            urllist = self.extract_movie_urls(content_dict)
            for url in urllist:
                movie_page_content = self.get_content(url)
                if movie_page_content is None:
                    continue
                movie_content = self.parse_movie_page(movie_page_content)
                # self.write_to_csv(movie_content)  # uncomment to persist rows


if __name__ == '__main__':
    S = Spider()
    S.run()
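
One more thing on the proxy advice above: the entries in Spider.proxy are left blank, and dead proxies just burn retries. Here is a minimal health-check sketch, under the assumption that you have filled in your own endpoints; the httpbin.org test URL and the filter_live_proxies name are illustrative, not part of the original script.

import requests

def filter_live_proxies(proxies, test_url='https://httpbin.org/ip', timeout=3):
    # Keep only the proxies that can fetch test_url; entries use the same
    # {"proxy": ..., "proxy_scheme": ...} shape as Spider.proxy above.
    live = []
    for item in proxies:
        mapping = {item['proxy_scheme']: item['proxy']}
        try:
            if requests.get(test_url, proxies=mapping, timeout=timeout).status_code == 200:
                live.append(item)
        except requests.RequestException:
            pass  # unreachable proxy: drop it
    return live

# Hypothetical usage before crawling:
# Spider.proxy = filter_live_proxies(Spider.proxy)

And once the write_to_csv call in run() is uncommented, the output can be sanity-checked by reading the rows back with the same csv module; a quick sketch, assuming douban.csv was produced by the run above, with column positions following csv_header:

import csv

with open('douban.csv', newline='', encoding='utf-8') as f:
    for row in csv.reader(f):
        # Columns follow csv_header: title, runtime, ReleaseDate, attrs,
        # stars, commits, type, actors, state, language
        print(row[0], row[4])  # title and rating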

 
