Scraper 1: Crawling Douban movies by keyword and saving the results to Excel

https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=%E7%83%82%E7%89%87&start=20

Douban movie data is fairly easy to get hold of: the search page is backed by the JSON endpoint shown above.
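
As a quick sanity check, here is a minimal sketch of querying that endpoint directly, letting requests build the query string (the tags value is the keyword 烂片, which appears URL-encoded in the link above). The {"data": [...]} response shape and the "id" field are the same ones the crawler below relies on; other fields in each item are not shown here.

import requests

# Minimal sketch: query the JSON search endpoint directly.
# The parameters mirror the URL above; "tags" is the search keyword
# and "start" pages through results 20 at a time.
params = {"sort": "T", "range": "0,10", "tags": "烂片", "start": 0}
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
}
resp = requests.get(
    "https://movie.douban.com/j/new_search_subjects",
    params=params,
    headers=headers,
)
for item in resp.json()["data"]:  # payload shape: {"data": [...]}
    print(item["id"])             # each item carries the movie's subject id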

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
    "Cookie": 'll="118245"; bid=ARtOOxDRteM; __utma=30149280.1180608368.1569232683.1569232683.1569232683.1; __utmc=30149280; __utmz=30149280.1569232683.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmt=1; acw_tc=2760828115692326854383671ef4946949a1588e4e0476180bf57b1e5f944d; ap_v=0,6.0; __utma=223695111.1224307206.1569232715.1569232715.1569232715.1; __utmb=223695111.0.10.1569232715; __utmc=223695111; __utmz=223695111.1569232715.1.1.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/passport/login; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1569232715%2C%22https%3A%2F%2Faccounts.douban.com%2Fpassport%2Flogin%3Fredir%3Dhttps%253A%252F%252Fmovie.douban.com%252F%22%5D; _pk_ses.100001.4cf6=*; push_noty_num=0; push_doumail_num=0; __yadk_uid=jOpNK0YqTHKGC26G4EsFha6rAzJzvStK; _vwo_uuid_v2=D2DBA0AE8FBA3AA58155A7BB3CF5E42B0|0fc9726cdfeda577fffd4d9b62d1989e; dbcl2="199182842:8rksCNklcW8"; ck=Taef; __utmv=30149280.19918; _pk_id.100001.4cf6=62084c1c85a029b2.1569232715.1.1569233089.1569232715.; __utmb=30149280.17.10.1569232683'
}

First we define the request headers so the site treats our requests as coming from a normal browser rather than a script.
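
A quick way to see that the User-Agent matters is to compare the status codes with and without it (a small sketch; the exact code Douban returns for bare requests may vary):

import requests

url = "https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=%E7%83%82%E7%89%87&start=0"

# Without a browser User-Agent Douban usually rejects the request;
# with one it generally returns 200.
print(requests.get(url).status_code)
print(requests.get(url, headers={"User-Agent": "Mozilla/5.0"}).status_code)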

content = requests.get(url, headers=headers)
content_json = json.loads(content.text)["data"]
for one_info in content_json:
    one_id = one_info["id"]
    print(one_id)
    url2 = "https://movie.douban.com/subject/%s/" % one_id
    html = requests.get(url2, headers=headers)
    if html.status_code == 200:
        content = html.content.decode("utf-8")
        content = etree.HTML(content)
        save_info(content)
    time.sleep(1)

Send the requests and pull back the HTML of each movie's detail page.
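
The requests in this loop have no timeout, so one stalled connection can hang the whole crawl. A hypothetical fetch helper (not part of the original code) with a timeout and a simple retry might look like this:

import time
import requests

def fetch(url, headers, retries=3, timeout=10):
    # Hypothetical helper: GET with a timeout and exponential backoff,
    # returning None if every attempt fails.
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            if resp.status_code == 200:
                return resp
        except requests.RequestException:
            pass
        time.sleep(2 ** attempt)  # back off before retrying
    return None

The loop body could then call fetch(url2, headers) and simply skip a page when it returns None.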

info = content.xpath("//div[@id='info']")[0]
try:
    name = str(content.xpath('//*[@id="content"]/h1/span[1]/text()')[0]).replace("'", " ")
except IndexError:
    name = "无"  # "无" marks a missing field
try:
    daoyan = str(info.xpath("./span[1]/span[2]/a/text()")[0]).replace("'", " ")
except IndexError:
    daoyan = "无"
try:
    bianju = str(info.xpath("./span[2]/span[2]/a/text()")[0]).replace("'", " ")
except IndexError:
    bianju = "无"
# '/'.join() returns "" on an empty match instead of raising,
# so the joined fields need an explicit fallback
zhuyan = '/'.join(info.xpath("./span[3]/span[2]/a/text()")).replace("'", " ") or "无"
leixing = '/'.join(info.xpath("./span[@property='v:genre']/text()")).replace("'", " ") or "无"
shangyingshijian = '/'.join(info.xpath(".//span[@property='v:initialReleaseDate']/text()")).replace("'", " ") or "无"
try:
    shichang = str(info.xpath(".//span[@property='v:runtime']/text()")[0]).replace("'", " ")
except IndexError:
    shichang = "无"
try:
    pingfen = str(content.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')[0]).replace("'", " ")
except IndexError:
    pingfen = "无"
try:
    jianjie = str(content.xpath('//*[@id="link-report"]/span[1]/text()')[0]).replace("'", " ")
except IndexError:
    jianjie = "无"
# tupian = str(content.xpath('//*[@id="mainpic"]/a/img/@src')[0]).replace("https://", "")
try:
    pingjiarenshu = content.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/div/div[2]/a/span/text()')[0]
except IndexError:
    pingjiarenshu = "无"
print("Title:", name)
print("Director:", daoyan)
print("Screenwriter:", bianju)
print("Cast:", zhuyan)
print("Rating:", pingfen)
print("Number of ratings:", pingjiarenshu)
print("Genre:", leixing)
print("Release date:", shangyingshijian)
print("Runtime:", shichang)
print("Synopsis:", jianjie)
# print("Image URL:", tupian)
one_info = [name, daoyan, bianju, zhuyan, pingfen, pingjiarenshu, leixing, shangyingshijian, shichang, jianjie]
all_list.append(one_info)

Extract each field from the page with XPath.
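
The repeated try/except blocks can be collapsed with a small helper. This is a sketch (first_or is not in the original code), shown on the director field:

def first_or(default, nodes):
    # Return the first XPath match as a cleaned string, or a default.
    return str(nodes[0]).replace("'", " ") if nodes else default

# Instead of the try/except around the director lookup:
daoyan = first_or("无", info.xpath("./span[1]/span[2]/a/text()"))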

def processing_data(content_list):
    # Create a workbook with UTF-8 encoding
    workbook = xlwt.Workbook(encoding='utf-8')
    # Add a worksheet
    worksheet = workbook.add_sheet('My Worksheet')
    # Write each movie as one row, one field per column
    for i, content in enumerate(content_list):
        for x, info in enumerate(content):
            worksheet.write(i, x, label=info)
    # Save the spreadsheet
    workbook.save('电影信息.xls')

Write the data to an Excel file.
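
Note that xlwt writes the legacy .xls format, which caps a sheet at 65,536 rows. If that ever becomes a limit, the same function is easy to express with openpyxl (a sketch, assuming openpyxl is installed), which writes the modern .xlsx format:

from openpyxl import Workbook

def processing_data_xlsx(content_list):
    # Same idea with openpyxl: one movie per row, one field per column.
    wb = Workbook()
    ws = wb.active
    for row in content_list:
        ws.append(row)  # append() writes one list as one row
    wb.save('电影信息.xlsx')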

Full code

import json
import time

import requests
import xlwt
from lxml import etree


def processing_data(content_list):
    # Create a workbook with UTF-8 encoding
    workbook = xlwt.Workbook(encoding='utf-8')
    # Add a worksheet
    worksheet = workbook.add_sheet('My Worksheet')
    # Write each movie as one row, one field per column
    for i, content in enumerate(content_list):
        for x, info in enumerate(content):
            worksheet.write(i, x, label=info)
    # Save the spreadsheet
    workbook.save('电影信息.xls')


def save_info(content):
    info = content.xpath("//div[@id='info']")[0]
    try:
        name = str(content.xpath('//*[@id="content"]/h1/span[1]/text()')[0]).replace("'", " ")
    except IndexError:
        name = "无"  # "无" marks a missing field
    try:
        daoyan = str(info.xpath("./span[1]/span[2]/a/text()")[0]).replace("'", " ")
    except IndexError:
        daoyan = "无"
    try:
        bianju = str(info.xpath("./span[2]/span[2]/a/text()")[0]).replace("'", " ")
    except IndexError:
        bianju = "无"
    # '/'.join() returns "" on an empty match instead of raising,
    # so the joined fields need an explicit fallback
    zhuyan = '/'.join(info.xpath("./span[3]/span[2]/a/text()")).replace("'", " ") or "无"
    leixing = '/'.join(info.xpath("./span[@property='v:genre']/text()")).replace("'", " ") or "无"
    shangyingshijian = '/'.join(info.xpath(".//span[@property='v:initialReleaseDate']/text()")).replace("'", " ") or "无"
    try:
        shichang = str(info.xpath(".//span[@property='v:runtime']/text()")[0]).replace("'", " ")
    except IndexError:
        shichang = "无"
    try:
        pingfen = str(content.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')[0]).replace("'", " ")
    except IndexError:
        pingfen = "无"
    try:
        jianjie = str(content.xpath('//*[@id="link-report"]/span[1]/text()')[0]).replace("'", " ")
    except IndexError:
        jianjie = "无"
    # tupian = str(content.xpath('//*[@id="mainpic"]/a/img/@src')[0]).replace("https://", "")
    try:
        pingjiarenshu = content.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/div/div[2]/a/span/text()')[0]
    except IndexError:
        pingjiarenshu = "无"
    print("Title:", name)
    print("Director:", daoyan)
    print("Screenwriter:", bianju)
    print("Cast:", zhuyan)
    print("Rating:", pingfen)
    print("Number of ratings:", pingjiarenshu)
    print("Genre:", leixing)
    print("Release date:", shangyingshijian)
    print("Runtime:", shichang)
    print("Synopsis:", jianjie)
    # print("Image URL:", tupian)
    one_info = [name, daoyan, bianju, zhuyan, pingfen, pingjiarenshu, leixing, shangyingshijian, shichang, jianjie]
    all_list.append(one_info)


def main():
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
    }
    try:
        for x in range(0, 60):
            url = ('https://movie.douban.com/j/new_search_subjects'
                   '?sort=T&range=0,10&tags=%E7%83%82%E7%89%87&start=' + str(x * 20))
            content = requests.get(url, headers=headers)
            content_json = json.loads(content.text)["data"]
            for one_info in content_json:
                one_id = one_info["id"]
                print(one_id)
                url2 = "https://movie.douban.com/subject/%s/" % one_id
                html = requests.get(url2, headers=headers)
                if html.status_code == 200:
                    content = html.content.decode("utf-8")
                    content = etree.HTML(content)
                    save_info(content)
                time.sleep(1)  # pause between requests to avoid getting blocked
    except Exception as e:
        # If the crawl is cut short (e.g. the site starts blocking us),
        # report the error; everything collected so far is saved below.
        print("Crawl aborted:", e)


if __name__ == '__main__':
    all_list = []
    main()
    processing_data(all_list)

I'm a student; you're welcome to get in touch to talk about Python.
QQ: 5834135
