lxml 爬取豆瓣top250

菜鸟奋战10小时,得到豆瓣top250

from time import sleep

import urllib3
import pandas as pd
urllib3.disable_warnings()
from lxml import etree


def getTree(url):
    """Fetch *url* over HTTP GET and return a parsed lxml HTML element tree.

    Sends browser-like request headers so the site does not reject the
    request as coming from a script.
    """
    send_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
        "Connection": "keep-alive",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8"
    }
    pool_manager = urllib3.PoolManager()
    # BUG FIX: headers must be passed via the ``headers`` keyword.  The third
    # positional argument of ``request()`` is ``fields`` (form/query data),
    # so the original call silently sent NO custom headers at all.
    res = pool_manager.request('GET', url, headers=send_headers)
    # Decode the response body (UTF-8 by default) and parse it.
    r = res.data.decode()
    return etree.HTML(r)


def getPageInfo(url_page):
    """Scrape one Top-250 result page and return parallel movie data.

    Returns a 5-tuple:
      names -- list of movie titles (alternate titles joined together)
      infos -- flat tuple of (director, year, country, genre) repeated once
               per movie; the caller de-interleaves it with [k::4] slices
      stars -- list of rating scores (strings)
      nums  -- list of vote-count strings
      qutos -- list of one-line quotes ('qutos' name kept for compatibility)

    Movies missing a field get a single space ' ' placeholder so all the
    collections stay aligned by index.
    """
    tree = getTree(url_page)
    ol = tree.xpath('//ol[@class="grid_view"]')[0]
    # BUG FIX: '//li' is an absolute path that searches the whole document;
    # './/li' restricts the search to the items inside this <ol>.
    lis = ol.xpath('.//li')

    names = []
    infos = ()
    stars = []
    nums = []
    qutos = []
    for li in lis:
        name = li.xpath('div[@class="item"]/div[@class="info"]/div[@class="hd"]/a/span/text()')
        names_final = ''.join(x.replace('\xa0/\xa0', '') for x in name)
        # Skip entries without a title (e.g. stray <li> nodes).
        if not names_final:
            continue

        p = li.xpath('div[@class="item"]/div[@class="info"]/div[@class="bd"]/p[@class]/text()')
        star = li.xpath(
            'div[@class="item"]/div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()')
        num = li.xpath('div[@class="item"]/div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[last()]/text()')
        quto = li.xpath('div[@class="item"]/div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span/text()')

        # Parse the description block: line 1 is the director/actor line,
        # line 2 looks like "year / country / genre".
        info = (' ', ' ', ' ', ' ')
        p_split = [e.split() for e in p]
        # ROBUSTNESS: the original indexed p_split[1] and split('/')[1]/[2]
        # unconditionally, which raises IndexError on malformed entries.
        if len(p_split) >= 2:
            director = ' '.join(p_split[0])
            parts = ' '.join(p_split[1]).split('/')
            year = parts[0].strip()
            country = parts[1].strip() if len(parts) > 1 else ' '
            genre = parts[2].strip() if len(parts) > 2 else ' '
            info = (director, year, country, genre)

        names.append(names_final)
        infos += info
        # Pad missing fields with ' ' so every list keeps one entry per movie.
        stars.extend(star if star else ' ')
        nums.extend(num if num else ' ')
        qutos.extend(quto if quto else ' ')

    return names, infos, stars, nums, qutos


def main(url_home):
    """Scrape every page of the Top-250 list and save the collected data.

    Writes 'douban.xls' and 'douban.csv' into the working directory.
    """
    pages_count = 250 // 25  # 25 movies per page -> 10 pages
    namess = []
    infoss = ()
    starss = []
    numss = []
    qutoss = []
    for i in range(pages_count):
        page_url = url_home + '?start=' + str(25 * i) + '&filter='
        # Pause 2s between requests to avoid hammering the server
        # (the original comment claimed 5s but the code sleeps 2s).
        sleep(2)
        names, infos, stars, nums, qutos = getPageInfo(page_url)

        namess.extend(names)
        infoss += infos
        starss.extend(stars)
        numss.extend(nums)
        qutoss.extend(qutos)

    # infoss is a flat (director, year, country, genre, director, ...)
    # tuple; de-interleave it into four parallel columns.
    actors = infoss[0::4]
    years = infoss[1::4]
    countries = infoss[2::4]
    types = infoss[3::4]

    data = {'namess': namess, 'actors': actors, 'years': years, 'countries': countries, 'types': types,
            'starss': starss, 'numss': numss, 'qutoss': qutoss}
    df = pd.DataFrame(data)
    # NOTE(review): writing legacy .xls requires the xlwt engine, which was
    # removed from modern pandas; '.xlsx' (openpyxl) is the supported path —
    # confirm the target environment before changing the filename.
    df.to_excel('douban.xls')
    df.to_csv('douban.csv')
    print('finished!')

# Base URL of the Douban Top-250 list; main() appends ?start=N page offsets.
url_home = "https://movie.douban.com/top250"

if __name__ == '__main__':
    main(url_home)
    # getPageInfo(url_home)


效果

（此处为运行结果截图，原文配图在网页版中显示）

多多指教

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值