Python requests + XPath: Scraping Douban Top 250 Info and Images


Environment: PyCharm 2019
Libraries: requests, lxml
I'll keep the comments fairly brief.

Step 1: Fetch the target page's HTML

Look at a sample page URL:

https://movie.douban.com/top250?start=25&filter=

Observing the site, each page's URL is constructed as

url = 'https://movie.douban.com/top250?start=' + str(i) + '&filter='

The first page has start = 0, the second has start = 25, and so on in steps of 25. Having found this pattern, the basic plan is to loop over the start values, fetch each page's HTML, and then extract the text we need.
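If you want to sanity-check the pattern before writing any scraping code, a quick throwaway loop (my own sketch, not part of the scraper) prints all ten page URLs:

for i in range(0, 250, 25):
    # start = 0, 25, 50, ..., 225 -> pages 1 through 10
    print('https://movie.douban.com/top250?start=' + str(i) + '&filter=')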

def get_text(i):
    # build the page URL from the start offset and request it
    url = 'https://movie.douban.com/top250?start=' + str(i) + '&filter='
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    response = requests.get(url=url, headers=headers)
    html = response.text
    return html
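As a quick sanity check (a usage sketch of my own, not in the original post), fetch the first page and confirm that real HTML came back:

html = get_text(0)              # first page, start = 0
print(len(html))                # a real page should be tens of kilobytes
print('<html' in html.lower())  # crude check that we got an HTML document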


Step 2: Extracting the movie info and images

Inspect the page source to find where each piece of text lives, then parse the HTML with lxml and pull every field out with XPath:




def get_message(html):
    html = etree.HTML(html)
    name = html.xpath('//div[@class="hd"]/a/span[1]/text()')
    # all movie titles on this page
    infos = html.xpath('//div[@class="bd"]/p[1]//text()')
    # director + cast, and year + genre, as alternating text nodes
    roles = [j for i, j in enumerate(infos) if i % 2 == 0]   # even nodes: director/cast
    genres = [j for i, j in enumerate(infos) if i % 2 != 0]  # odd nodes: year/genre
    score = html.xpath('//div[@class="star"]/span[2]/text()')
    quote = html.xpath('//div[@class="bd"]/p[2]/span//text()')
    item = zip(name, roles, genres, score, quote)
    # bundle every field for this page into item
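To see what the even/odd split is doing, here is a hypothetical infos list for two movies (fabricated sample text, shaped like the real text nodes: one director/cast node and one year/genre node per film, alternating):

infos = [
    '\n  导演: 弗兰克·德拉邦特 主演: 蒂姆·罗宾斯 ...',      # index 0: director/cast
    '1994\xa0/\xa0美国\xa0/\xa0犯罪 剧情\n',               # index 1: year/genre
    '\n  导演: 陈凯歌 主演: 张国荣 ...',                    # index 2: director/cast
    '1993\xa0/\xa0中国大陆 中国香港\xa0/\xa0剧情 爱情\n',   # index 3: year/genre
]
roles = [j for i, j in enumerate(infos) if i % 2 == 0]   # the two director/cast strings
genres = [j for i, j in enumerate(infos) if i % 2 != 0]  # the two year/genre strings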

Next, the images. The pictures themselves are not embedded in the page, only their URLs are, so we first collect those URLs and then download each file separately.

def get_picture(html):
    # parse the HTML and return the poster URLs plus the matching titles
    html = etree.HTML(html)
    picture = html.xpath('//div[@class="pic"]/a/img/@src')
    # the poster image URLs
    image_name = html.xpath('//div[@class="hd"]/a/span[1]/text()')  # file names for the images
    return picture, image_name
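Each URL in picture points at a plain jpg, so downloading one is just a binary GET. A minimal sketch with a made-up poster URL (substitute a real one from picture):

import requests

url = 'https://img1.doubanio.com/example-poster.jpg'  # hypothetical URL
resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
with open('poster.jpg', 'wb') as f:
    f.write(resp.content)  # response.content holds the raw image bytes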

Step 3: Downloading the movie info and the images

Downloading the movie info:

def down_land(data, fmt):
    if fmt == 'txt':
        with open('1.txt', 'a', encoding='utf-8') as f:
            for item in data:
                f.write('Title: ' + str(item[0].replace(' ', '')) + '\n')
                f.write(str(re.sub(r'[\'\xa0\s ]*', '', item[1])) + '\n')
                # regex strips quotes, \xa0 and whitespace
                f.write('Year/genre: ' + str(item[2].replace(' ', '').replace('\n', '')) + '\n')  # drop spaces and newlines
                f.write('Score: ' + str(item[3]) + '\n')
                f.write('Quote: ' + str(item[4].replace(' ', '')) + '\n')
    elif fmt == 'csv':
        with open('11.csv', 'a', encoding='utf-8-sig', newline='') as f:  # utf-8-sig so the CSV is not garbled in Excel
            writer = csv.writer(f)
            for item in data:
                writer.writerow(item)
    else:
        print('unknown download type:', fmt)
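Wiring the pieces together for a single page (a usage sketch, assuming the functions above are defined):

data = get_message(get_text(0))  # scrape page 1
down_land(data, 'csv')           # or 'txt'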

Downloading the images:

def downland_picture(i, picture, image_name):
    localPath = 'E:/python-project/film_and_picture/jpg2/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    x = 0
    for url in picture:
        response = requests.get(url=url, headers=headers)
        # codecs.open keeps Chinese characters intact in the file name
        with codecs.open(localPath + '%d%s.jpg' % ((i + x + 1), image_name[x]), 'wb') as f:
            f.write(response.content)
        print('downloading image %d' % (i + x + 1))
        x = x + 1
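One caveat worth adding: if a title ever contains a character Windows forbids in file names (\ / : * ? " < > |), the write will fail. A small sanitizer (my own addition; safe_name is a hypothetical helper) guards against that:

import re

def safe_name(name):
    # drop characters that are illegal in Windows file names
    return re.sub(r'[\\/:*?"<>|]', '', name)

# then build the path with safe_name(image_name[x]) instead of image_name[x]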

Results

Movie info (screenshot omitted). The '...' fragments can of course be deleted if you find them unsightly, but I think the output may read better with them kept.
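If you do want them gone (my reading: the literal '...' Douban uses to truncate long cast lists), one extra replace inside down_land's txt branch does it; a hypothetical tweak:

f.write(str(re.sub(r'[\'\xa0\s ]*', '', item[1])).replace('...', '') + '\n')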
Images (screenshot omitted)

Full source

import requests
import re
import csv
import codecs  # codecs.open keeps Chinese characters intact in image file names
from lxml import etree

def get_text(i):
    # fetch one page of the Top 250 list
    url = 'https://movie.douban.com/top250?start=' + str(i) + '&filter='
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    response = requests.get(url=url, headers=headers)
    html = response.text
    return html

def get_message(html):
    # parse the page and pull every field out with XPath
    html = etree.HTML(html)
    name = html.xpath('//div[@class="hd"]/a/span[1]/text()')
    infos = html.xpath('//div[@class="bd"]/p[1]//text()')
    roles = [j for i, j in enumerate(infos) if i % 2 == 0]   # even nodes: director/cast
    genres = [j for i, j in enumerate(infos) if i % 2 != 0]  # odd nodes: year/genre
    score = html.xpath('//div[@class="star"]/span[2]/text()')
    quote = html.xpath('//div[@class="bd"]/p[2]/span//text()')
    item = zip(name, roles, genres, score, quote)
    return item

def down_land(data, fmt):
    # save the scraped fields as txt or csv
    if fmt == 'txt':
        with open('1.txt', 'a', encoding='utf-8') as f:
            for item in data:
                f.write('Title: ' + str(item[0].replace(' ', '')) + '\n')
                f.write(str(re.sub(r'[\'\xa0\s ]*', '', item[1])) + '\n')  # strip quotes, \xa0 and whitespace
                f.write('Year/genre: ' + str(item[2].replace(' ', '').replace('\n', '')) + '\n')
                f.write('Score: ' + str(item[3]) + '\n')
                f.write('Quote: ' + str(item[4].replace(' ', '')) + '\n')
    elif fmt == 'csv':
        with open('11.csv', 'a', encoding='utf-8-sig', newline='') as f:  # utf-8-sig avoids mojibake in Excel
            writer = csv.writer(f)
            for item in data:
                writer.writerow(item)
    else:
        print('unknown download type:', fmt)

def crow(fmt):
    # crawl all ten pages and save the info
    for i in range(0, 250, 25):
        j = i // 25 + 1
        text = get_text(i)
        print('crawling page %d\n' % j)
        data = get_message(text)
        down_land(data, fmt)

def get_picture(html):
    # return the poster URLs and the matching titles
    html = etree.HTML(html)
    picture = html.xpath('//div[@class="pic"]/a/img/@src')
    image_name = html.xpath('//div[@class="hd"]/a/span[1]/text()')
    return picture, image_name

def downland_picture(i, picture, image_name):
    localPath = 'E:/python-project/film_and_picture/jpg2/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    x = 0
    for url in picture:
        response = requests.get(url=url, headers=headers)
        with codecs.open(localPath + '%d%s.jpg' % ((i + x + 1), image_name[x]), 'wb') as f:
            f.write(response.content)
        print('downloading image %d' % (i + x + 1))
        x = x + 1

if __name__ == '__main__':
    number = int(input('choose 1 to download the movie info, 2 to download the posters\n'))
    if number == 2:
        for i in range(0, 250, 25):
            text = get_text(i)
            j = i // 25 + 1
            picture, image_name = get_picture(text)
            print('downloading page %d' % j)
            downland_picture(i, picture, image_name)
    elif number == 1:
        fmt = input('which format do you want, txt or csv?\n')
        crow(fmt)
