Python requests+ xpath 爬取豆瓣top250信息以及图片

最新推荐文章于 2021-11-12 14:53:28 发布

明啊明啊明

最新推荐文章于 2021-11-12 14:53:28 发布

阅读量720

点赞数

分类专栏： python 文章标签： python爬虫 requests 豆瓣电影评论图片爬取

本文链接：https://blog.csdn.net/H__ello_world/article/details/102972622

版权

python 专栏收录该内容

3 篇文章 0 订阅

订阅专栏

Python requests+ xpath 爬取豆瓣top250信息以及图片

环境：Pycharm 2019
库 requests lxml
注释会写得简洁一点

第一步获取目标网页的文本

https://movie.douban.com/top250?start=25&filter=
观察目标网站的url构造是

url = 'https://movie.douban.com/top250?start=' + str(i) + '&filter='

第一页的 start = 0，第二页的 start = 25，这样我们就找到了页面 url 的规律。基本思路就是通过循环来获取每一页的网页信息，再从中爬取所需要的文本。

def get_text(i):
    """Build the request for one Top 250 list page and return its HTML text.

    Args:
        i: Value of the ``start`` query parameter, i.e. the offset of the
            first movie on the page (0 for page one, 25 for page two, ...).

    Returns:
        The response body as a string.

    Raises:
        requests.RequestException: If the request times out, cannot be sent,
            or the server answers with an HTTP error status.
    """
    url = 'https://movie.douban.com/top250?start=' + str(i) + '&filter='
    headers = {
        'USER-AGENT': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    # Without a timeout a stalled connection would block forever.
    response = requests.get(url=url, headers=headers, timeout=10)
    # Fail loudly instead of handing an error page to the parser as data.
    response.raise_for_status()
    return response.text
观察页面中所需属性、文本所在的位置

在这里插入图片描述

第二步，电影信息以及图片的获取

在这里插入图片描述



 html = etree.HTML(html)
    name = html.xpath('//div[@class="hd"]/a/span[1]/text()')
    # names of all the movies on the whole page
    infos = html.xpath('//div[@class="bd"]/p[1]//text()')
    # director plus cast, and year plus category
    # even-indexed entries of infos go to roles, odd-indexed entries to type
    roles = [j for i, j in enumerate(infos) if i % 2 == 0]
    type = [j for i, j in enumerate(infos) if i % 2 != 0]
    score = html.xpath('//div[@class="star"]/span[2]/text()')
    quote = html.xpath('//div[@class="bd"]/p[2]/span//text()')
    item = zip(name,roles,type,score,quote)
    # all the information of the page is kept in item

接下来就是图片的爬取，因为图片并不是直接保存在网页中，所以要通过url获取

	html = etree.HTML(html)
    picture = html.xpath('//div[@class="pic"]/a/img/@src')
    # find the URLs of the images
    image_name =html.xpath('//div[@class="hd"]/a/span[1]/text()')# image names
    return picture,image_name
    # the text is parsed as HTML
    # the picture URLs and the image names are returned

电影信息的下载以及图片的下载

电影信息的下载：

def down_land(data, type):
    """Append the scraped movie records to a text file or a CSV file.

    Args:
        data: Iterable of ``(name, roles, year/category, score, quote)``
            records; every field except the score must be a string.
        type: Output format. ``'txt'`` appends labelled lines to ``1.txt``,
            ``'csv'`` appends one row per record to ``11.csv``; any other
            value only prints a message. The name shadows the builtin but is
            kept so that existing callers keep working.
    """
    if type == 'txt':
        # Open the file once for the whole batch instead of once per record.
        with open('1.txt', 'a', encoding='utf-8') as f:
            for item in data:
                f.write('电影名称:' + item[0].replace(' ', '') + '\n')
                # The regular expression removes quotes, \xa0 and whitespace.
                f.write(re.sub(r'[\'\xa0\s ]+', '', item[1]) + '\n')
                # Drop spaces and line breaks.
                f.write('年份/类别:' + item[2].replace(' ', '').replace('\n', '') + '\n')
                f.write('评分:' + str(item[3]) + '\n')
                f.write('语录:' + item[4].replace(' ', '') + '\n')
    elif type == 'csv':
        # The CSV is saved as utf-8-sig; per the original author the text
        # shows up garbled otherwise.
        with open('11.csv', 'a', encoding='utf-8-sig', newline='') as f:
            csv.writer(f).writerows(data)
    else:
        print('can not find the type what you will downland')

图片的下载：

localPath = 'E:/python-project/film_and_picture/jpg2/'  # folder the image files are written into
    x = 0  # position of the current image within picture / image_name
    for url in picture:
        headers = {
            'USER-AGENT': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
        }
        response = requests.get(url=url, headers=headers)
        # The file name is the running number (i + x + 1) followed by the image name.
        with codecs.open(localPath + '%d%s.jpg' %((i+x+1),image_name[x]), 'wb') as f:
            f.write(response.content)
        print('正在下载第%d张'%(i+x+1))
        x = x + 1

结果

电影信息
当然省略号的部分如果觉得不美观可以删除，但我觉得不删可能更好

源码

import requests
import re
import csv
import codecs#图片保存时图片名称中文保存方法
from lxml import etree
def get_text(i):
    """Fetch one page of the Douban Top 250 list and return its HTML text.

    Args:
        i: Value of the ``start`` query parameter, i.e. the offset of the
            first movie on the page (0 for page one, 25 for page two, ...).

    Returns:
        The response body as a string.

    Raises:
        requests.RequestException: If the request times out, cannot be sent,
            or the server answers with an HTTP error status.
    """
    url = 'https://movie.douban.com/top250?start=' + str(i) + '&filter='
    headers = {
        'USER-AGENT': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    # Without a timeout a stalled connection would block forever.
    response = requests.get(url=url, headers=headers, timeout=10)
    # Fail loudly instead of handing an error page to the parser as data.
    response.raise_for_status()
    return response.text
def get_message(html):
    """Extract the movie fields from the HTML text of one list page.

    Args:
        html: HTML text of a page.

    Returns:
        A ``zip`` iterator of ``(name, roles, type, score, quote)`` tuples,
        built by pairing the five extracted lists position by position.
    """
    tree = etree.HTML(html)
    # Text nodes of the first <p> under each div.bd. They are consumed as an
    # alternating sequence: even positions hold the director/cast text, odd
    # positions hold the year/category text.
    info_nodes = tree.xpath('//div[@class="bd"]/p[1]//text()')
    return zip(
        tree.xpath('//div[@class="hd"]/a/span[1]/text()'),
        info_nodes[0::2],
        info_nodes[1::2],
        tree.xpath('//div[@class="star"]/span[2]/text()'),
        tree.xpath('//div[@class="bd"]/p[2]/span//text()'),
    )
def down_land(data, type):
    """Append the scraped movie records to a text file or a CSV file.

    Args:
        data: Iterable of ``(name, roles, year/category, score, quote)``
            records; every field except the score must be a string.
        type: Output format. ``'txt'`` appends labelled lines to ``1.txt``,
            ``'csv'`` appends one row per record to ``11.csv``; any other
            value only prints a message. The name shadows the builtin but is
            kept so that existing callers keep working.
    """
    if type == 'txt':
        # Open the file once for the whole batch instead of once per record.
        with open('1.txt', 'a', encoding='utf-8') as f:
            for item in data:
                f.write('电影名称:' + item[0].replace(' ', '') + '\n')
                # The regular expression removes quotes, \xa0 and whitespace.
                f.write(re.sub(r'[\'\xa0\s ]+', '', item[1]) + '\n')
                # Drop spaces and line breaks.
                f.write('年份/类别:' + item[2].replace(' ', '').replace('\n', '') + '\n')
                f.write('评分:' + str(item[3]) + '\n')
                f.write('语录:' + item[4].replace(' ', '') + '\n')
    elif type == 'csv':
        # The CSV is saved as utf-8-sig; per the original author the text
        # shows up garbled otherwise.
        with open('11.csv', 'a', encoding='utf-8-sig', newline='') as f:
            csv.writer(f).writerows(data)
    else:
        print('can not find the type what you will downland')
def crow(type):
    """Scrape all ten list pages and save each page in the given format.

    Args:
        type: Output format, passed through unchanged to ``down_land``.
    """
    # Offsets 0, 25, ..., 225 address the pages; numbering starts at 1.
    for page_no, offset in enumerate(range(0, 250, 25), start=1):
        page_html = get_text(offset)
        print('正在爬取第%d页\n' % page_no)
        down_land(get_message(page_html), type)
def get_picture(html):
    """Collect the image URLs and the movie names found on one list page.

    Args:
        html: HTML text of a page.

    Returns:
        A tuple ``(picture, image_name)``: the ``src`` attribute values of
        the images under ``div.pic`` and the title texts under ``div.hd``
        that are used to name the downloaded files.
    """
    tree = etree.HTML(html)
    picture = tree.xpath('//div[@class="pic"]/a/img/@src')  # image URLs
    image_name = tree.xpath('//div[@class="hd"]/a/span[1]/text()')  # image names
    return picture, image_name
def downland_picture(i, picture, image_name):
    """Download every image of one page into the local picture folder.

    Each file is named ``<running number><image name>.jpg``, where the
    running number is ``i`` plus the 1-based position of the image.

    Args:
        i: Offset of the page's first movie (0, 25, ...).
        picture: Sequence of image URLs.
        image_name: Sequence of names, indexed in parallel with ``picture``.

    Raises:
        requests.RequestException: If a download times out, cannot be sent,
            or the server answers with an HTTP error status.
        IndexError: If ``image_name`` has fewer entries than ``picture``.
    """
    import os  # local import: only this function needs it

    localPath = 'E:/python-project/film_and_picture/jpg2/'
    # Create the target folder up front so open() cannot fail on a missing path.
    os.makedirs(localPath, exist_ok=True)
    # The headers never change, so build them once outside the loop.
    headers = {
        'USER-AGENT': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    for x, url in enumerate(picture):
        number = i + x + 1
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url=url, headers=headers, timeout=10)
        # Do not save an error page under a .jpg name.
        response.raise_for_status()
        # Plain binary open() is enough; no codec is involved for bytes.
        with open(localPath + '%d%s.jpg' % (number, image_name[x]), 'wb') as f:
            f.write(response.content)
        print('正在下载第%d张' % number)

if __name__=='__main__':
    # Ask what to do: 1 saves the movie information, 2 downloads the images.
    number = int(input('chose 1 to download message chose 2 to download picture\n'))
    if number == 2:
        # Offsets 0, 25, ..., 225 address the pages; numbering starts at 1.
        for page_no, i in enumerate(range(0, 250, 25), start=1):
            text = get_text(i)
            picture, image_name = get_picture(text)
            print('正在下载第%d页' % page_no)
            downland_picture(i, picture, image_name)
    elif number == 1:
        # Named file_type so the builtin ``type`` is not shadowed module-wide.
        file_type = input('what type do you want to dowland?\n')
        crow(file_type)
    else:
        # Previously any other number ended the script without feedback.
        print('unknown choice: %d' % number)