笔记-抓取今日头条图片

最新推荐文章于 2020-09-14 19:58:31 发布

人生三醒七分醉

最新推荐文章于 2020-09-14 19:58:31 发布

阅读量299

点赞数

分类专栏：爬虫实战

本文链接：https://blog.csdn.net/qq_32942549/article/details/79601757

版权

爬虫实战专栏收录该内容

5 篇文章 0 订阅

订阅专栏

通过分析今日头条搜索页的 Ajax 请求抓取街拍组图：将每篇文章的标题、url 和图片地址以 JSON 格式（每行一条）保存到文件，并把图片下载保存到本地。

#coding:utf-8
import os
from _md5 import md5
from urllib.parse import urlencode

import re
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError
import requests
import json
from multiprocessing import Pool
def get_page_index(offset, keyword, timeout=10):
    """Fetch one page of search results from the Ajax endpoint as raw text.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Value sent as the ``keyword`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or a failed request).
    """
    # Query parameters passed to the Ajax search endpoint.
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 3,
        'from': 'gallery'
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
    }
    try:
        # requests never times out on its own; without this a stalled
        # server would block the calling worker indefinitely.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException:
        # Base class of all requests errors, so timeouts and redirect
        # loops are reported the same way as connection failures.
        print('请求索引值出错')
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_page_index(html):
    """Yield the article URLs contained in a search-result JSON payload.

    Args:
        html: JSON text of the index response, or ``None`` when the
            request failed.

    Yields:
        The ``article_url`` value of each entry in the top-level ``data``
        list. Entries without an ``article_url`` are skipped. Nothing is
        yielded for empty input, malformed JSON, or a payload whose
        ``data`` is missing or not a list.
    """
    if not html:
        # The index request returns None on failure; json.loads(None)
        # would raise TypeError.
        return
    try:
        payload = json.loads(html)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass.
        return
    if not isinstance(payload, dict):
        return
    entries = payload.get('data')
    if not isinstance(entries, list):
        # Covers both a missing key and an explicit null.
        return
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        article_url = entry.get('article_url')
        if article_url:
            yield article_url
def get_page_detail(url, timeout=10):
    """Request an article detail page and return its HTML text.

    Args:
        url: Address of the detail page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None`` (missing URL, non-200 status or a failed
        request).
    """
    if not url:
        # requests raises MissingSchema for an empty/None URL.
        return None
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
    }
    try:
        # requests never times out on its own; without this a stalled
        # server would block the calling worker indefinitely.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException:
        # Base class of all requests errors (timeouts, bad URLs, ...).
        print('请求详情页出错')
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_page_detail(html, url):
    """Extract the page title and gallery image URLs from a detail page.

    Args:
        html: HTML text of the detail page.
        url: Address the page was fetched from; copied into the result.

    Returns:
        A dict with the keys ``title``, ``url`` and ``images`` (the ``url``
        value of every ``sub_images`` entry), or ``None`` when the page
        contains no parsable gallery data.
    """
    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    # A page without a <title> element would otherwise raise IndexError.
    title = title_tags[0].get_text() if title_tags else ''

    # Raw string: '\(' and '\)' are regex escapes, not string escapes.
    images_pattern = re.compile(r'gallery:.*?\("(.*?)"\)', re.S)
    # Remove every backslash before matching; presumably the gallery JSON
    # is embedded in the page as an escaped string literal.
    html = re.sub(r'\\', '', html)
    match = images_pattern.search(html)
    if not match:
        return None

    try:
        # json.loads takes no positional encoding argument on current
        # Python 3; passing "UTF-8" positionally raises TypeError.
        data = json.loads(match.group(1))
    except ValueError:
        # Captured text was not valid JSON; treat as "no gallery".
        return None
    if not isinstance(data, dict) or 'sub_images' not in data:
        return None

    # 'sub_images' may be present but null.
    sub_images = data.get('sub_images') or []
    images = [item.get('url') for item in sub_images]
    return {
        'title': title,
        'url': url,
        'images': images,
    }
def write_to_file(content):
    """Append *content* to ``result_jiepai.txt`` as a single JSON line.

    The file is opened in append mode with UTF-8 encoding, and non-ASCII
    characters are written as-is rather than as ``\\uXXXX`` escapes.
    """
    with open('result_jiepai.txt', mode='a', encoding='utf-8') as out:
        serialized = json.dumps(content, ensure_ascii=False)
        out.write(serialized + '\n')
def get_parse_image(url, timeout=10):
    """Download the image at *url* and pass its bytes to ``image_save``.

    Args:
        url: Address of the image.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Always ``None``. The image is saved only when the server answers
        with HTTP 200; a failed request is reported on stdout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
    }
    try:
        # requests never times out on its own; without this a stalled
        # server would block the calling worker indefinitely.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException:
        # Base class of all requests errors (timeouts, bad URLs, ...).
        print('图片解析出错')
        return None
    if response.status_code == 200:
        image_save(response.content)
    return None

def image_save(content):
    """Write image bytes to ``<cwd>/images/<md5 of content>.jpg``.

    The file name is the MD5 hex digest of *content*, so saving identical
    bytes twice targets the same file.

    Args:
        content: Raw image bytes.
    """
    image_dir = os.path.join(os.getcwd(), 'images')
    # Nothing else in this script creates the directory, so the first
    # save would fail with FileNotFoundError. exist_ok makes the call
    # safe when the directory is already there.
    os.makedirs(image_dir, exist_ok=True)
    file_path = os.path.join(image_dir, md5(content).hexdigest() + '.jpg')
    with open(file_path, 'wb') as f:
        f.write(content)

def main(offset):
    """Crawl one page of '街拍' search results and download its images.

    For every article URL on the index page: fetch the detail page, parse
    it, append the parsed record to the result file, then download each
    image it lists. Articles whose page cannot be fetched or parsed are
    skipped.

    Args:
        offset: Value forwarded to the index request as its ``offset``.
    """
    index_html = get_page_index(offset, '街拍')
    if not index_html:
        # Index request failed; there is nothing to parse.
        return
    for url in parse_page_index(index_html):
        if not url:
            continue
        detail_html = get_page_detail(url)
        if not detail_html:
            continue
        result = parse_page_detail(detail_html, url)
        if not result:
            # No gallery data: skip instead of writing a bare "null" record.
            continue
        write_to_file(result)
        for item in result.get('images') or []:
            # Per-image handling so one failure does not abort the rest
            # of the article's images.
            try:
                get_parse_image(item)
                print('正在下载：', item)
            except Exception as e:
                print('下载出错：', e)


if __name__ == "__main__":
    # One task per index offset: 20, 40, ..., 400.
    # NOTE(review): offset 0 is never requested; confirm that skipping it
    # is intended.
    offsets = [x * 20 for x in range(1, 21)]
    # map() blocks until every task is done; leaving the with-block then
    # shuts the worker processes down instead of leaking the pool.
    with Pool() as pool:
        pool.map(main, offsets)

查看一下已下载的图片：