爬取今日头条街拍图片

最新推荐文章于 2021-11-27 17:38:56 发布

Congcongrobot

最新推荐文章于 2021-11-27 17:38:56 发布

阅读量545

点赞数 2

分类专栏： python爬虫学习　文章标签： python爬虫学习

本文链接：https://blog.csdn.net/Congcongrobot/article/details/89646277

版权

python爬虫学习专栏收录该内容

1 篇文章 0 订阅

订阅专栏

爬取今日头条街拍图片

# coding=utf-8
import os
import re
import time
from multiprocessing.pool import Pool
import requests
from urllib.parse import urlencode


# HTTP headers attached to the crawler's requests. The values below are
# placeholders: replace them with a real cookie and user-agent string.
headers = {
    'Cookie': '你的cookie',
    'User-Agent': '你的user-agent',
}
# Fetch one page of search results and yield the detail-page URL of each entry
def get_search_page(offset):
    """Yield the title and share URL of each entry on one search-result page.

    Calls the Toutiao search API for the keyword '街拍' and yields one dict
    for every result that carries both a ``title`` and a ``share_url``.
    Nothing is yielded when the request fails, the status code is not 200,
    the body is not JSON, or the response has no ``data`` list.

    Args:
        offset: Value sent as the ``offset`` query parameter.

    Yields:
        dict: ``{'title': <title>, 'url_group': <share_url>}``.
    """
    params = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis',
        # Current time in milliseconds.
        'timestamp': int(round(time.time() * 1000)),
    }
    base_url = 'https://www.toutiao.com/api/search/content/?' + urlencode(params)
    print(base_url)

    # Only the network call is guarded. A timeout keeps a stalled connection
    # from blocking the worker forever; RequestException also covers timeouts.
    try:
        response1 = requests.get(base_url, headers=headers, timeout=10)
    except requests.RequestException:
        print('搜索页请求失败', base_url)
        return
    if response1.status_code != 200:
        return

    try:
        json1 = response1.json()
    except ValueError:
        # The body was not valid JSON.
        print('搜索页请求失败', base_url)
        return

    # 'data' can be absent or null; treat both as "no results" instead of
    # failing with TypeError while iterating None.
    for item in json1.get('data') or []:
        title = item.get('title')
        url_group = item.get('share_url')
        # Entries lacking a title or a share URL are skipped.
        if title is not None and url_group is not None:
            yield {'title': title, 'url_group': url_group}

# Collect the image URLs found on a detail page
def get_images_group(url_group):
    """Yield the image URLs found in the HTML of one detail page.

    Downloads ``url_group`` and scans the response text for URLs that are
    terminated by the HTML entity ``&quot``. Nothing is yielded when the
    request fails, the status code is not 200, or no URL is found.

    Args:
        url_group: URL of the detail page to scan.

    Yields:
        dict: ``{'name': ..., 'image_url': ...}`` where ``image_url`` is the
        matched URL without ``&quot`` and ``name`` is that URL with
        everything up to and including ``/pgc-image/`` removed.
    """
    print('正在进行详情页面解析')
    # The request itself must sit inside the try block, and the exception has
    # to be the requests one: requests' errors do not derive from the builtin
    # ConnectionError, so catching that would never fire.
    try:
        response2 = requests.get(url_group, headers=headers, timeout=10)
    except requests.RequestException:
        print('无法连接')
        return
    if response2.status_code != 200:
        return

    # Raw strings avoid the invalid '\s' escape; the scheme class is A-Z
    # (the former 'A-z' range also matched '[', '\', ']', '^', '_' and '`').
    content1s = re.findall(r'[a-zA-Z]+://[^\s]*&quot', response2.text, re.S)
    # re.findall returns an empty list (never None) when nothing matches.
    if not content1s:
        print("文章类型不对，没有找到图片集合", url_group)
        return

    for content1 in content1s:
        content2 = content1.replace('&quot', '')
        content3 = re.sub(r'[a-zA-Z]+://[^\s]*/pgc-image/', '', content2)
        yield {
            'name': content3,
            'image_url': content2,
        }
# Download one image and save it to disk
def save_image(item, title, base_dir='C:/Users/Desktop/图片/爬虫/'):
    """Download one image and store it as ``<base_dir>/<title>/<name>.jpg``.

    The download is skipped when the target file already exists. Network and
    file-system failures are reported with a printed message instead of
    being raised.

    Args:
        item: Mapping with the keys ``'image_url'`` (URL to download) and
            ``'name'`` (file name without extension).
        title: Name of the sub-directory the image is stored in.
        base_dir: Root directory for all downloads. Defaults to the
            previously hard-coded location.
    """
    # Characters that are not allowed in Windows file names are replaced so
    # that an arbitrary title or name cannot break the target path.
    unsafe = r'[\\/:*?"<>|]'
    safe_title = re.sub(unsafe, '_', title)
    safe_name = re.sub(unsafe, '_', str(item.get('name')))
    file_path1 = os.path.join(base_dir, safe_title)
    file_path = os.path.join(file_path1, '{0}.{1}'.format(safe_name, 'jpg'))

    # Check before downloading so an existing file costs no network traffic.
    if os.path.exists(file_path):
        print('已经下载', file_path)
        return

    # A timeout keeps a stalled download from blocking the worker forever;
    # RequestException also covers the timeout errors.
    try:
        response = requests.get(item.get('image_url'), timeout=10)
    except requests.RequestException:
        print('保存图片失败')
        return
    if response.status_code != 200:
        return

    try:
        # exist_ok=True tolerates an existing directory while still
        # surfacing real errors such as missing permissions.
        os.makedirs(file_path1, exist_ok=True)
        with open(file_path, 'wb') as f:
            f.write(response.content)
    except OSError:
        print('保存图片失败')

# Entry point for one search-result page
def main(offset):
    """Download every image reachable from the search page at ``offset``.

    Walks the entries produced by ``get_search_page(offset)``, expands each
    entry's URL with ``get_images_group`` and hands every image to
    ``save_image`` together with the entry's title. Prints ``offset`` once
    the page has been processed.
    """
    for entry in get_search_page(offset):
        page_title = entry.get('title')
        detail_url = entry.get('url_group')
        for image in get_images_group(detail_url):
            save_image(image, page_title)
    print(offset)


# First and last page index to crawl; each index is multiplied by 20 to get
# the offset handed to main().
GROUP_START = 1
GROUP_END = 20

if __name__ == '__main__':
    # Offsets GROUP_START*20, (GROUP_START+1)*20, ..., GROUP_END*20.
    offsets = list(range(GROUP_START * 20, GROUP_END * 20 + 1, 20))
    # Create the process pool; a worker count may be passed to Pool().
    workers = Pool()
    # map() calls main once per offset, distributing the calls over the
    # pool's worker processes.
    workers.map(main, offsets)
    # Stop accepting new tasks, then block until the workers have exited.
    workers.close()
    workers.join()

爬取结果