Scraping Toutiao street-snap (街拍) images with Python


# Scrape Toutiao street-snap images and save the results to a text file
# Based on Cui Daqing's video tutorial
import requests
import re
import json
import os
from requests.exceptions import RequestException
from multiprocessing import Pool
from bs4 import BeautifulSoup
from urllib.parse import urlencode
from json.decoder import JSONDecodeError

# Browser-like request headers, sent with every request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Accept': 'application/json, text/javascript',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN'
}

def get_page_index(offset, keyword):
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 1
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('Error requesting the index page')
        return None
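
# For illustration: with offset=0 the constructed URL looks like
# https://www.toutiao.com/search_content/?offset=0&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1
# (urlencode percent-encodes the Chinese keyword).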


# Parse the index-page JSON and yield each article's detail-page URL
def parse_page_index(html):
    try:
        data = json.loads(html)
        #print(data)
        if data and 'data' in data.keys():
            for item in data.get('data'):
                yield item.get('article_url')
    except JSONDecodeError:
        pass
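
# A minimal, self-contained check of parse_page_index. The sample JSON is
# hypothetical -- its shape is only inferred from the fields read above.
# Call it by hand when debugging; it is not part of the crawl flow.
def _demo_parse_page_index():
    sample = json.dumps({'data': [
        {'article_url': 'https://www.toutiao.com/a1'},
        {'article_url': 'https://www.toutiao.com/a2'}
    ]})
    assert list(parse_page_index(sample)) == [
        'https://www.toutiao.com/a1',
        'https://www.toutiao.com/a2'
    ]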

# Fetch the detail page for a single article
def get_page_detail(url):
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('Error requesting the detail page')
        return None

# Parse the detail page: pull the title and the gallery image URLs
def parse_page_detail(html, url):
    # Requires the lxml package; on 32-bit Windows install lxml-4.0.0-cp36-cp36m-win32.whl
    # https://www.zhihu.com/question/49221958/answer/114914375
    soup = BeautifulSoup(html, 'lxml')
    title = soup.select('title')[0].get_text()
    # Regex to pull the gallery JSON out of the page's inline JavaScript
    img_pattern = re.compile('var gallery = (.*?);', re.S)
    result = re.search(img_pattern, html)
    if result:
        # Extract the sub_images list
        data = json.loads(result.group(1))
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            for image in images:
                down_image(image)
            return {
                'title': title,
                'images': images,
                'url': url
            }
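
# A sketch of what the gallery regex expects in the page source. The snippet is
# hypothetical (real pages embed a much larger object), but it shows why the
# non-greedy (.*?); capture plus json.loads recovers the sub_images list.
def _demo_gallery_regex():
    sample_html = ('<title>demo</title><script>'
                   'var gallery = {"sub_images": [{"url": "http://p3.pstatp.com/origin/abc"}]};'
                   '</script>')
    result = re.search(re.compile('var gallery = (.*?);', re.S), sample_html)
    data = json.loads(result.group(1))
    assert [item.get('url') for item in data['sub_images']] == ['http://p3.pstatp.com/origin/abc']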

def down_image(url):
    print('Downloading image', url)
    names = re.split('/', url)
    print('Image name:', names[4])
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # response.content -- binary image data
            # response.text -- decoded text
            save_image(response.content, name=names[4])
        return None
    except RequestException:
        print('Error requesting image', url)
        return None
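
# Why names[4] works as the file name: splitting a typical image URL on '/'
# leaves the picture id at index 4 (hypothetical URL; real ids differ).
def _demo_url_split():
    parts = re.split('/', 'http://p3.pstatp.com/origin/abcd1234')
    # parts == ['http:', '', 'p3.pstatp.com', 'origin', 'abcd1234']
    assert parts[4] == 'abcd1234'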

# Save the image under an img folder in the working directory
def save_image(content, name):
    # Create the folder on first use so no manual setup is needed
    os.makedirs(os.path.join(os.getcwd(), 'img'), exist_ok=True)
    file_path = '{0}/{1}/{2}.{3}'.format(os.getcwd(), 'img', name, 'jpg')
    print('Image path:', file_path)
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)
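
# An alternative naming scheme (a sketch, not used by the flow above): name the
# file after the md5 of its bytes, so downloading the same picture twice never
# creates a duplicate and never depends on the URL layout.
def save_image_by_hash(content):
    import hashlib
    name = hashlib.md5(content).hexdigest()
    file_path = '{0}/{1}/{2}.{3}'.format(os.getcwd(), 'img', name, 'jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)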


# Append each result as one JSON line to a text file
def writeToFile(content):
    with open("toutiaojiepai.txt", 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + "\n")
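
# Each call appends one JSON line to toutiaojiepai.txt, e.g. (hypothetical record):
# {"title": "...", "images": ["http://p3.pstatp.com/origin/..."], "url": "https://www.toutiao.com/..."}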


def main(offset):
    html = get_page_index(offset, '街拍')
    if not html:
        return
    for url in parse_page_index(html):
        html = get_page_detail(url)
        if html:
            result = parse_page_detail(html, url)
            if result:
                writeToFile(result)


if __name__ == '__main__':
    # Offsets 20, 40, ..., 400; note that offset 0 (the very first page) is skipped
    groups = [x * 20 for x in range(1, 21)]
    pool = Pool()
    pool.map(main, groups)
    pool.close()
    pool.join()







