# python爬虫：分析Ajax请求爬取今日头条街拍图
#
# 最新推荐文章于 2020-04-09 12:41:49 发布
# 牛奶可乐anmmm
# 阅读量929
# 点赞数 2
# 文章标签： python
# 本文链接：https://blog.csdn.net/cn_honor/article/details/80917889
# 版权
import requests
from requests.exceptions import RequestException
import json
from urllib.parse import urlencode
from bs4 import BeautifulSoup
import os
from hashlib import md5
import re

def get_page(url, data=None, timeout=10):
    """Fetch ``url`` with a GET request and return the response body as text.

    Args:
        url: Address to request.
        data: Optional mapping sent as query-string parameters. It is handed
            to ``requests.get`` as ``params``, so it is appended to whatever
            query string ``url`` already carries.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text when the status code is 200; ``None`` for any other
        status code or when the request raises ``RequestException``.
    """
    try:
        response = requests.get(url, params=data, timeout=timeout)
    except RequestException as e:
        # Report and return None instead of returning the exception object:
        # callers pass the result on to json.loads() and can only check for None.
        print(e)
        return None
    if response.status_code != 200:
        return None
    return response.text

def parse_page(html):
    """Yield the article URL of every entry in a search-result JSON payload.

    Args:
        html: JSON text whose top-level object holds a ``data`` list of entry
            objects. ``None``, an empty string, a non-string value or text
            that is not valid JSON produces no items instead of raising.

    Yields:
        The ``article_url`` value of each entry that has a non-empty one.
        Entries without it are skipped so callers never receive ``None``.
    """
    if not html:
        return
    try:
        payload = json.loads(html)
    except (TypeError, ValueError):
        # TypeError: input is not str/bytes (e.g. the result of a failed fetch).
        # ValueError: malformed JSON (json.JSONDecodeError is a subclass).
        return
    if not isinstance(payload, dict):
        return
    # "data" may be missing or null; treat both as an empty result list.
    for item in payload.get('data') or []:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        if article_url:
            yield article_url

"""
def get_page_num(html):
    data = json.loads(html)
    if data and 'data' in data.keys():
        for item in data.get('data'):
            yield item.get('gallary_image_count')

def generate_page(de_url, num):  # 此处是生成图片网页代码 但并不是图片原网址 无法下载图片源码
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 \
            (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36'
    }
    group_url = list()
    pages = num + 1          # 索引由1开始 +1 索引值
    try:
        response = requests.get(de_url, headers = headers)
        if response.status_code == 200:
            html = response.text
            soup = BeautifulSoup(html, 'lxml')
            title = soup.select('title')[0].get_text()
    except RequestException as e:
        return e
    for page in range(1, pages):
        url = de_url + str("#p=") + str(page)
        group_url.append(url)
    for group in group_url:download_images(group)
    return {
        'title': title,
        'Images_url': group_url
    }
"""
def get_page_detail(de_url, timeout=10):
    """Download the HTML of an article page so its gallery URLs can be parsed.

    The article URL redirects to a new https address. The function first
    resolves the redirects with a HEAD request, then requests the resolved
    URL with redirects disabled to read the ``Location`` header, and finally
    downloads that location.

    Args:
        de_url: Article URL taken from the search results.
        timeout: Seconds to wait for each individual request.

    Returns:
        The page text when the final response has status 200, otherwise
        ``None`` (empty URL, non-200 status or any ``RequestException``).
    """
    if not de_url:
        return None
    # NOTE(review): 'Remote Address' and 'Referrer Policy' do not look like
    # request header names and the User-Agent contains stray spaces -
    # presumably pasted from browser dev tools. Kept unchanged; confirm
    # whether the server depends on them.
    headers = {
        'User-Agent': 'Mozilla / 5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 63.0.3239.108Safari / 537.36',
        'Remote Address': '153.3.235.87:443',
        'Referrer Policy': 'no - referrer - when - downgrade'
    }
    # The context manager closes the session's connections on every exit path.
    with requests.session() as requ:
        try:
            redirection = requ.head(de_url, allow_redirects=True, timeout=timeout)
            # Redirects are disabled here so the target can be read from the
            # Location header of this response.
            response = requ.get(redirection.url, allow_redirects=False,
                                headers=headers, timeout=timeout)
            print(response.url)
            location = response.headers.get('location')
            if location:
                resp = requ.get(location, timeout=timeout)
            else:
                # No further redirect was announced: this response already is
                # the page (previously this case raised an uncaught KeyError).
                resp = response
        except RequestException as e:
            print(e)
            return None
        if resp.status_code == 200:
            return resp.text
        print("请求失败")
        return None

def parse_page_detail(html, de_url):
    """Extract the image URLs of a gallery article and download each image.

    The page embeds the gallery as an escaped JSON string between
    ``gallery: JSON.parse("`` and ``siblingList``. That string is unescaped,
    decoded, and the ``url`` of every entry in ``sub_images`` is passed to
    ``download_images``.

    Args:
        html: Source of the article page. ``None``, non-string or empty input
            returns ``None`` without raising.
        de_url: URL of the article; echoed back in the result.

    Returns:
        A dict with the keys ``title`` (text of the page's ``<title>``, or an
        empty string if there is none), ``de_url`` and ``images`` (list of
        image URLs). ``None`` when the page holds no gallery data, the data
        cannot be decoded, or it has no ``sub_images`` key.
    """
    if not isinstance(html, str) or not html:
        return None

    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    title = title_tags[0].get_text() if title_tags else ''

    images_pattern = re.compile('mediaInfo:.*?gallery: JSON.parse.*?\"(.*)\".*?siblingList', re.S)
    result = re.search(images_pattern, html)
    if not result:
        return None

    # In the captured text every double quote is preceded by a backslash and
    # slashes are backslash-escaped as well. json.loads() rejects it in that
    # form ("Expecting property name enclosed in double quotes"), so the
    # backslash before each quote, and one backslash before each slash, is
    # stripped first.
    gallery_text = result.group(1).replace('\\"', '"').replace('\\/', '/')
    try:
        data = json.loads(gallery_text)
    except ValueError:
        # Malformed gallery JSON (json.JSONDecodeError subclasses ValueError).
        return None
    if not isinstance(data, dict) or 'sub_images' not in data:
        return None

    # Skip entries without a URL so download_images() never receives None.
    images = [
        item.get('url')
        for item in data.get('sub_images') or []
        if isinstance(item, dict) and item.get('url')
    ]
    for image in images:
        download_images(image)
    return {
        "title": title,
        'de_url': de_url,
        'images': images
    }

def download_images(url, timeout=10):
    """Download one image and pass its bytes to ``save_images``.

    Failures are reported on stdout instead of being swallowed silently; they
    never raise ``RequestException`` to the caller.

    Args:
        url: Address of the image. An empty or ``None`` value is ignored.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Always ``None``.
    """
    if not url:
        return None
    print('正在下载...', url)
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException as e:
        print('下载失败...', url, e)
        return None
    if response.status_code == 200:
        save_images(response.content)
    else:
        print('下载失败...', url, response.status_code)
    return None

def save_images(content, directory=None):
    """Write image bytes to ``<directory>/<md5 of content>.jpg``.

    The file name is the MD5 hex digest of the content, so identical images
    map to the same path and are written only once.

    Args:
        content: Raw image bytes.
        directory: Folder to save into; defaults to the current working
            directory.
    """
    if directory is None:
        directory = os.getcwd()
    file_name = '{0}.{1}'.format(md5(content).hexdigest(), 'jpg')
    file_path = os.path.join(directory, file_name)
    if not os.path.exists(file_path):
        # The with-block closes the file; no explicit close() is needed.
        with open(file_path, 'wb') as f:
            f.write(content)
    print('保存成功...')

def main(offset, keyword):
    """Search for ``keyword`` and download the images of every gallery found.

    Args:
        offset: Value of the ``offset`` query parameter of the search request.
        keyword: Search term.
    """
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'from': 'gallery'
    }
    # The parameters are sent once, through get_page(); encoding them into the
    # URL as well would put every parameter into the query string twice.
    url = 'https://www.toutiao.com/search_content/'
    html = get_page(url, data)
    if not isinstance(html, str):
        # The search request failed; there is nothing to parse.
        return
    for de_url in parse_page(html):
        if not de_url:
            continue
        text = get_page_detail(de_url)
        if text is None:
            continue
        parse_page_detail(text, de_url)


# When executed as a script, run main() with offset 0 and the keyword below.
if __name__ == '__main__':
    main(0, '街拍')