爬取今日头条-图片，进一步了解Ajax的分页模拟，以及存入关系型数据库MySQL和非关系型mongodb的基本代码

本文链接：https://blog.csdn.net/qq_42278240/article/details/88676891

爬取今日头条-图片

与上一篇爬取‘今日头条-街拍’分页模拟的区别（其他步骤大致一样）

与上一篇爬取‘今日头条-街拍’分页模拟的区别（其他步骤大致一样）

上一篇爬取‘今日头条-街拍’时，只需从Query String Parameters中取得固定的分页参数，就能简单地实现分页；而这次爬取今日头条-图片集时，Query String Parameters中的分页参数是变化的，且没有明显规律，每次刷新都会发生变化。

分析Headers

Headers中的Query String Parameters参数如下图

在这里插入图片描述
对GET请求，类似于as、cp、_signature这种掺杂字母的参数，一般是随机生成的，无规律的；
通过切换多个Ajax，可以发现max_behot_time在一定范围内变化，具体以什么规律变化，还不得而知，不过值得关注的是，第一个Ajax中（也就是第一页）max_behot_time：0，再查看Preview，可以发现：
在这里插入图片描述
在next键下有{max_behot_time: 1553000486}，也就是指向下一页的max_behot_time，现在就明白了为什么第一页的max_behot_time为0，相当于关了一扇门给我们开了另一扇窗。接下来，我们的任务就是获取第一页被Ajax渲染的数据，提取下一页的max_behot_time。
但是，当我们以上一篇的方法去获取第一页的数据时，你会发现获取后返回的数据为空，主要是因为每次刷新时，as、cp、_signature、max_behot_time这些参数都是变化的，这时我们就需要来维持它的会话，也就是在代码的headers中加入cookie，来维持会话。

代码构造

‘获取max_behot_time’的过程图

在这里插入图片描述
这是一个循环过程，get_page()函数得到第一页的响应数据，将响应数据传递给get_max_behot_time(response)函数，get_max_behot_time(response)函数提取到max_behot_time后，将max_behot_time传递回get_page()函数，get_page()函数得到第二页的响应数据…。
代码：

# Cursor values collected so far; '0' requests the first page.
max_behot_time = ['0']
while True:
    # Request the page addressed by the most recently collected cursor ...
    response = get_page(max_behot_time[-1])
    # ... then let get_max_behot_time() append the cursor of the following
    # page, which the next iteration picks up.
    get_max_behot_time(response)

这里我们用一个无限循环来持续获取数据，可以获取5000条以上的数据。
也可以自己设置成有限循环来获取自己需要的数据量。

代码实现

获取图片帖子、图片来源、帖子的发布人名称、帖子发布人的个人主页、帖子的三张图片
存入mysql、mongodb、本地目录
存入mysql之前需先创建好数据库和数据表

创建数据库：

import pymysql

# Connect to the MySQL server without selecting a schema, because the
# `toutiao` database is what this script creates.
db = pymysql.connect(host='127.0.0.1', user='root', password='自己的数据库密码', port=3306)
try:
    cursor = db.cursor()
    # IF NOT EXISTS keeps the script safe to run more than once.
    cursor.execute("CREATE DATABASE IF NOT EXISTS toutiao DEFAULT CHARACTER SET utf8mb4")
finally:
    # Release the connection even when the statement fails.
    db.close()

创建数据表：

import pymysql

# One row per post; `title` is the primary key, and the table has exactly
# three image columns (image0..image2).
sql = (
    'CREATE TABLE IF NOT EXISTS toutiao_content('
    'title VARCHAR(255) NOT NULL,'
    'user_name VARCHAR(255) NOT NULL, '
    'user_url VARCHAR(255) NOT NULL, '
    'source_url VARCHAR(255) NOT NULL, '
    'image0 VARCHAR(255) NOT NULL,'
    'image1 VARCHAR(255) NOT NULL,'
    'image2 VARCHAR(255) NOT NULL,'
    'PRIMARY KEY (title))'
)

db = pymysql.connect(host='127.0.0.1', user='root', password='自己的数据库密码', port=3306, db='toutiao')
try:
    cursor = db.cursor()
    cursor.execute(sql)
finally:
    # Release the connection even when the statement fails.
    db.close()

完整代码：

from urllib.parse import urlencode
import time
from bson import ObjectId
import pymysql
from pymongo import MongoClient
import os
from hashlib import md5
import requests
from requests import codes
import urllib

# Pagination cursors: the last element is the max_behot_time value of the
# next page to request; '0' addresses the first page.
max_behot_time = ['0']
# Module-level dict that to_save() fills with the fields of one post.
user = dict()

def get_page(j, timeout=10):
    """Fetch one page of the Toutiao image feed and return it as parsed JSON.

    Args:
        j: Value sent as the ``max_behot_time`` query parameter, i.e. the
            cursor of the page to request ('0' for the first page).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, network failure, timeout or a
        body that is not valid JSON).
    """
    params = {
        'category': '组图',
        'utm_source': 'toutiao',
        'max_behot_time': j,
    }
    headers = {
        # NOTE(review): this cookie is a captured browser session; it will
        # expire and should be replaced with your own before running.
        'cookie':'csrftoken=9bac7d486ba3b65281cfd5122efddc4b; tt_webid=6667417260644681224; UM_distinctid=16971009c9125b-0f1082a5515cb5-b781636-144000-16971009ca73d0; tt_webid=6667417260644681224; WEATHER_CITY=%E5%8C%97%E4%BA%AC; s_v_web_id=781fb7db1a992686eab2ec15f487fd3c; login_flag=5163ce0786b07db2eabb5fefae066fd5; sessionid=01cb325d486a7fb9b2e8d1b28de78c7c; uid_tt=3f07060dc210570593a35c2d688b82d9; sid_tt=01cb325d486a7fb9b2e8d1b28de78c7c; sid_guard="01cb325d486a7fb9b2e8d1b28de78c7c|1552653698|15552000|Wed\054 11-Sep-2019 12:41:38 GMT"; __tasessionId=xry6oxv701552662761868; CNZZDATA1259612802=882188190-1552375846-%7C1552662342',
        'referer': 'https://www.toutiao.com/ch/news_image/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    }
    base_url = 'https://www.toutiao.com/api/pc/feed/?'
    url = base_url + urlencode(params)
    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        if resp.status_code != 200:
            # Previously this path hit an unbound local variable.
            return None
        return resp.json()
    except (requests.RequestException, ValueError):
        # RequestException covers connection errors and timeouts;
        # ValueError covers a body that cannot be decoded as JSON.
        return None

def get_max_behot_time(response):
    """Record the cursor of the next page found in *response*.

    Reads ``response['next']['max_behot_time']`` and appends it to the
    module-level ``max_behot_time`` list so the following request can use
    it. Nothing is appended when *response* is not a dict or the cursor is
    missing, so the list never receives a ``None`` entry.

    Args:
        response: Parsed JSON of one feed page, or ``None``.
    """
    if not isinstance(response, dict):
        return
    next_info = response.get('next')
    if not isinstance(next_info, dict):
        return
    next_cursor = next_info.get('max_behot_time')
    if next_cursor is None:
        return
    max_behot_time.append(next_cursor)

def to_save(response):
    """Extract every post from one feed page and persist it.

    For each entry of ``response['data']`` a fresh record is built (title,
    author name, author page, post URL and ``image0..imageN`` URLs) and
    handed to ``to_mysql``, ``to_mongodb`` and ``to_local``. Entries without
    a title are skipped. A missing or empty *response* is ignored.

    Args:
        response: Parsed JSON of one feed page, or ``None``.
    """
    if not response:
        return
    for item in response.get('data') or []:
        record = _build_record(item)
        if record is None:
            continue
        # Keep the module-level ``user`` dict in sync with the current post
        # for code that reads it. Clearing it first removes image keys left
        # over from a previous post that had more images.
        user.clear()
        user.update(record)
        # Pause between posts to avoid hammering the site.
        time.sleep(1.5)
        to_mysql(record)
        # MongoDB adds an ``_id`` to the dict it stores; give it a copy so
        # that key does not leak into the record used elsewhere.
        to_mongodb(dict(record))
        to_local(record)


def _absolute_url(path):
    """Prefix a site-relative *path* with the Toutiao host ('' if missing)."""
    return 'https://www.toutiao.com' + path if path else ''


def _image_urls(item):
    """Return one download URL per usable entry of the item's image list."""
    urls = []
    for image in item.get('image_list') or []:
        candidates = image.get('url_list')
        if candidates:
            url = candidates[0].get('url')
        else:
            # Fallback field; presumably protocol-relative, hence the prefix.
            relative = image.get('url')
            url = 'http:' + relative if relative else None
        if url:
            urls.append(url)
    return urls


def _build_record(item):
    """Turn one feed entry into a flat record dict, or None if it has no title."""
    title = item.get('title')
    if not title:
        return None
    record = {
        'user_name': item.get('source'),
        'user_url': _absolute_url(item.get('media_url')),
        'source_url': _absolute_url(item.get('source_url')),
        'title': title,
    }
    for index, url in enumerate(_image_urls(item)):
        record['image' + str(index)] = url
    return record



def to_mysql(user):
    """Insert one post record into the ``toutiao_content`` MySQL table.

    Only the columns that the table defines are written; other keys of
    *user* (for example a MongoDB ``_id`` or images beyond the third) are
    ignored, and missing columns are stored as empty strings.

    Args:
        user: Dict describing one post.
    """
    table = 'toutiao_content'
    # Columns of the toutiao_content table, in insertion order.
    columns = ('title', 'user_name', 'user_url', 'source_url',
               'image0', 'image1', 'image2')
    row = tuple(user.get(column, '') for column in columns)
    keys = ', '.join(columns)
    values = ', '.join(['%s'] * len(columns))
    sql = 'INSERT INTO {table}({keys}) VALUES ({values})'.format(table=table, keys=keys, values=values)

    db = pymysql.connect(host='localhost', user='root', password='自己的数据库密码', port=3306, db='toutiao')
    try:
        cursor = db.cursor()
        if cursor.execute(sql, row):
            print("Successful")
            db.commit()
    except pymysql.MySQLError as exc:
        # E.g. a duplicate title (primary key) or a NULL in a NOT NULL column.
        print('Failed', exc)
        db.rollback()
    finally:
        db.close()


def to_mongodb(user):
    """Store one post record in MongoDB (``toutiao.toutiao_content``).

    Uses ``insert_one`` because ``Collection.insert`` is deprecated and no
    longer exists in PyMongo 4. The client is closed once the write is done.

    Args:
        user: Dict describing one post. PyMongo adds an ``_id`` key to it
            when none is present.
    """
    client = MongoClient()
    try:
        collection = client['toutiao']['toutiao_content']
        result = collection.insert_one(user)
        if result.inserted_id is not None:
            print('Saved to Mongo!')
    finally:
        client.close()


def to_local(user):
    """Download the post's first three images into ``toutiao_img/<title>/``.

    The directory name is the post title with characters that are awkward
    in file names replaced by ``'1'``. Each image is saved under the MD5 of
    its content, so identical images are written only once. A missing image
    key or a failed download skips that image instead of aborting the rest.

    Args:
        user: Dict describing one post; reads ``title`` and
            ``image0``..``image2``.
    """
    unsafe = ':./|" \\?*<>'
    table = str.maketrans(unsafe, '1' * len(unsafe))
    img_path = os.path.join('toutiao_img', user['title'].translate(table))
    os.makedirs(img_path, exist_ok=True)

    for i in range(3):
        url = user.get('image' + str(i))
        if not url:
            # The post has fewer than three images.
            continue
        try:
            resp = requests.get(url, timeout=10)
        except requests.RequestException as exc:
            print('Failed to Save Image', url, exc)
            continue
        if codes.ok != resp.status_code:
            continue
        # Name the file after the MD5 of its content to avoid duplicates.
        file_path = os.path.join(img_path, '{file_name}.{file_suffix}'.format(
            file_name=md5(resp.content).hexdigest(),
            file_suffix='jpg'))
        if os.path.exists(file_path):
            print('Already Downloaded', file_path)
            continue
        with open(file_path, 'wb') as f:
            f.write(resp.content)
        print('Downloaded image path is %s' % file_path)


if __name__ == '__main__':
    # Crawl page after page until the feed stops handing out a new cursor.
    while True:
        # The newest entry of max_behot_time addresses the page to request.
        cursor = max_behot_time[-1]
        response = get_page(cursor)
        time.sleep(1)
        if not response:
            print('No data returned for max_behot_time=%s, stopping.' % cursor)
            break
        to_save(response)
        get_max_behot_time(response)
        # Without a fresh, usable cursor the loop would request the same
        # page forever, so stop instead.
        if max_behot_time[-1] in (None, cursor):
            break