Python 爬金十数据

最新推荐文章于 2025-01-07 21:40:36 发布

赶在日落之前

最新推荐文章于 2025-01-07 21:40:36 发布

阅读量2.8k

点赞数 1

分类专栏：爬虫

本文链接：https://blog.csdn.net/lzz781699880/article/details/107084073

版权

爬虫专栏收录该内容

34 篇文章

订阅专栏

话不多说，直接上能用的代码（抓取金十快讯并逐条打印；代码里已经建立了 MySQL 连接，但入库部分需要自行补充）：

import requests
import  datetime
import pymysql
from requests.adapters import HTTPAdapter
import time

def conn():
    """Open a connection to the ``shipinformation`` MySQL database.

    Host, user and password are empty strings in the source and have to be
    filled in before this can succeed. A confirmation line is printed when
    the returned connection object is truthy.

    Returns:
        The object returned by ``pymysql.connect``.
    """
    settings = {
        'host': '',
        'user': '',
        'password': '',
        'database': 'shipinformation',
        'charset': 'utf8',
    }
    db = pymysql.connect(**settings)
    if not db:
        return db
    print("连接成功!")
    return db
# NOTE: this rebinds the module-level name ``conn`` from the factory function
# to the open connection, so the factory cannot be called a second time.
# The connection is opened here but nothing below writes to it yet.
conn = conn()
# cursor = conn.cursor()

## Request settings for the flash-list endpoint.
url = "https://flash-api.jin10.com/get_flash_list"
header = {
    "x-app-id": "SO1EJGmNgCtmpcPF",
    "x-version": "1.0.0",
}
# ``max_time`` acts as the paging cursor: it is overwritten after every page
# with the ``time`` of the last record received.
queryParam = {
    "max_time": "2021-05-25 9:47:02",
    "channel": "-8200",
}

# Inline markup fragments removed from the flash text, in this order.
_STRIP_TAGS = ('<b>', '</b>', '<br />', '<br/>')


def _strip_markup(text):
    """Return *text* with the fragments listed in ``_STRIP_TAGS`` removed."""
    for tag in _STRIP_TAGS:
        text = text.replace(tag, '')
    return text


def _fetch_page(session, params):
    """Request one page through *session* and return the value under 'data'.

    Raises whatever ``requests`` raises on a failed request or an undecodable
    body, and ``KeyError``/``TypeError`` if the payload has no 'data' entry.
    """
    response = session.get(url, params=params, timeout=5, headers=header)
    return response.json()['data']


def _print_flash(item):
    """Print one record if its ``type`` is 0; records of other types are ignored.

    Raises ``KeyError``/``ValueError``/``TypeError``/``AttributeError`` when a
    required field is missing or malformed; the caller skips such records.
    """
    flash_id = item['id']
    published = item['time']
    # Parsed only to reject records whose timestamp is not in the expected
    # format; the parsed value itself is not used yet.
    datetime.datetime.strptime(published, "%Y-%m-%d %H:%M:%S")
    flash_type = item['type']
    if flash_type != 0:
        return

    payload = item['data']
    content = _strip_markup(payload['content'])
    # 'pic' and 'title' are optional: read them with .get() instead of
    # guessing which keys exist from the number of keys in the payload.
    pic = payload.get('pic')
    title = payload.get('title')
    update_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print(flash_id, published, flash_type, pic, content, title, update_time)


# Crawl page by page and print every record; stop when a page comes back
# empty, a request fails, or the paging cursor stops moving.
totalCount = 0

# One session for the whole run, so the retry adapters are actually used by
# every request and the connection pool is released at the end.
with requests.Session() as session:
    session.mount('http://', HTTPAdapter(max_retries=3))
    session.mount('https://', HTTPAdapter(max_retries=3))

    Data = _fetch_page(session, queryParam)
    length = len(Data)
    while length > 0:
        for item in Data:
            try:
                _print_flash(item)
            except (KeyError, TypeError, ValueError, AttributeError) as e:
                # Best effort: report the malformed record and move on.
                print(e)

        totalCount += length

        # Move the cursor to the last record of this page.
        previous_max_time = queryParam['max_time']
        queryParam['max_time'] = Data[-1]['time']
        print('next queryParam is', queryParam['max_time'])
        if queryParam['max_time'] == previous_max_time:
            # The cursor did not move, so the next request would be identical
            # to the one just made; stop instead of looping on the same page.
            break

        # Fetch the next page. On failure stop the crawl: keeping the old page
        # would print it again and count its records a second time.
        try:
            Data = _fetch_page(session, queryParam)
            length = len(Data)
        except (requests.RequestException, ValueError, KeyError, TypeError) as e:
            print(e)
            break


print('all ok,totalCount is:', totalCount)