# 【spider requests】

import re
import sqlite3
import time

import pandas as pd
import requests
from time import sleep
import json
from datetime import datetime

# Switch off urllib3's warnings for the whole process.
# NOTE(review): none of the requests in this file pass verify=False, so this
# is presumably a leftover from an earlier version — confirm it is still needed.
requests.packages.urllib3.disable_warnings()


def get_start(conn, table_name):
    """Return the highest confirmed row index stored in ``table_name``.

    Only rows whose "已确认" column equals '1' are considered.  The "索引"
    values are converted with ``int`` before comparison because the crawler
    writes every column as text.

    Args:
        conn: Open SQLite connection that holds the result table.
        table_name: Name of the result table.

    Returns:
        The largest confirmed index as an ``int``, or ``-1`` when the table
        is missing, has no confirmed rows, or cannot be read.
    """
    # Only the index column is needed; "select *" would also load every
    # stored response body.
    query = f'''select "索引" from "{table_name}" where "已确认"='1' '''
    try:
        confirmed = pd.read_sql_query(query, con=conn)
        return max(int(value) for value in confirmed['索引'])
    except Exception:
        # Deliberate best-effort: any read/parse problem means "start from the
        # beginning".  Unlike a bare ``except`` this lets KeyboardInterrupt and
        # SystemExit propagate.
        return -1


def test_ready():
    """Probe the search endpoint until it returns a real result.

    A fixed search term is requested; the endpoint counts as ready once the
    text ``singer`` appears in the decoded response.  Up to 9999 probes are
    made.  A network error is retried after 2 seconds; a response without
    ``singer`` (or one that is not JSON) prints a timestamp and waits 30
    seconds.

    Returns:
        The zero-based number of the probe that succeeded, i.e. how many
        probes failed first.  ``0`` means the endpoint was ready immediately;
        any non-zero value means at least one probe failed.
    """
    probe_url = base_url + "&k=人间烟火"
    for try_times in range(9999):
        try:
            response = requests.request("GET", probe_url, headers=headers, data={}, timeout=15)
        except requests.RequestException:
            sleep(2)
            print("获取失败,正在重试...")
            # No response object exists for this attempt, so go straight to
            # the next probe instead of reading an unbound or stale one.
            continue
        try:
            song_json = response.json()
        except ValueError:
            # A non-JSON body is treated as "not ready" rather than a crash.
            song_json = None
        if 'singer' in str(song_json):
            break
        print(f'{datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
        sleep(30)
    return try_times


def update_ready(table_name, i):
    """Mark the ``batch`` indexes that precede ``i`` as confirmed.

    Sets "已确认" to '1' for every row of ``table_name`` whose "索引" lies in
    ``range(i - batch, i)`` and commits the change on the module-level
    connection.

    Args:
        table_name: Name of the result table.
        i: Exclusive upper bound of the index range to confirm.
    """
    confirmed_ids = ','.join(str(number) for number in range(i - batch, i))
    statement = f'''update "{table_name}" set "已确认"='1' where  "索引" in ({confirmed_ids} )'''
    cur = conn.cursor()
    cur.execute(statement)
    cur.close()
    conn.commit()


def delete_ready(table_name):
    """Delete every row of ``table_name`` whose "已确认" column is '0'.

    The deletion is committed on the module-level connection.  Any error
    (for example a table that does not exist yet) is printed, not raised.

    Args:
        table_name: Name of the result table.
    """
    statement = f'''delete from "{table_name}"  where "已确认"='0' '''
    try:
        cur = conn.cursor()
        cur.execute(statement)
        cur.close()
        conn.commit()
    except Exception as err:
        print(err)




# Number of source rows between two readiness probes; update_ready() confirms
# this many indexes at a time.
batch = 100

# Prefix of the search URL; the crawler appends "&k=<search term>" to it.
# Left blank in this copy of the script.
base_url = ""

# Headers sent with every request.  Referer, User-Agent and Host are left
# blank in this copy of the script.
headers = {
    'Referer': '',
    'User-Agent': '',
    'Connection': 'keep-alive',
    'Content-Type': 'application/json;charset=UTF-8',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=60',
    'Accept': '*/*',
    'Host': '',
}

# Shared SQLite connection: main() reads the "source" table from it and the
# result table is written through it.
conn = sqlite3.connect("source.db")



def _fetch_song_result(song):
    """Request the search result for ``song``.

    The raw term is tried first.  If the server answers with a non-200
    status, brackets, quotes, backslashes and whitespace are stripped from
    the term and it is tried once more.  Network errors are retried every
    2 seconds without limit.

    Returns:
        ``(term, payload)`` where ``term`` is the search word that produced
        the HTTP 200 answer and ``payload`` is the decoded JSON body, or
        ``None`` when the sanitised term is rejected as well.
    """
    term = song
    sanitized = False
    while True:
        try:
            response = requests.request(
                "GET", base_url + f"&k={term}", headers=headers, data={}, timeout=15
            )
        except requests.RequestException:
            sleep(2)
            print("获取失败,正在重试...")
            continue
        if response.status_code == 200:
            return term, response.json()
        if sanitized:
            return None
        # NOTE(review): the character class lists "()" twice; the second pair
        # was possibly meant to be full-width parentheses — confirm.
        term = re.sub(r'[()()\\"\'\s]', '', term)
        sanitized = True


def _crawl_pass(table_name):
    """Run one crawl pass over the source rows.

    Unconfirmed rows left by an earlier pass are deleted, then crawling
    resumes after the highest confirmed index.  Every ``batch`` rows the
    endpoint is probed with ``test_ready``: a clean probe confirms the
    previous batch, a probe that needed retries aborts the pass.

    Returns:
        ``True`` when every row was processed and confirmed, ``False`` when
        the pass has to be restarted.
    """
    df = pd.read_sql_query("select * from source", con=conn)
    delete_ready(table_name)
    start = get_start(conn, table_name)
    df = df.astype(str)
    last_index = None
    for i, row in df.iterrows():
        last_index = i
        if i <= start:
            continue
        if i != 0 and i % batch == 0:
            if test_ready():
                return False
            update_ready(table_name, i)
        fetched = _fetch_song_result(row['songname'] + row['choric_singer'])
        if fetched is None:
            # Both the raw and the sanitised term were rejected; report the
            # skipped row instead of dropping it silently.
            print(i, row['songname'] + row['choric_singer'], "未获取到结果,已跳过")
            continue
        song, song_json = fetched
        dic = {"索引": [i], "搜索词": [song], "结果": [str(song_json)], "已确认": ['0']}
        print(i, song, str(song_json))
        song_df = pd.DataFrame(dic).astype(str)
        # The very first row recreates the table; later rows are appended.
        song_df.to_sql(
            name=table_name,
            con=conn,
            if_exists="replace" if i == 0 else "append",
            index=False,
        )
    if test_ready():
        return False
    if last_index is not None:
        # update_ready() confirms range(i - batch, i), so pass last_index + 1
        # to include the final row as well.
        update_ready(table_name, last_index + 1)
    return True


def main():
    """Crawl search results for every source row, then terminate the process.

    Each pass starts with a readiness probe.  A pass that detects the
    endpoint was not ready is restarted in a loop (not by recursion, so the
    call stack does not grow with every restart).  Once a pass completes,
    ``SystemExit`` is raised, matching the original ``exit()`` call.
    """
    table_name = '歌单获取2'
    while True:
        test_ready()
        if _crawl_pass(table_name):
            break
    raise SystemExit


# Run the crawler only when this file is executed as a script.
if __name__ == '__main__':
    main()

# NOTE: the text that followed here was blog-page residue (comment box and
# red-packet / payment widget labels), not part of the script.