# Scrape n pages of data from Xueqiu (xueqiu.com).
# Uses a MySQL database to store the results.
import requests
import json
import pymysql
class mysql_conn(object):
    """Thin wrapper around one PyMySQL connection and a cursor opened on it.

    The connection is opened when the object is created and released by
    ``close()``, which the finalizer also calls.
    """

    def __init__(self):
        """Connect to the ``py666`` database on 127.0.0.1:3306 and open a cursor."""
        # Pre-set both attributes so close()/__del__ stay safe if connect() raises.
        self.db = None
        self.cursor = None
        self.db = pymysql.connect(host='127.0.0.1', user='root', password='123456', port=3306, database='py666')
        self.cursor = self.db.cursor()

    def execute_modify_mysql(self, sql, args=None):
        """Run one data-modifying statement and commit it.

        Args:
            sql: Statement text. May contain ``%s`` placeholders when ``args``
                is supplied.
            args: Optional parameters handed to ``cursor.execute`` so the
                driver binds them. Defaults to ``None`` (no parameters), which
                is what callers passing only ``sql`` get.

        Raises:
            Exception: Whatever the cursor or connection raised; the
                transaction is rolled back before the error is re-raised.
        """
        try:
            self.cursor.execute(sql, args)
            self.db.commit()
        except Exception:
            # Do not leave a half-applied transaction open on the connection.
            self.db.rollback()
            raise

    def close(self):
        """Close the cursor and the connection; calling it again is a no-op."""
        # getattr: the attributes do not exist on an instance whose __init__
        # never ran.
        cursor = getattr(self, 'cursor', None)
        db = getattr(self, 'db', None)
        # Drop the references first so a second call finds nothing to close.
        self.cursor = None
        self.db = None
        try:
            if cursor is not None:
                cursor.close()
        finally:
            # Release the connection even when closing the cursor failed.
            if db is not None:
                db.close()

    def __del__(self):
        """Release the cursor and connection when the object is finalized."""
        self.close()
def godata(total):
    """Build the row values to insert from one decoded post.

    Args:
        total: Mapping that has at least the keys ``id``, ``title``,
            ``description`` and ``target``.

    Returns:
        dict: ``id`` copied unchanged, plus ``title``, ``description`` and
        ``target`` as SQL-escaped strings (without surrounding quotes). A
        ``None`` field becomes the empty string.

    Raises:
        KeyError: If one of the four keys is missing from ``total``.
    """
    # Imported here rather than via the top-level ``pymysql.escape_string``
    # alias, which PyMySQL 1.0 removed; the converters module provides the
    # function in both old and new releases.
    from pymysql.converters import escape_string

    data = {'id': total['id']}
    for key in ('title', 'description', 'target'):
        value = total[key]
        # escape_string() only accepts strings, so map a missing value to ''.
        data[key] = '' if value is None else escape_string(value)
    return data
def listgomysql(res_list):
    """Insert every entry of ``res_list`` into the ``biao`` table.

    Args:
        res_list: Sequence of mappings. Each one's ``'data'`` value is a JSON
            string that decodes to an object with the keys ``id``, ``title``,
            ``description`` and ``target``.

    Raises:
        KeyError: If an entry or its decoded payload lacks a required key.
        ValueError: If a ``'data'`` value is not valid JSON.
    """
    if not res_list:
        # Nothing to insert, so do not open a database connection at all.
        return

    # Placeholders are bound by the driver, which quotes and escapes every
    # value. godata() is deliberately not used here: it returns pre-escaped
    # strings, and binding those would escape them a second time.
    sql = "insert into biao(uid,title,description,target) values (%s,%s,%s,%s)"

    # One connection for the whole batch instead of a new one per row.
    mc = mysql_conn()
    for item in res_list:
        total = json.loads(item['data'])
        params = (total['id'], total['title'], total['description'], total['target'])
        # mogrify() returns the final statement with the parameters bound, so
        # it can go through the wrapper's single-argument execute-and-commit.
        mc.execute_modify_mysql(mc.cursor.mogrify(sql, params))
def add(n, timeout=10):
    """Fetch ``n`` pages of the Xueqiu public timeline and store each page.

    The first request uses ``max_id=-1``; every later request uses the
    ``next_max_id`` value returned by the previous response.

    Args:
        n: Number of pages to fetch.
        timeout: Seconds to wait for each HTTP request before giving up.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an
            HTTP error status.
        KeyError: If a response lacks ``list`` or ``next_max_id``.
    """
    # The URL and headers are the same for every page, so build them once.
    url = 'https://xueqiu.com/v4/statuses/public_timeline_by_category.json'
    # NOTE(review): the cookie carries hard-coded session tokens; presumably
    # they expire and need refreshing - confirm before relying on this.
    headers = {
        'Cookie': 'aliyungf_tc=AQAAAPxRByb/ngsAuACIdd7Bn0Lxsxdh; xq_a_token=584d0cf8d5a5a9809761f2244d8d272bac729ed4; xq_a_token.sig=x0gT9jm6qnwd-ddLu66T3A8KiVA; xq_r_token=98f278457fc4e1e5eb0846e36a7296e642b8138a; xq_r_token.sig=2Uxv_DgYTcCjz7qx4j570JpNHIs; u=141534315699746; device_id=fb51fe05bac8b87de1e093e72f85bd6b; Hm_lvt_1db88642e346389874251b5a1eded6e3=1534315700,1534315713; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1534315713',
        'Referer': 'https://xueqiu.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }

    next_max_id = '-1'
    for _ in range(n):
        params = {
            'since_id': '-1',
            'max_id': next_max_id,
            'count': '10',
            'category': '-1',
        }
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, params=params, headers=headers, timeout=timeout)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        res_dict = response.json()

        # Store this page's entries in MySQL.
        listgomysql(res_dict['list'])

        # Cursor for the next page; kept as a string for the query parameter.
        next_max_id = str(res_dict['next_max_id'])
# Script entry point: fetch and store 10 pages. This is an unguarded
# module-level call, so it also runs when the file is imported.
add(10)