Scrape the first three pages of Xueqiu (Snowball) and store the data in a database

import requests
import json
import pymysql

# A small wrapper class for the MySQL connection
class mysql_conn():
    def __init__(self):
        # Use keyword arguments (newer pymysql versions require them);
        # utf8mb4 keeps Chinese titles and descriptions from being garbled
        self.db = pymysql.connect(host='127.0.0.1', user='root', password='******',
                                  database='wang', charset='utf8mb4')
        self.cursor = self.db.cursor()
    def execute_modify_mysql(self, sql, args=None):
        # Let the driver substitute and escape the values itself
        self.cursor.execute(sql, args)
        self.db.commit()
    def __del__(self):
        self.db.close()

# Fetch one page of the timeline and return its list of entries
def xueqiu(url):
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'

    headers = {
        'Cookie': 'aliyungf_tc=AQAAAHEe4kB1aggAUhVFeSTHOQA3F9Tr; xq_a_token=584d0cf8d5a5a9809761f2244d8d272bac729ed4; xq_a_token.sig=x0gT9jm6qnwd-ddLu66T3A8KiVA; xq_r_token=98f278457fc4e1e5eb0846e36a7296e642b8138a; xq_r_token.sig=2Uxv_DgYTcCjz7qx4j570JpNHIs; u=591534314233508; device_id=6cc7e6153f6fc5c2ee23a704fa3cfc88; _ga=GA1.2.1900937437.1534314236; _gid=GA1.2.501674476.1534314236; Hm_lvt_1db88642e346389874251b5a1eded6e3=1534314242,1534314439,1534314451; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1534314451',
        'User-Agent': user_agent
    }
    response = requests.get(url, headers=headers)
    data = json.loads(response.text)
    # Each entry's 'data' field is itself a JSON string; the caller parses it
    return data['list']

if __name__ == '__main__':
    sq = mysql_conn()
    # Page URLs: only the first three pages here. To crawl more pages, take the max_id
    # from the previous page and splice it into the next URL (see the sketch after the script).
    url_list = ['https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=-1&count=10&category=111',
                'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=184255&count=15&category=111',
                'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=184006&count=15&category=111']
    for url in url_list:
        res_list = xueqiu(url)
        for line_data in res_list:
            # 'data' is a JSON string; parse it once and pull out the fields we store
            item = json.loads(line_data['data'])
            xid = item['id']
            title = item['title']
            description = item['description']
            target = item['target']
            # Parameterized query so quotes inside titles/descriptions cannot break the SQL
            sql = "insert into xueqiu(xid,title,description,target) values(%s,%s,%s,%s)"
            # print(sql)
            sq.execute_modify_mysql(sql, (xid, title, description, target))
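
The INSERT above assumes a `xueqiu` table already exists. A minimal sketch of creating it through the same wrapper class; the column types here are an assumption, adjust them to your own schema:

```python
# Hypothetical DDL: column names follow the INSERT statement above,
# but the types are assumptions, not taken from the original post.
create_sql = """
CREATE TABLE IF NOT EXISTS xueqiu (
    xid         BIGINT PRIMARY KEY,
    title       VARCHAR(255),
    description TEXT,
    target      VARCHAR(255)
)
"""
sq = mysql_conn()
sq.execute_modify_mysql(create_sql)
```

As noted in the comment above the URL list, further pages can be fetched by feeding the previous page's max_id into the next request. A minimal sketch, assuming (as that comment suggests) that the next max_id is the smallest article id returned by the previous page:

```python
# Sketch only. Assumption: max_id for the next request is the smallest id
# seen on the previous page; verify against the actual API behaviour.
def next_url(prev_list, count=15, category=111):
    ids = [json.loads(item['data'])['id'] for item in prev_list]
    return ('https://xueqiu.com/v4/statuses/public_timeline_by_category.json'
            '?since_id=-1&max_id={}&count={}&category={}').format(min(ids), count, category)
```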