1. Key steps
Locate the target data: it is not in the page HTML but in a JS (JSONP) response.
Use the browser developer tools (Network panel) to inspect the content of that JS response.
Read the request URL and the Referer of the dynamic request from its headers; a minimal sketch of replaying it follows this list.
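The sketch below replays the captured request with requests and prints the start of the JSONP body. The URL parameters and the Referer value come from the captured request; the variable names and the shortened User-Agent are illustrative only.

import requests

# Endpoint captured from the Network panel; pageHelp.beginPage selects the page.
url = ("http://query.sse.com.cn/security/stock/getStockListData2.do"
       "?&jsonCallBack=jsonpCallback15988&isPagination=true&stockCode="
       "&csrcCode=&areaName=&stockType=1&pageHelp.cacheSize=1"
       "&pageHelp.beginPage=1&pageHelp.pageSize=25&pageHelp.pageNo=1"
       "&_=1624950555991")
headers = {
    # Referer copied from the captured request; the endpoint may reject requests without it.
    "Referer": "http://www.sse.com.cn/",
    "User-Agent": "Mozilla/5.0",
}
resp = requests.get(url, headers=headers)
# The body is JSONP: jsonpCallback15988({...}) wrapping the JSON payload.
print(resp.text[:300])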
2. Complete code
import requests as req
import re
import json


class SSESpider(object):
    """Crawl the list of stocks from the Shanghai Stock Exchange (SSE)."""

    def crawl(self, start):
        """Fetch one page; the data we need comes back in a JSONP (JS) response."""
        url = ("http://query.sse.com.cn/security/stock/getStockListData2.do"
               "?&jsonCallBack=jsonpCallback15988&isPagination=true&stockCode="
               "&csrcCode=&areaName=&stockType=1&pageHelp.cacheSize=1"
               "&pageHelp.beginPage={}&pageHelp.pageSize=25&pageHelp.pageNo=1"
               "&_=1624950555991").format(start)
        headers = {
            "Referer": "http://www.sse.com.cn/",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        resp = req.get(url, headers=headers)
        return resp.content.decode("utf-8")

    def parse(self, content):
        """Strip the JSONP wrapper and return the list of stock records."""
        # A JSON viewer helps here: the records sit under the "result" key.
        result = re.match(r'.*?\((.*)\)', content, re.S).group(1)
        dict_all = json.loads(result)
        return dict_all["result"]

    def savejson(self, dict_data):
        """Write the collected data to a .json file."""
        if len(dict_data) > 0:
            with open("stocks.json", "w", encoding="utf-8") as fd:
                json.dump(dict_data, fd, indent=2, ensure_ascii=False)

    def start(self):
        """Run the spider: fetch page after page until an empty page is returned."""
        dict_all = {"data": []}
        for i in range(1, 1000):
            crawl_rst = self.crawl(i)
            parse_rst = self.parse(crawl_rst)
            if len(parse_rst) == 0:
                break
            dict_all["data"].extend(parse_rst)
        self.savejson(dict_all)


if __name__ == "__main__":
    spider = SSESpider()
    spider.start()
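A possible follow-up, not part of the script above: export the collected records to CSV. This is a minimal sketch; the field names COMPANY_CODE, COMPANY_ABBR and LIST_DATE are assumptions about the SSE payload and should be checked against the actual contents of stocks.json.

import csv
import json

# Field names are assumptions about the SSE records; verify them in stocks.json.
FIELDS = ["COMPANY_CODE", "COMPANY_ABBR", "LIST_DATE"]

with open("stocks.json", encoding="utf-8") as fd:
    records = json.load(fd)["data"]

with open("stocks.csv", "w", encoding="utf-8", newline="") as fd:
    writer = csv.DictWriter(fd, fieldnames=FIELDS, extrasaction="ignore")
    writer.writeheader()
    writer.writerows(records)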