import requests
from bs4 import BeautifulSoup as bs
import datetime
import json
import re
import multiprocessing as mp
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"}
Method 1: the response is JSON, so parse it directly.
while True:
    try:
        # SSE real-time quote API: code and previous close for the first 1500 equities
        r = requests.get("http://yunhq.sse.com.cn:32041/v1/sh1/list/exchange/equity?select=code%2Cprev_close&order=&begin=0&end=1500", headers=headers).text
        text = json.loads(r)
        cnt = text["list"]
        break
    except Exception:
        continue
The loop simply retries, which smooths over the occasional failed request.
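An unbounded while True can hang forever if the endpoint stays down, so the same idea can be wrapped in a small helper with a retry cap. This is only a sketch; fetch_json, the retry count, and the timeout are illustrative additions, not part of the original script:

def fetch_json(url, retries=3, timeout=10):
    # Try a few times, then give up instead of looping forever.
    for _ in range(retries):
        try:
            r = requests.get(url, headers=headers, timeout=timeout)
            return json.loads(r.text)
        except (requests.RequestException, ValueError):
            continue
    return None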
Method 2: the response is HTML, so parse it with BeautifulSoup.
r = requests.get(url, headers=headers).content   # url points at the quote page to scrape
content = bs(r, "html.parser", from_encoding='utf-8')
text = content.find("table", attrs={"class": "quote-info"})
tds = text.find_all("td")
p1 = str(tds[6].find("span", class_="stock-fall").text)   # price field rendered with the "fall" style
p2 = str(tds[2].find("span", class_="stock-rise").text)   # price field rendered with the "rise" style
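The indices 6 and 2 and the class names come from that page's layout; if the rise/fall class varies with the price direction (as the names suggest), a lookup that accepts either class is more robust. The helper below is a sketch of that idea, not part of the original code:

def cell_price(td):
    # Match the price span whether the page renders it as rising or falling.
    span = td.find("span", class_=re.compile(r"stock-(rise|fall)"))
    return span.text if span else ""

p1 = cell_price(tds[6])
p2 = cell_price(tds[2])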
You can also scrape in parallel with multiprocessing:
def get_close(code):
    # SZSE intraday API: latest close price for one stock code
    r = requests.get("http://www.szse.cn/api/market/ssjjhq/getTimeData?marketId=1&code=%s" % code, headers=headers).text
    text = json.loads(r)
    px = str(text["data"]["close"])
    return code + "," + px + "\n"
syms = get_list()                        # get_list() returns the list of stock codes (defined elsewhere)
res = []
nProcess = 2 * mp.cpu_count() // 3       # Pool() needs an integer worker count
if nProcess > 1:
    pool = mp.Pool(nProcess)
    res = pool.map(get_close, syms)
    pool.close()
    pool.join()
else:
    res = list(map(get_close, syms))     # single-process fallback; list() forces evaluation in Python 3
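One caveat with multiprocessing: on Windows (or any spawn-based start method) the pool must be created under an if __name__ == "__main__": guard, otherwise the child processes re-import the module and re-run the top-level code. A minimal sketch of that structure, with an illustrative output file name:

def main():
    syms = get_list()
    n_process = max(1, 2 * mp.cpu_count() // 3)
    with mp.Pool(n_process) as pool:
        res = pool.map(get_close, syms)
    with open("close_prices.csv", "w") as f:   # illustrative output path
        f.writelines(res)

if __name__ == "__main__":
    main()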