import csv
import requests
import json
import time
from multiprocessing import Pool
'''
Note: on a 4-core CPU, running this with multiprocessing is roughly 5x faster.
'''
def getHtml(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Returns None when the request fails (connection error, timeout, or
    non-2xx status); the error is printed rather than raised, so callers
    must handle a None result.
    """
    try:
        # Timeout keeps a worker process from hanging forever on a
        # stalled connection (the original request had no timeout).
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        resp.encoding = "utf-8"
        return resp.text
    except requests.RequestException as ex:
        # Narrowed from bare `Exception`: only network/HTTP errors are
        # expected here; anything else should surface as a real bug.
        print(ex)
        return None
def parseHtml(html):
    """Strip the JSONP wrapper from *html* and return the stock records.

    The eastmoney endpoint returns ``jQueryXXX(<json>);`` — the JSON
    payload is everything between the first ``(`` and the last ``)``.
    Using ``rindex(')')`` instead of the original ``[:-2]`` also handles
    responses without the trailing semicolon or with trailing whitespace.

    Returns the list stored at ``data.diff`` in the payload.
    Raises ValueError if no parentheses are present, KeyError/TypeError
    if the payload lacks the expected structure.
    """
    start = html.index("(") + 1
    end = html.rindex(")")
    payload = json.loads(html[start:end])
    return payload["data"]["diff"]
def get_and_parse(page):
    """Download and parse one 20-row page of the stock listing.

    *page* is zero-based; the API's ``pn`` query parameter is one-based,
    hence ``page + 1``.
    """
    # Adjacent string literals keep the very long query string readable;
    # the assembled URL is identical to the original concatenation.
    url = (
        "http://46.push2.eastmoney.com/api/qt/clist/get"
        "?cb=jQuery112409929794146169659_1588917956370"
        f"&pn={page + 1}"
        "&pz=20&po=0&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281"
        "&fltt=2&invt=2&fid=f12"
        "&fs=m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2,m:1+t:23"
        "&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,"
        "f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152"
        "&_=1588917956427"
    )
    return parseHtml(getHtml(url))
def outputHtml(json_content):  # Pool callback — runs in the parent process
    """Append the scraped row dicts in *json_content* to the CSV file.

    Fixes over the original:
    - guards against an empty/None result (the original crashed with
      IndexError on ``json_content[0]`` before reaching the ``try``);
    - drops the ``finally: f.close()`` — the ``with`` block already
      closes the file, and if ``open()`` raised, ``f`` was still None,
      so ``f.close()`` itself crashed with AttributeError.
    """
    if not json_content:
        return  # a failed or empty page — nothing to write
    header = json_content[0].keys()
    try:
        with open("C:\Dsoftdisk\python\scrap_example\\try.csv", "a", newline="") as f:
            csv.DictWriter(f, header).writerows(json_content)
    except Exception as ex:
        # Best-effort: one bad page must not kill the whole scrape run.
        print(ex)
'''
Open question from the original author: why could the CSV still be opened
and written to even though the file was never explicitly closed?
(Answer: the `with` statement closes it automatically on block exit.)
'''
if __name__ == '__main__':
    # Timestamp the run so throughput can be eyeballed from the console.
    print(time.asctime(time.localtime(time.time())))
    pool = Pool(4)  # four worker processes, matching a 4-core CPU
    for page in range(100):
        # Each finished page is handed to outputHtml in the parent
        # process as its result arrives.
        pool.apply_async(get_and_parse, args=(page,), callback=outputHtml)
    pool.close()
    pool.join()
    print("over", time.asctime(time.localtime(time.time())))
# Scrapes stock quotes from the ajax endpoint with multiple processes and
# appends them to a CSV file — roughly 4,000 rows in about 10 seconds.