python 获取 中国证券网 的公告

原文地址: http://www.30daydo.com/article/59


中国证券网:  http://ggjd.cnstock.com/
这个网站的公告会比同花顺东方财富的早一点,而且还出现过早上中国证券网已经发了公告,而东财却拿去做午间公告,以至于可以提前获取公告提前埋伏。
 
生成的公告保存在stock/文件夹下,以日期命名。 下面脚本是循环检测,如果有新的公告就会继续生成。
 
默认保存前3页的公告(一次抓取太多页会被网站暂时屏蔽几分钟)。 代码已经使用了切换header的方式来躲避网站的封杀。
 
修改
getInfo(3) 里面的数字就可以抓取前面某页数据



__author__ = 'rocchen'
# working v1.0
from bs4 import BeautifulSoup
import urllib2, datetime, time, codecs, cookielib, random, threading
import os,sys


def _fetch_page(url, user_agent, timeout=30, pause=2):
    """Download one listing page and return its raw body.

    Returns an empty string when the request fails, so the caller can keep
    going with the remaining pages. After a successful download the function
    sleeps for ``pause`` seconds before returning, to space out requests.
    """
    headers = {
        'User-Agent': user_agent,
        'Host': "ggjd.cnstock.com",
        'DNT': '1',
        'Accept': 'text/html, application/xhtml+xml, */*',
    }
    req = urllib2.Request(url=url, headers=headers)
    try:
        resp = urllib2.urlopen(req, timeout=timeout)
    except urllib2.HTTPError as e:
        # HTTPError is a subclass of URLError, so it has to be handled first;
        # report the status instead of silently dropping the failure.
        print("error code %d" % e.code)
        return ""
    except urllib2.URLError as e:
        print("error reason %s " % getattr(e, 'reason', e))
        return ""

    try:
        raw_content = resp.read()
    finally:
        # Release the connection even if reading the body fails.
        resp.close()
    time.sleep(pause)
    return raw_content


def getInfo(max_index_user=5):
    """Scrape announcement listings and save them to a timestamped log file.

    Requests pages ``0 .. max_index_user - 1`` under the cnstock "ggkx"
    listing URL, each with a randomly chosen User-Agent header. For every
    ``<span class="time">`` element found, the next sibling tag supplies the
    ``title`` and ``href`` that are written, together with a running number
    and the span text, to ``StockNews-[YYYY-mm-dd]-[HH-MM].log`` (UTF-8) in
    the current working directory.

    Pages that fail to download contribute no entries. Spans that are not
    followed by a tag carrying both ``title`` and ``href`` are skipped.

    :param max_index_user: number of listing pages to fetch.
    """
    stock_news_site = "http://ggjd.cnstock.com/gglist/search/ggkx/"
    my_userAgent = [
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)']

    temp_time = time.strftime("[%Y-%m-%d]-[%H-%M]", time.localtime())
    store_filename = "StockNews-%s.log" % temp_time

    num = 1  # running entry number across all pages
    # The context manager guarantees the log file is closed even when
    # parsing or writing raises part-way through.
    with codecs.open(store_filename, 'w', 'utf-8') as fOpen:
        for index in range(max_index_user):
            # A fresh User-Agent per page makes consecutive requests look
            # less uniform to the server.
            user_agent = random.choice(my_userAgent)
            raw_content = _fetch_page(stock_news_site + str(index), user_agent)

            soup = BeautifulSoup(raw_content, "html.parser")
            for time_span in soup.find_all("span", "time"):
                # .next_sibling may be a whitespace text node (or None);
                # find_next_sibling(True) skips text and returns the next tag.
                node = time_span.find_next_sibling(True)
                if node is None:
                    continue
                title = node.get('title')
                href = node.get('href')
                if title is None or href is None:
                    continue
                fOpen.write("No.%s \n%s\t%s\n---> %s \n\n"
                            % (num, time_span.string, title, href))
                num += 1


def execute_task(n=60):
    """Poll the announcement site forever, once every ``n`` minutes.

    Each cycle prints the current timestamp, downloads the first three
    listing pages via ``getInfo(3)``, then sleeps until the next cycle.
    ``n`` may be an int or a numeric string; it is converted with ``int``.
    This function never returns on its own.
    """
    pause_seconds = 60 * int(n)
    while True:
        now = datetime.datetime.now()
        print(now)
        getInfo(3)
        time.sleep(pause_seconds)
        


if __name__ == "__main__":
    # All log files are written into ./stock (created on first run); the
    # process switches into it because getInfo writes to the working directory.
    sub_folder = os.path.join(os.getcwd(), "stock")
    if not os.path.exists(sub_folder):
        os.mkdir(sub_folder)
    os.chdir(sub_folder)

    start_time = time.time()
    # Polling period in minutes: first command-line argument, else prompt.
    if len(sys.argv) < 2:
        n = raw_input("Input Period : ? mins to download every cycle")
    else:
        n = int(sys.argv[1])

    # execute_task loops forever; Ctrl-C is the normal way to stop it, so
    # swallow the interrupt to let the run-time summary below be printed.
    try:
        execute_task(n)
    except KeyboardInterrupt:
        pass
    end_time = time.time()
    print("Total time: %s s." % str(round((end_time - start_time), 4)))



求star
  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值