Download daily tick-by-tick trade data into MongoDB

The Python 2 script below reads the stock list from a stock_list MongoDB database, downloads each trading day's tick-by-tick trade records from Sina's export endpoint (market.finance.sina.com.cn/downxls.php), and upserts the rows into the detail collection of a day_detail database.

#! /usr/bin/python2
# coding=utf-8
import urllib2
import pymongo
import mechanize

from datetime import datetime
from datetime import timedelta

stocks = None
detail = None
conn_list = None
conn_detail = None
list_code = []


def init_mongodb_list():
    # Connect to the stock_list database and expose its stocks collection.
    global stocks
    global conn_list
    conn_list = pymongo.MongoClient("localhost", 27017)
    conn_list.stock_list.authenticate("d", "zz")
    db = conn_list.stock_list
    stocks = db.stocks


def init_mongodb_day_detail():
    # Connect to the day_detail database and expose its detail collection.
    global detail
    global conn_detail
    conn_detail = pymongo.MongoClient("localhost", 27017)
    conn_detail.day_detail.authenticate("d", "zz")
    db = conn_detail.day_detail
    detail = db.detail


def get_newst_date(code):
    # Return the latest (date, time) already stored for this code, or
    # ("", "00:00:00") if nothing is stored yet. date is "YYYY-MM-DD" and time
    # is "HH:MM:SS", so lexicographic order matches chronological order.
    date = ""
    time1 = "00:00:00"
    l = detail.find({"code": code}).sort([('date', -1), ('time', -1)]).limit(1)
    for i in l:
        code = i.get("code")
        date = i.get("date")
        time1 = i.get("time")
        print code, date, time1

    return date, time1


# sina

def update_ri(code, date, time, price, price_fluncuation, volume, turnover, nature):
    # Upsert one tick keyed on (code, date, time); $setOnInsert leaves an
    # already-stored tick unchanged.
    print "sql", code, date, time, price, price_fluncuation, volume, turnover, nature
    detail.update({"code": code, "date": date, "time": time},
                  {"$setOnInsert": {"price": price, "price_fluncuation": price_fluncuation,
                                    "volume": volume, "turnover": turnover,
                                    "nature": nature}},
                  upsert=True)






def get_stock_list():
    # Load every stock's code, name and listing date from the stocks
    # collection into list_code.
    l = stocks.find()
    for i in l:
        code = i.get("code")
        name = i.get("name")
        date_start = i.get("date_start")
        list_code.append({"code": code, "name": name, "date_start": date_start})


def day_plus(str):
    # Parse a "YYYY-MM-DD" string and advance it by one day (currently unused).
    day = datetime.strptime(str, "%Y-%m-%d")
    day_diff = timedelta(days=1)
    return day + day_diff


def day_plus1(day):
    # Advance a datetime by one calendar day.
    day_diff = timedelta(days=1)
    return day + day_diff


def day_str_change(str):
    # Convert "YYYY-MM-DD" to "YYYYMMDD" (currently unused).
    day = datetime.strptime(str, "%Y-%m-%d")
    return day.strftime('%Y%m%d')



def stock_header(code):
    url = '&symbol='
    t1 = ('60', '900')
    t2 = ('000', '002', '300', '200')
    t3 = ('399001', '399006')
    if code.startswith(t1):
        str = 'sh' + code
    #elif code.startswith('000001'):
        #str = '0' + code
    #elif code.startswith(t3):
        #str = '1' + code
    elif code.startswith(t2):
        str = 'sz' + code
    else:
        str = code
        print code
    url = url + str
    return url
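

# Example mappings produced by stock_header (derived from the prefix tuples above):
#   stock_header("600000") -> "&symbol=sh600000"   (Shanghai: codes starting with 60 / 900)
#   stock_header("300127") -> "&symbol=sz300127"   (Shenzhen: codes starting with 000 / 002 / 300 / 200)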





def deal_url(str_day, url):
    # Build the full Sina tick-download URL for one trading day.
    if ("" == str_day):
        print url
        return url
    # Sina tick columns: trade time, price, price change, volume, turnover, nature (buy/sell)
    str_url = "http://market.finance.sina.com.cn/downxls.php?date=" + str_day + url

    return str_url
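

# Example of a resulting download URL (same form as the sample noted further below):
#   http://market.finance.sina.com.cn/downxls.php?date=2016-10-28&symbol=sz300127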




def file_to_sql(code, date, content):
    # Parse one day's tab-separated tick file from Sina and upsert every row.
    rows = content.split('\n')
    rows.sort()
    i = 0
    cnt = len(rows) - 1
    for row in rows:
        # skip the first and last rows (header / trailing blank line)
        if ((i == 0) or (i == cnt)):
            i += 1
            print "d", i
            continue

        split_row = row.split("\t")
        try:
            # Sina columns: trade time, price, price change, volume, turnover, nature (buy/sell)
            full_data = []
            for row_s in split_row:
                str = row_s.strip().replace('--', '0')
                full_data.append(str)
            if (6 != len(full_data)):
                continue
            str_nature = full_data[5].decode('gb18030')
            update_ri(code, date, full_data[0], float(full_data[1]), float(full_data[2]),
                      float(full_data[3]), float(full_data[4]), str_nature.encode("utf8"))
            # 163 (NetEase) daily-data variant, kept disabled for reference:
            '''
            full_data = []
            for row_s in split_row:
                str = row_s.replace("\r", '').replace('None', '0')
                if ('' == str):
                    str = '0'
                full_data.append(str)

            if (16 != len(full_data)):
                break
            update_ri(code, full_data[0], float(full_data[3]), float(full_data[4]), float(full_data[5]),
                      float(full_data[6]),
                      float(full_data[7]), float(full_data[8]), float(full_data[9]), float(full_data[10]),
                      float(full_data[11]),
                      float(full_data[12]), float(full_data[13]), float(full_data[14]), float(full_data[15]))
            '''

        except ValueError:
            print '\033[1;31;40m'
            print split_row
            print "--------------------ValueError----------------------------------------------"
            print '\033[0m'
            continue

        i += 1
'''
def get_day(code, url):
    print code, url
    if ("" == url):
        print "---newst---date---------------------------------"
        return
    # url = 'http://quotes.money.163.com/service/chddata.html?code=1000002'
    # url = 'http://quotes.money.163.com/service/chddata.html?code=0601398&start=20000720&end=20150508'
    # url = 'http://table.finance.yahoo.com/table.csv?s=000002.sz'
    # url = 'http://table.finance.yahoo.com/table.csv?s=000002.sz&d=6&e=22&f=2006&g=d&a=11&b=16&c=1991&ignore=.csv'
    # url = 'http://hq.sinajs.cn/?list=sh600127'
    # http://market.finance.sina.com.cn/downxls.php?date=2016-10-28&symbol=sz300127
    # print url
    req_header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Accept': 'text/html;q=0.9,*/*;q=0.8', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'gzip', 'Connection': 'close', 'Referer': None  # note: if fetching still fails, set the target site's Host header here
        }
    req_timeout = 500
    req = urllib2.Request(url,None,req_header)
    #req = urllib2.Request(url)
    # print req
    # If no proxy is needed, the set_proxy call below can be skipped. It is here
    # because the corporate network needs a proxy to reach the Internet.
    # req.set_proxy('proxy.XXX.com:911', 'http')
    # socket = urllib2.urlopen(req,None,req_timeout)
    import pandas as pd
    try:
        socket = urllib2.urlopen(req, None, req_timeout)
        # print socket
        content = socket.read()
        # content = socket.read().decode('GB18030')
        socket.close()


    except urllib2.HTTPError, e:
        print '\033[1;31;40m'
        print 'The server couldn\'t fulfill the request.'
        print 'Error code: ', e.code
        print 'Error reason: ', e.reason
        print '\033[0m'
    except urllib2.URLError, e:
        print '\033[1;31;40m'
        print 'We failed to reach a server.'
        print 'Reason: ', e.reason
        print '\033[0m'
    else:
        # everything is fine
        file_to_sql(code, content)
        #print type(content)
        #read_excel(content)
'''





class NoHistory(object):
    # Dummy history object so mechanize's Browser does not keep every response in memory.
    def add(self, *a, **k): pass
    def clear(self): pass

def browser(url):
    print url
    if ("" == url):
        print "---newst---date---------------------------------"
        return ""
    br = mechanize.Browser(history=NoHistory())
    #options
    br.set_handle_equiv(True)
    #br.set_handle_gzip(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)

    # follow a refresh of 0 seconds, but do not hang on refresh > 0
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

    br.set_debug_http(True)
    br.set_debug_redirects(True)
    br.set_debug_responses(True)

    # spoof the User-Agent so the request looks like a normal browser
    br.addheaders = [('User-agent', 'Mozilla/6.0 (X11; U; Linux i686; en-US; rv:1.9.0.2) Gecko/2008071616 Fedora/3.0.1-2.fc9 Firefox/3.0.2')]

    # The settings above are initialization options; it is best to set them.
    try:

        # fetch the tick-download URL
        r = br.open(url)
        # leftover Baidu form-submission example, kept disabled below:
        '''
        for f in br.forms():
            print f

        br.select_form(nr = 0)

        # search for the keyword "火车" (train)
        br.form['wd'] = "火车"
        br.submit()
        # read back the search results
        brr = br.response().read()
        # HTML source showing the search results
        print brr
        '''
        content = r.read().strip()
        #print content
        '''
        rows = content.split('\n')
        rows.sort()
        print rows
        print "---------------"
        print len(rows)
        i = 0
        cnt = len(rows) - 1
        for row in rows:
            print row
        print cnt
        '''
    except Exception:
        print '\033[1;31;40m'
        print "open err ------------------------------------"
        print '\033[0m'
        return ""
    return content
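

# Optional hardening (an assumption, not something the script currently does):
# Sina may throttle rapid repeated downloads, so a short time.sleep() pause
# between requests inside the get_day_list loop below can help avoid failed fetches.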
def get_day_list(code, date_start):
    # Download tick data for one stock, day by day, from the last stored date
    # (or its listing date) up to today, skipping weekends.
    str_day, time_day = get_newst_date(code)
    print str_day
    url = stock_header(code)
    # print url
    now = datetime.now()
    str_now = now.strftime("%Y-%m-%d")
    #time_now = now.strftime("%H:%M:%S")

    print "start date ", date_start, len(date_start)
    if ("" == str_day):
        day = datetime.strptime(date_start, "%Y-%m-%d")
    else:
        day = datetime.strptime(str_day, "%Y-%m-%d")
    # If the last stored tick is at or after the 15:00 market close, that day is
    # complete, so resume from the next day.
    if ("15:00:00" <= time_day):
        day = day_plus1(day)
        print str_day, time_day, date_start

    while (day.strftime("%Y-%m-%d") <= str_now):
        week = day.weekday()
        # skip Saturdays (5) and Sundays (6)
        if ((5 == week) or (6 == week)):
            day = day_plus1(day)
            continue
        str_day = day.strftime("%Y-%m-%d")
        url_all = deal_url(str_day, url)
        # print url
        #get_day(code, url_all)
        content = browser(url_all)
        # print content
        if ("" != content):
            file_to_sql(code, str_day, content)
        day = day_plus1(day)


if __name__ == '__main__':
    init_mongodb_list()
    print stocks.count()
    get_stock_list()
    conn_list.close()
    # print list_code
    init_mongodb_day_detail()
    # get_code_k_ri()
    print len(list_code)
    #for i in list_code:
    #    print i["code"],i["date_start"]

    for code in list_code:
        get_day_list(code["code"], code["date_start"])


    conn_detail.close()
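
A follow-up worth considering: get_newst_date sorts on (date, time) per code and update_ri upserts on the (code, date, time) key, so a compound index on the detail collection keeps both operations fast as the data grows. A minimal sketch, assuming the same localhost connection and the d/zz credentials used above:

#! /usr/bin/python2
# coding=utf-8
import pymongo

conn = pymongo.MongoClient("localhost", 27017)
conn.day_detail.authenticate("d", "zz")
# compound index matching the query/sort in get_newst_date and the upsert filter in update_ri
conn.day_detail.detail.create_index(
    [("code", pymongo.ASCENDING), ("date", pymongo.DESCENDING), ("time", pymongo.DESCENDING)])
conn.close()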
