# -*- coding:utf-8 -*- import requests from bs4 import BeautifulSoup import time import random import pymysql import json import pandas as pd from sqlalchemy import create_engine def randHeader(): ''' 随机生成User-Agent :return: ''' head_connection = ['Keep-Alive', 'close'] head_accept = ['text/html, application/xhtml+xml, */*'] head_accept_language = ['zh-CN,fr-FR;q=0.5', 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3'] head_user_agent = ['Opera/8.0 (Macintosh; PPC Mac OS X; U; en)', 'Opera/9.27 (Windows NT 5.2; U; zh-cn)', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)', 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)', 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)', 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)', 'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3', 'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12', 'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6', 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36', 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11', 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko', 'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0' ] result = { 'Connection': head_connection[0], 'Accept': head_accept[0], 'Accept-Language': head_accept_language[1], 'User-Agent': head_user_agent[random.randrange(0, len(head_user_agent))] } return result def getCurrentTime(): # 获取当前时间 return time.strftime('[%Y-%m-%d %H:%M:%S]', time.localtime(time.time())) def getURL(url, tries_num=5, sleep_time=0, time_out=10,max_retry = 5): ''' 这里重写get函数,主要是为了实现网络中断后自动重连,同时为了兼容各种网站不同的反爬策略及,通过sleep时间和timeout动态调整来测试合适的网络连接参数; 通过isproxy 来控制是否使用代理,以支持一些在内网办公的同学 :param url: :param tries_num: 重试次数 :param sleep_time: 休眠时间 :param time_out: 连接超时参数 :param max_retry: 最大重试次数,仅仅是为了递归使用 :return: response ''' header = randHeader() sleep_time_p = sleep_time time_out_p = time_out tries_num_p = tries_num try: res = requests.Session() if isproxy == 1: res = requests.get(url, headers=header, timeout=time_out, proxies=proxy) else: res = requests.get(url, headers=header, timeout=time_out) res.raise_for_status() # 如果响应状态码不是 200,就主动抛出异常 except requests.RequestException as e: sleep_time_p = sleep_time_p + 10 time_out_p = time_out_p + 10 tries_num_p = tries_num_p - 1 # 设置重试次数,最大timeout 时间和 最长休眠时间 if tries_num_p > 0: time.sleep(sleep_time_p) print (getCurrentTime(), url, 'URL Connection Error: 第', max_retry - tries_num_p, u'次 Retry Connection', e) return getURL(url, tries_num_p, sleep_time_p, time_out_p,max_retry) return res class PyMySQL: # 获取当前时间 def getCurrentTime(self): return time.strftime('[%Y-%m-%d %H:%M:%S]', time.localtime(time.time())) # 数据库初始化 def _init_(self, host, user, passwd, db,port=3306,charset='utf8'): pymysql.install_as_MySQLdb() try: self.db =pymysql.connect(host=host,user=user,passwd=passwd,db=db,port=3306,charset='utf8') #self.db = pymysql.connect(ip, username, pwd, schema,port) self.db.ping(True)#使用mysql ping来检查连接,实现超时自动重新连接 print (self.getCurrentTime(), u"MySQL DB Connect Success:",user+'@'+host+':'+str(port)+'/'+db) self.cur = self.db.cursor() except Exception as e: print (self.getCurrentTime(), u"MySQL DB Connect Error :%d: %s" % (e.args[0], e.args[1])) # 插入数据 def insertData(self, table, my_dict): try: #self.db.set_character_set('utf8') cols = ','.join(my_dict.keys()) values = '","'.join(my_dict.values()) sql = "replace into %s (%s) values (%s)" % (table, cols, '"' + values + '"') #print (sql) try: result = self.cur.execute(sql) insert_id = self.db.insert_id() self.db.commit() # 判断是否执行成功 if result: #print (self.getCurrentTime(), u"Data Insert Sucess") return insert_id else: return 0 except Exception as e: # 发生错误时回滚 self.db.rollback() print (self.getCurrentTime(), u"Data Insert Failed: %s" % (e)) return 0 except Exception as e: print (self.getCurrentTime(), u"MySQLdb Error: %s" % (e)) return 0 class EastMoneyStock(): def getCurrentTime(self): # 获取当前时间 return time.strftime('[%Y-%m-%d %H:%M:%S]', time.localtime(time.time())) def getCurrentDate(self): # 获取当前时间 return time.strftime('%Y-%m-%d', time.localtime(time.time())) def getPortfolios(self,rank_type,url='rank.html',Type='smjs'): base_url='http://contest.eastmoney.com/'+Type+'/' rank_type=rank_type+Type.upper() fund_url=base_url+url res = getURL(fund_url) res.encoding = 'utf8' soup = BeautifulSoup(res.text, 'html.parser') rankList=soup.find_all("div",class_="data") uls=rankList[0].find_all("ul") portfolios=[] the_date=self.getCurrentDate() i=0 for ul in uls: if flag=='0': portfolios=[] portfolio={} i=i+1 portfolio['the_date']=the_date portfolio['rank_type']=rank_type try: portfolio['rank']=ul.find("li",class_="n1").text portfolio['portfolio_manager']=ul.find("li",class_="n2").text portfolio['portfolio_manager_url']=base_url+ul.find("li",class_="n2").a['href'] portfolio['portfolio_zjzh']=ul.find("li",class_="n3").a['href'].split('zjzh=')[-1] portfolio['portfolio_name']=ul.find("li",class_="n3").text portfolio['portfolio_url']=base_url+ul.find("li",class_="n3").a['href'] portfolio['total_return']= str(float(ul.find("li",class_="n4").text.strip('%'))/100) portfolio['week_return']=str(float(ul.find("li",class_="n5").text.strip('%'))/100) portfolio['daily_return']=str(float(ul.find("li",class_="n6").text.strip('%'))/100) if rank_type in ['AYS','AJS']: portfolio['asset_scale']=ul.find("li",class_="n7").text.replace('*','0') portfolio['concern_cnt']='0' else: portfolio['asset_scale']='0' portfolio['concern_cnt']=ul.find("li",class_="n7").text.replace('*','0') portfolio['created_date']=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) portfolio['updated_date']=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) portfolio['data_source']='eastmoney' portfolios.append(portfolio) except Exception as e: print (self.getCurrentTime(),'getPortfolios', portfolio['the_date'],portfolio['rank_type'],portfolio['portfolio_url'],e ) try: if flag=='0': #index_data = pd.DataFrame(portfolios) #index_data.to_sql('em_stock_game_rank',engine,if_exists='append',index=False) mySQL.insertData('em_stock_game_rank', portfolio) print (self.getCurrentTime(),u'getPortfolios :',rank_type, portfolio['the_date'], portfolio['rank'],portfolio['portfolio_name'],portfolio['total_return'],portfolio['week_return'],portfolio['daily_return'],portfolio['asset_scale']) except Exception as e: print (self.getCurrentTime(),'getPortfolios insertData Faild', portfolio['the_date'],portfolio['rank_type'],portfolio['portfolio_name'],portfolio['portfolio_url'],e ) try: pass self.getPortFolioHold(portfolio['portfolio_zjzh']) #print (self.getCurrentTime(),'getPortFolioHold ', portfolio['the_date'],portfolio['rank_type'],portfolio['portfolio_zjzh'],portfolio['portfolio_url']) except Exception as e: print (self.getCurrentTime(),'getPortFolioHold Faild', portfolio['the_date'],portfolio['rank_type'],portfolio['portfolio_url'],portfolio['portfolio_zjzh'],e ) try: if flag=='1': index_data = pd.DataFrame(portfolios) #print (index_data) index_data.to_sql('em_stock_game_rank',engine,if_exists='append',index=False) print (self.getCurrentTime(),u'getPortfolios :', the_date,rank_type, fund_url) except Exception as e: print (self.getCurrentTime(),'getPortfolios insertData Faild', portfolio['the_date'],portfolio['rank_type'],portfolio['portfolio_url'],e ) return portfolios def getPortFolioHold(self,zjzh): reqUserId='1977113251641270' url='http://spdsqry.eastmoney.com/rtcs1?type=rtcs_zuhe_detail&khqz=121&zh='+zjzh+'&reqUserid=1977113251641270' res = getURL(url) res.encoding = 'utf8' json_str=json.loads(res.text) results=[] the_date=self.getCurrentDate() detail =json_str['data']['detail'] try: result={} userid=detail['userid'] result['userid']=userid result['zjzh']=detail['zjzh'] result['start_date']= detail['startDate'] result['concern_cnt']=detail['concernCnt'] result['rate5DayDP']=detail['rate5DayDP'] result['vFlag']=detail['vFlag'] result['uidComment']=detail['uidComment'] result['rateMaxStk']= detail['rateMaxStk'] result['ranking']= detail['ranking'] result['visitors']=detail['visitors'] result['dealWinCnt']=detail['dealWinCnt'] result['rate20DayDP']=detail['rate20DayDP'] result['isHisFlag']=detail['isHisFlag'] result['updateDate']=detail['updateDate'] result['rateDP']=detail['rateDP'] result['ranking5Day']=detail['ranking5Day'] result['rate20Day']=detail['rate20Day'] result['rate250Day']=detail['rate250Day'] result['comment']=detail['comment'] result['linkurl']=detail['linkurl'] result['indexCode']=detail['indexCode'] result['winTrialMoney']=detail['winTrialMoney'] result['picurl']=detail['picurl'] result['rate60Day']=detail['rate60Day'] try: result['prtitnMoney']=detail['prtitnMoney'] except Exception as e: result['prtitnMoney']='0' result['JZ']=detail['JZ'] result['rate250DayDP']=detail['rate250DayDP'] result['vType']=detail['vType'] result['vTypeStatus']=detail['vTypeStatus'] result['rateDayDP']=detail['rateDayDP'] result['content']=detail['content'] result['rateDay']=detail['rateDay'] result['rate60DayDP']=detail['rate60DayDP'] result['zuheName']=detail['zuheName'] result['portfRat']=detail['portfRat'] result['dealRate']=detail['dealRate'] result['rateMaxStkName']=detail['rateMaxStkName'] result['uidNick']=detail['uidNick'] result['dealfailCnt']=detail['dealfailCnt'] result['rankingDay']=detail['rankingDay'] result['dealfailCnt']=detail['dealfailCnt'] result['title']=detail['title'] result['rateMaxStkCode']=detail['rateMaxStkCode'] result['rate5Day']=detail['rate5Day'] result['rate']=detail['rate'] result['created_date']=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) result['updated_date']=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) result['data_source']='eastmoney' results.append(result) if flag=='0': mySQL.insertData('em_stock_game_details', result) if flag=='1': index_data = pd.DataFrame(results) index_data.to_sql('em_stock_game_trade_details',engine,if_exists='append',index=False) #print (self.getCurrentTime(),'getPortFolioHold details Success ',zjzh,userid) except Exception as e: print (self.getCurrentTime(),'getPortfolios insertData Faild',zjzh,e) try: tendencys = json_str['data']['tendency'] results=[] for tendency in tendencys: result={} result['zjzh'] = zjzh result['userid']=userid result['the_date'] = tendency['yk_date'] result['total_rate']= tendency['totalRate'] result['index_rate'] = tendency['indexRate'] result['created_date']=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) result['updated_date']=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) result['data_source']='eastmoney' results.append(result) if flag=='0': mySQL.insertData('em_stock_game_tendency', result) if flag=='1': index_data = pd.DataFrame(results) index_data.to_sql('em_stock_game_tendency',engine,if_exists='append',index=False) except Exception as e: print (self.getCurrentTime(),'getPortfolios insertData Faild',zjzh,e) try: results=[] stkholds = json_str['data']['stkhold'] for stkhold in stkholds: result={} result['zjzh'] = zjzh result['userid']=userid result['the_date']=the_date result['fullcode']=stkhold['fullcode'] result['cost_price']= stkhold['cbj'] result['hold_positon']= stkhold['holdPos'] result['stkmkt_code']=stkhold['stkMktCode'] result['stock_code']= stkhold['__code'] result['stock_name']=stkhold['__name'] result['close_price']=stkhold['__zxjg'] result['urcg_rate']=stkhold['webYkRate'] result['created_date']=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) result['updated_date']=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) result['data_source']='eastmoney' results.append(result) if flag=='0': mySQL.insertData('em_stock_game_stkhold', result) if flag=='1': index_data = pd.DataFrame(results) index_data.to_sql('em_stock_game_stkhold',engine,if_exists='append',index=False) #print (self.getCurrentTime(),'getPortFolioHold stkhold Success ',zjzh) except Exception as e: print (self.getCurrentTime(),'getPortfolios insertData Faild',zjzh,e) try: results=[] holds = json_str['data']['hold'] #调仓明细 for hold in holds: result={} result['userid']=userid result['fullcode']= hold['fullcode'] result['bs_flag']=hold['mmbz']#买卖标志 result['delegate_state']=hold['wtzt']#委托价格 result['delegate_time']= hold['wtsj']#委托时间 result['trans_price']= hold['cjjg']#成交价格 result['trans_date']=hold['cjrq']#成交日期 result['trans_time']=hold['cjsj']#成交时间 result['trans_positon']=hold['cw']#仓位 result['stock_name']=hold['stkName']#股票名称 result['stkmkt_code']=hold['stkMktCode']#股票代码 result['created_date']=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) result['updated_date']=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) result['data_source']='eastmoney' results.append(result) if flag=='0': mySQL.insertData('em_stock_game_hold', result) if flag=='1': index_data = pd.DataFrame(results) index_data.to_sql('em_stock_game_hold',engine,if_exists='append',index=False) #print (self.getCurrentTime(),'getPortFolioHold hold Success ',zjzh) except Exception as e: print (self.getCurrentTime(),'getPortfolios insertData Faild',zjzh,e) try: results=[] tradeSummarys = json_str['data']['tradeSummary'] for tradeSummary in tradeSummarys: result={} result['userid']=userid result['fullcode']= tradeSummary['fullcode'] result['stock_name']=tradeSummary['stkName']#股票名称 result['stkmkt_code']=tradeSummary['stkMktCode']#股票代码 result['the_date']=tradeSummary['tzrq']#投资日期 result['lshj_mr']=tradeSummary['lshj_mr']#历史合计买入 result['cwhj_mr']= tradeSummary['cwhj_mr']#仓位合计买入 result['cjjg_mr']= tradeSummary['cjjg_mr']#成交价格买入 result['lshj_mc']=tradeSummary['lshj_mc']#成交日期卖出 result['cwhj_mc']=tradeSummary['cwhj_mc']#仓位合计卖出 result['cjjg_mc']=tradeSummary['cjjg_mc']#成交价格卖出 result['created_date']=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) result['updated_date']=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) result['data_source']='eastmoney' results.append(result) if flag=='0': mySQL.insertData('em_stock_game_trade_summary', result) if flag=='1': index_data = pd.DataFrame(results) index_data.to_sql('em_stock_game_trade_summary',engine,if_exists='append',index=False) #print (self.getCurrentTime(),'getPortFolioHold tradeSummary Success ',zjzh) #print (json_str['data']['hold']) #print (json_str['data']['pieplot']) except Exception as e: print (self.getCurrentTime(),'getPortfolios insertData Faild',zjzh,e) def getPortfolioList(self): for i in range(1,51): url_total='rank_'+str(i)+'.html' url_daily='rank_r_'+str(i)+'.html' url_gf='rank_gf_'+str(i)+'.html' url_xj='rank_xj_'+str(i)+'.html' #2017 三月份 self.getPortfolios('T',url_total,'smjs') self.getPortfolios('D',url_daily,'smjs') self.getPortfolios('A',url_xj,'smjs') self.getPortfolios('T',url_total,'smys') self.getPortfolios('D',url_daily,'smys') self.getPortfolios('A',url_gf,'smys') #2017 二月份 self.getPortfolios('T',url_total,'dmjs') self.getPortfolios('D',url_daily,'dmjs') self.getPortfolios('A',url_xj,'dmjs') self.getPortfolios('T',url_total,'dmys') self.getPortfolios('D',url_daily,'dmys') self.getPortfolios('A',url_gf,'dmys') def main(): global mySQL, sleep_time, isproxy, proxy,engine,flag mySQL = PyMySQL() fundSpiders=EastMoneyStock() flag='0' #mySQL._init_('118.89.141.224', 'root', 'root', 'invest') #mySQL._init_('10.20.11.239', 'root', 'root', 'invest','3306') engine=create_engine("mysql+pymysql://root:root@localhost:3306/invest?charset=utf8") mySQL._init_('localhost', 'root', 'root', 'invest') isproxy = 0 # 如需要使用代理,改为1,并设置代理IP参数 proxy proxy = {"http": "http://10.37.84.147:8080", "https": "http://10.37.84.147:8080"}#这里需要替换成可用的代理IP sleep_time = 0.01 funds=fundSpiders.getPortfolioList() if __name__ == "__main__": main()