1.提取详情url:
from lxml import etree
from selenium import webdriver
import time
import pymysql
from selenium.webdriver.support.ui import Select,WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
class fundSpider(object):
    """Scrape fund codes from the EastMoney ranking page into MySQL.

    The ranking table is rendered by JavaScript, so a headless Chrome
    instance fetches the fully rendered page before lxml parses it.
    """

    # Ranking-period column ids, in order: 1 month, 3 months, 6 months,
    # 1 year, 2 years, 3 years.  getUrl() clicks each one in turn.
    SORT_COLUMNS = ('1yzf', '3yzf', '6yzf', '1nzf', '2nzf', '3nzf')

    def __init__(self, starturl, type):
        # Start URL of the ranking page.
        self.url = starturl
        # Integer tag (1..6) identifying the ranking period; None for getUrl().
        self.type = type
        # Headless mode: no visible browser window.
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        self.driver = webdriver.Chrome(chrome_options=chrome_options)

    def run(self):
        """Load the ranking page, extract the top-30 fund codes, save them."""
        time.sleep(5)
        try:
            self.driver.get(self.url)
            html = etree.HTML(self.driver.page_source)
            # All data rows of the ranking table; keep only the top 30.
            tr_list = html.xpath("//table[@id='dbtable']/tbody/tr")[:30]
            urls = []
            for tr in tr_list:
                # First anchor cell of the row holds the fund code.
                code = tr.xpath("./td/a/text()")[0]
                print(code)
                urls.append((code, self.type))
            self.saveurls(urls)
            print(str(self.type) + '完成')
        finally:
            # Always release the browser, even if scraping fails.
            self.driver.quit()

    def getUrl(self):
        """Click each period column header and collect the sorted-page URLs.

        Returns:
            list[str]: one ranking URL per entry in SORT_COLUMNS, in order.
        """
        urls = []
        try:
            self.driver.get(self.url)
            time.sleep(5)
            for col in self.SORT_COLUMNS:
                # Clicking a column header re-sorts the table and changes
                # the page URL fragment; record the resulting URL.
                btn = self.driver.find_element_by_xpath(
                    "//table[@id='dbtable']/thead/tr/th[@col='%s']/a" % col)
                btn.click()
                time.sleep(5)
                urls.append(self.driver.current_url)
        finally:
            self.driver.quit()
        return urls

    def saveurls(self, urls):
        """Bulk-insert (code, type) tuples into t_funds_url.

        Args:
            urls: list of (code, type) tuples to insert in one batch.
        """
        conn = pymysql.connect(host="", user="", password="",
                               database="test_s", charset="utf8")
        try:
            with conn.cursor() as cur:
                # executemany performs one batched insert for all tuples.
                cur.executemany(
                    'insert into t_funds_url (code,type) values(%s,%s)', urls)
            # Without commit() nothing is actually written to the database.
            conn.commit()
        finally:
            conn.close()
def deldata():
    """Delete every row from t_funds_url so fresh codes can be inserted.

    The connection is closed in a finally block so a failed DELETE does
    not leak the connection.
    """
    conn = pymysql.connect(host="", user="", password="",
                           database="test_s", charset="utf8")
    try:
        with conn.cursor() as cur:
            cur.execute('delete from t_funds_url')
        conn.commit()
    finally:
        conn.close()
if __name__ == '__main__':
    startUrl = 'http://fund.eastmoney.com/data/fundranking.html#tgp;c0;r;s3nzf;pn50;ddesc;qsd20190528;qed20200528;qdii;zq;gg;gzbd;gzfs;bbzt;sfbb'
    # Collect one sorted-ranking URL per period (1m, 3m, 6m, 1y, 2y, 3y).
    urls = fundSpider(startUrl, None).getUrl()
    print("获取链接完成...")
    # Clear stale codes before re-inserting.
    deldata()
    print("删除数据完成...")
    # type tag i (1..6) matches the position of the period column.
    for i, url in enumerate(urls, start=1):
        fundSpider(url, i).run()
2.提取详情数据:
from lxml import etree
from selenium import webdriver
import time
from selenium.webdriver.support.ui import Select,WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pymysql
import threading
class fundSpider(threading.Thread):
    """Worker thread that scrapes the NAV-history pages of a batch of funds."""

    def __init__(self, urls):
        threading.Thread.__init__(self)
        # Detail-page URLs this thread is responsible for.
        self.urls = urls
        # Headless mode: no visible browser window.
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        self.driver = webdriver.Chrome(chrome_options=chrome_options)

    def run(self):
        try:
            for url in self.urls:
                print(url)
                # URL shape: .../jjjz_<code>.html -> slice out the fund code
                # between the underscore and the ".html" suffix.
                code = url[url.find('_') + 1:-5]
                self.request_detail_page(url, code)
        finally:
            # Always release the browser, even if a page fails.
            self.driver.quit()

    def request_detail_page(self, url, code):
        """Page through one fund's NAV-history table, saving every page.

        Args:
            url: the fund's jjjz detail page URL.
            code: the fund code extracted from the URL.
        """
        self.driver.get(url)
        print(url)
        while True:
            time.sleep(5)
            html = etree.HTML(self.driver.page_source)
            tr_list = html.xpath("//table[@class='w782 comm lsjz']/tbody/tr")
            vals = []
            for tr in tr_list:
                # Cell 0 is the date, cell 1 the net asset value.
                lsval = tr.xpath("./td/text()")[1].strip()
                t = tr.xpath("./td/text()")[0].strip()
                vals.append((code, lsval, t, 2))
            self.savedata(vals)
            # Wait until the pager's last button is present in the DOM.
            WebDriverWait(driver=self.driver, timeout=10).until(
                EC.presence_of_all_elements_located(
                    (By.XPATH,
                     "//div[@id='pagebar']/div[@class='pagebtns']/label[last()]"))
            )
            next_btn = self.driver.find_element_by_xpath(
                "//div[@id='pagebar']/div[@class='pagebtns']/label[last()]")
            # A greyed-out ("end") button means there is no next page.
            if "end" in next_btn.get_attribute("class"):
                break
            # Click via JS to avoid "element not clickable" overlay issues.
            self.driver.execute_script("arguments[0].click();", next_btn)

    def savedata(self, vals):
        """Bulk-insert (code, nav, date, type) rows into t_hh_funds_info.

        Args:
            vals: list of (code, jingzhi, time, type) tuples.
        """
        conn = pymysql.connect(host="", user="", password="",
                               database="test_s", charset="utf8")
        try:
            with conn.cursor() as cur:
                # executemany performs one batched insert for all tuples.
                cur.executemany(
                    'insert into t_hh_funds_info (code,jingzhi,time,type) values(%s,%s,%s,%s)',
                    vals)
            # Without commit() nothing is actually written to the database.
            conn.commit()
        finally:
            conn.close()
def geturls():
    """Read all stored fund codes and build their NAV-history page URLs.

    Returns:
        list[str]: one jjjz detail URL per row in t_funds_url; empty on
        query failure (the original raised NameError in that case because
        ``urls`` was first bound inside the try block).
    """
    conn = pymysql.connect(host="", user="", password="",
                           database="test_s", charset="utf8")
    cursor = conn.cursor()
    # Bound before the try so the final return can never raise NameError.
    urls = []
    try:
        cursor.execute("SELECT * FROM t_funds_url")
        for it in cursor.fetchall():
            # Column 1 of t_funds_url is the fund code.
            urls.append("http://fundf10.eastmoney.com/jjjz_" + it[1] + ".html")
    except pymysql.MySQLError:
        print("Error: unable to fetch data")
    finally:
        cursor.close()
        conn.close()
    return urls
if __name__ == '__main__':
    urls = geturls()
    jifen = 10                      # number of worker threads
    every = len(urls) // jifen      # base chunk size per thread
    threadList = []
    start = 0
    for i in range(1, jifen + 1):
        if i == jifen:
            # The last thread also takes the division remainder.
            # (Equivalent to the original over-length slice: Python slicing
            # clamps at the end of the list.)
            c = urls[start:]
        else:
            c = urls[start:start + every]
        spider = fundSpider(c)
        start += every
        threadList.append(spider)
    for t in threadList:
        # Daemon threads are killed when the main thread exits; the join()
        # loop below waits for them, so all work completes normally.
        t.daemon = True
        t.start()
    for t in threadList:
        t.join()
    print("主线程结束")
3.插入最新数据:
from lxml import etree
from selenium import webdriver
import time
import pymysql
import threading
class fundSpider(threading.Thread):
    """Worker thread that scrapes the latest NAV and basic fund info for a
    batch of fund codes and stores the results in t_funds_info_copy."""

    def __init__(self, codes):
        threading.Thread.__init__(self)
        self.yuming = 'http://fund.eastmoney.com/'   # base URL for fund pages
        self.sub = '.html'                           # page suffix
        # Fund codes this thread is responsible for.
        self.codes = codes
        # Headless mode: no visible browser window.
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        self.driver = webdriver.Chrome(chrome_options=chrome_options)

    def run(self):
        shijivallist = []
        try:
            for code in self.codes:
                url = self.yuming + code + self.sub
                print(url)
                shijivallist.append(self.getshijival(url, code, 1))
            # Insert all collected rows in one batch.
            self.savedata(shijivallist)
        finally:
            # Always release the browser, even if a page fails.
            self.driver.quit()

    def getshijival(self, url, code, type):
        """Scrape one fund page and return a row tuple for savedata().

        Returns:
            tuple: (code, nav, nav_date, type, company, inception_date,
            fund_size, fund_name).
        """
        self.driver.get(url)
        time.sleep(4)
        html = etree.HTML(self.driver.page_source)
        # Latest published NAV.
        shijival = html.xpath(
            "//dl[@class='dataItem02']/dd[@class='dataNums']/span/text()")[0]
        # NAV date label; the last character is stripped (presumably a
        # trailing bracket in the page text — TODO confirm against the page).
        t = html.xpath("//dl[@class='dataItem02']/dt/p/text()")[0]
        t = t[:-1]
        trList = html.xpath("//div[@class='infoOfFund']/table/tbody/tr")
        # Fund size.
        guimo = trList[0].xpath("./td")[1].xpath("./text()")[0]
        # Inception date.
        clTime = trList[1].xpath("./td/text()")[0]
        # Fund management company.
        company = trList[1].xpath("./td")[1].xpath("./a/text()")[0]
        # Fund name.
        fundName = html.xpath("//div[@class='fundDetail-tit']/div/text()")[0]
        return (code, shijival, t, type, company, clTime, guimo, fundName)

    def savedata(self, vals):
        """Bulk-insert fund rows into t_funds_info_copy.

        Args:
            vals: list of 8-tuples as produced by getshijival().
        """
        print("插入数据")
        conn = pymysql.connect(host="", user="", password="",
                               database="test_s", charset="utf8")
        try:
            with conn.cursor() as cur:
                # executemany performs one batched insert for all tuples.
                cur.executemany(
                    'insert into t_funds_info_copy (code,jingzhi,time,type,company,cl_time,guimo,fund_name) values(%s,%s,%s,%s,%s,%s,%s,%s)',
                    vals)
            # Without commit() nothing is actually written to the database.
            conn.commit()
        finally:
            conn.close()
def getcode():
    """Return all distinct fund codes recorded in t_funds_info.

    Returns:
        list[str]: distinct codes; empty on query failure (the original
        raised NameError in that case because ``codes`` was first bound
        inside the try block).
    """
    conn = pymysql.connect(host="", user="", password="",
                           database="test_s", charset="utf8")
    cursor = conn.cursor()
    # Bound before the try so the final return can never raise NameError.
    codes = []
    try:
        cursor.execute("SELECT code FROM t_funds_info group by code")
        for it in cursor.fetchall():
            codes.append(it[0])
    except pymysql.MySQLError:
        print("Error: unable to fetch data")
    finally:
        cursor.close()
        conn.close()
    return codes
if __name__ == '__main__':
    codes = getcode()
    jifen = 15                       # number of worker threads
    every = len(codes) // jifen      # base chunk size per thread
    threadList = []
    start = 0
    for i in range(1, jifen + 1):
        if i == jifen:
            # The last thread also takes the division remainder.
            # (Equivalent to the original over-length slice: Python slicing
            # clamps at the end of the list.)
            c = codes[start:]
        else:
            c = codes[start:start + every]
        spider = fundSpider(c)
        start += every
        threadList.append(spider)
    for t in threadList:
        # Daemon threads are killed when the main thread exits; the join()
        # loop below waits for them, so all work completes normally.
        t.daemon = True
        t.start()
    for t in threadList:
        t.join()
    print("主线程结束")
4.计算成绩:
from lxml import etree
from selenium import webdriver
import time
import pymysql
import threading
class fundSpider(threading.Thread):
    """Worker thread that scores each fund by where today's estimated NAV
    ranks inside the fund's historical NAV distribution (lower rank means
    the fund is currently cheap relative to its own history)."""

    def __init__(self, codes, type):
        threading.Thread.__init__(self)
        self.yuming = 'http://fund.eastmoney.com/'   # base URL for fund pages
        self.sub = '.html'                           # page suffix
        # Fund codes this thread is responsible for.
        self.codes = codes
        # Ranking-period tag (1..6) these codes belong to.
        self.type = type
        # Headless mode: no visible browser window.
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        self.driver = webdriver.Chrome(chrome_options=chrome_options)

    def run(self):
        orderbys = []
        try:
            for code in self.codes:
                url = self.yuming + code + self.sub
                print(url)
                jingzhigusuan = self.getjingzhigusuan(url)
                # All distinct historical NAVs plus today's estimate.
                vals = self.gethistoryval(code)
                vals.append(jingzhigusuan)
                # Deduplicate, then sort ascending.
                # NOTE(review): values come from a varchar column, so this is
                # a lexicographic string sort, not numeric — confirm that is
                # intended before changing it.
                vals = list(set(vals))
                vals.sort()
                info = self.getfundinfo(code)
                # Score = relative rank of today's estimate in [0, 1).
                orderbys.append((vals.index(jingzhigusuan) / len(vals), code,
                                 jingzhigusuan, info[0], info[1], info[2], info[3]))
            # Print funds sorted by score, cheapest first.
            for new in sorted(orderbys):
                print(new)
            # Visual separator identifying this thread's period tag.
            print(str(self.type) * 99)
        finally:
            # Always release the browser, even if scraping or the DB fails
            # (the original only quit on the success path).
            self.driver.quit()

    def getjingzhigusuan(self, url):
        """Scrape and return today's estimated NAV string from a fund page."""
        self.driver.get(url)
        time.sleep(4)
        html = etree.HTML(self.driver.page_source)
        jingzhigusuan = html.xpath(
            "//dl[@class='dataItem01']//dl[@class='floatleft']/span/text()")[0]
        return jingzhigusuan

    def gethistoryval(self, code):
        """Return the distinct historical NAV strings stored for a fund.

        Returns an empty list on query failure (the original raised
        NameError there because ``vals`` was first bound inside the try).
        """
        conn = pymysql.connect(host="", user="", password="",
                               database="test_s", charset="utf8")
        cursor = conn.cursor()
        # Bound before the try so the final return can never raise NameError.
        vals = []
        try:
            cursor.execute(
                " select DISTINCT jingzhi from t_funds_info where code = %s ",
                code)
            for it in cursor.fetchall():
                vals.append(it[0])
        except pymysql.MySQLError:
            print("Error: unable to fetch data")
        finally:
            cursor.close()
            conn.close()
        return vals

    def getfundinfo(self, code):
        """Return [company, inception_date, fund_size, fund_name] for a fund.

        Returns an empty list on query failure (the original raised
        NameError there because ``info`` was first bound inside the try).
        """
        conn = pymysql.connect(host="", user="", password="",
                               database="test_s", charset="utf8")
        cursor = conn.cursor()
        # Bound before the try so the final return can never raise NameError.
        info = []
        try:
            cursor.execute(
                "SELECT company,cl_time,guimo,fund_name FROM t_funds_info where code = %s limit 1",
                code)
            for it in cursor.fetchall():
                # LIMIT 1 yields at most one row; flatten its four columns.
                info.extend([it[0], it[1], it[2], it[3]])
        except pymysql.MySQLError:
            print("Error: unable to fetch data")
        finally:
            cursor.close()
            conn.close()
        return info
def getcode(type):
    """Return the fund codes stored in t_funds_url for one ranking period.

    Args:
        type: ranking-period tag (1..6) to filter on.

    Returns:
        list[str]: matching codes; empty on query failure (the original
        raised NameError in that case because ``codes`` was first bound
        inside the try block).
    """
    conn = pymysql.connect(host="", user="", password="",
                           database="test_s", charset="utf8")
    cursor = conn.cursor()
    # Bound before the try so the final return can never raise NameError.
    codes = []
    try:
        cursor.execute("SELECT code FROM t_funds_url where type = %s ", type)
        for it in cursor.fetchall():
            codes.append(it[0])
    except pymysql.MySQLError:
        print("Error: unable to fetch data")
    finally:
        cursor.close()
        conn.close()
    return codes
if __name__ == '__main__':
    # One worker thread per ranking period (types 1..6).
    threadList = []
    for i in range(1, 7):
        codes = getcode(i)
        threadList.append(fundSpider(codes, i))
    for t in threadList:
        # Daemon threads are killed when the main thread exits; the join()
        # loop below waits for them, so all work completes normally.
        t.daemon = True
        t.start()
    for t in threadList:
        t.join()
    print("主线程结束")
数据库脚本:
-- Historical NAV records plus basic metadata, one row per fund per date.
-- NOTE(review): jingzhi/time are stored as varchar, so numeric and date
-- comparisons in consumers are string comparisons — confirm intended.
CREATE TABLE `t_funds_info` (
`id` bigint(255) unsigned NOT NULL AUTO_INCREMENT,
`code` varchar(20) DEFAULT NULL,
`jingzhi` varchar(20) DEFAULT NULL,
`time` varchar(20) DEFAULT NULL,
`type` varchar(20) DEFAULT NULL COMMENT '基金类型',
`company` varchar(100) DEFAULT NULL,
`cl_time` varchar(20) DEFAULT NULL,
`guimo` varchar(100) DEFAULT NULL,
`fund_name` varchar(100) DEFAULT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4;
-- Same schema as t_funds_info; receives the latest scraped NAV snapshot
-- (script 3 inserts here) before it is merged/used downstream.
CREATE TABLE `t_funds_info_copy` (
`id` bigint(255) unsigned NOT NULL AUTO_INCREMENT,
`code` varchar(20) DEFAULT NULL,
`jingzhi` varchar(20) DEFAULT NULL,
`time` varchar(20) DEFAULT NULL,
`type` varchar(20) DEFAULT NULL COMMENT '基金类型',
`company` varchar(100) DEFAULT NULL,
`cl_time` varchar(20) DEFAULT NULL,
`guimo` varchar(100) DEFAULT NULL,
`fund_name` varchar(100) DEFAULT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4;
-- Fund codes harvested from the ranking pages; `type` is the ranking
-- period tag (1..6) assigned by script 1 and filtered on by script 4.
CREATE TABLE `t_funds_url` (
`id` bigint(255) unsigned NOT NULL AUTO_INCREMENT,
`code` varchar(100) DEFAULT NULL,
`type` int(10) DEFAULT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4;