为了练手、熟悉一下 Python 爬虫工具,写了个小例子。主要逻辑如下:
1、在基金分类页获取所有基金基础信息
2、根据第一步获取的信息进行循环,在每个基金页面进行爬取每天的记录值
代码:
获取不同基金代码,并存入本地数据库:getFoundCode.py
import requests
import demjson
import sqlite3
import uuid
def get_page(url, page_num):
    """Fetch `page_num` pages of fund ranking data and insert them into found_code.

    Relies on the module-level `cursor` for database access; the caller is
    responsible for committing and closing the connection.

    Returns the list of (foundId, foundName, foundCode) tuples inserted
    (the original built an unused `pageList` and returned None).
    """
    inserted = []
    for page in range(1, page_num + 1):
        formdata = {'op': 'ph',
                    'dt': 'kf',
                    'ft': 'zs',
                    'rs': '',
                    'gs': 0,
                    'sc': 'zzf',
                    'st': 'desc',
                    # NOTE(review): original had '2018 - 11 - 11' (embedded
                    # spaces); the endpoint expects 'YYYY-MM-DD'.
                    'sd': '2018-11-11',
                    'ed': '2019-11-11',
                    'qdii': '',
                    'tabSubtype': ', , , , ,',
                    'pi': page,
                    'pn': 50,
                    'dx': 1,
                    'v': 0.6213036119243205}
        try:
            r = requests.get(url, params=formdata)
            r.raise_for_status()
            print('链接成功')
            # Response body is JS like `var rankData = {...};` — strip the
            # wrapper and parse the non-strict-JSON object with demjson.
            # (Do not shadow the builtin `str` as the original did.)
            payload = demjson.decode(r.text[15:-1])
            # Each entry in datas is a comma-separated record:
            # field 0 is the fund code, field 1 the fund name.
            rows = [(str(uuid.uuid1()), item.split(',')[1], item.split(',')[0])
                    for item in payload['datas']]
            if rows:
                # Parameterized insert — the original concatenated values
                # straight into the SQL string (injection/quoting risk).
                cursor.executemany(
                    "insert into found_code (foundId, foundName, foundCode) values (?,?,?)",
                    rows)
                inserted.extend(rows)
        except Exception:
            # Narrower than the original bare `except:` (which also swallowed
            # KeyboardInterrupt/SystemExit); any network/parse/DB error for
            # one page is reported and the loop moves on.
            print('链接失败')
    return inserted
# Initialize the database connection. `conn` and `cursor` are module-level
# globals; get_page() writes through `cursor` directly.
# NOTE(review): the database path is machine-specific — consider making it
# configurable.
conn = sqlite3.connect(r'C:\AWork\python\workspace\supervised-reptile\found.db', check_same_thread=False)
cursor = conn.cursor()
url = 'http://fund.eastmoney.com/data/rankhandler.aspx'
# Crawl 17 ranking pages. NOTE(review): get_page has no return statement,
# so pageList is always None here.
pageList = get_page(url, 17)
conn.commit()
conn.close()
然后在主程序中通过 mainSpider() 方法,循环生成不同基金的页面地址;
在 beginSpider(code) 方法中,调用 initSpider(fundcode) 方法获取页面操作对象 driver 和总页数;
再调用 getData 方法,循环拉取数据并插入本地数据库。
# coding: utf-8
import time
import uuid
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from bs4 import BeautifulSoup
from threading import Thread, Lock
import os
import csv
import sqlite3
# 下面是利用 selenium 抓取html页面的代码
# 初始化函数
def initSpider(fundcode):
    """Open a headless Chrome on the fund's net-worth history page.

    Returns a (driver, total_page) tuple: the live WebDriver and the total
    number of result pages parsed from the pager widget.
    """
    opts = Options()
    opts.add_argument('--headless')
    opts.add_argument('--disable-gpu')
    browser = webdriver.Chrome(chrome_options=opts)
    # Target page: daily net-worth table for this fund code.
    browser.get("http://fund.eastmoney.com/f10/jjjz_{0}.html".format(fundcode))
    # The label immediately before the "下一页" (next page) button holds the
    # total page count; read its inner HTML and keep only the digits.
    pager = browser.find_element_by_id("pagebar")
    last_label = pager.find_element_by_xpath(
        "div[@class='pagebtns']/label[text()='下一页']/preceding-sibling::label[1]")
    raw = last_label.get_attribute("innerHTML")
    page_count = int("".join(ch for ch in raw if ch.isdigit()))
    return (browser, page_count)
# 获取html内容
def getData(myrange, driver, lock, code):
    """Scrape the net-worth table for each page number in `myrange`.

    For every page: navigate the shared `driver` to that page, parse the
    table rows, insert them into found_netWorth_detail and mark the fund as
    fetched in found_code.  The whole per-page cycle runs under `lock`
    because all worker threads share a single WebDriver instance.
    """
    for x in myrange:
        lock.acquire()
        conn = sqlite3.connect(r'C:\AWork\python\workspace\supervised-reptile\found.db')
        try:
            cursor = conn.cursor()
            # Jump to page x via the pager's number box + "go" button.
            tonum = driver.find_element_by_id("pagebar").find_element_by_xpath(
                "div[@class='pagebtns']/input[@class='pnum']")
            jumpbtn = driver.find_element_by_id("pagebar").find_element_by_xpath(
                "div[@class='pagebtns']/input[@class='pgo']")
            tonum.clear()
            tonum.send_keys(str(x))
            jumpbtn.click()
            # Wait until the pager marks page x as the current page.
            WebDriverWait(driver, 20).until(
                lambda d: d.find_element_by_id("pagebar").find_element_by_xpath(
                    "div[@class='pagebtns']/label[@value={0} and @class='cur']".format(x)))
            attempts = 0
            while attempts < 3:
                # Rebuild the row list on every attempt so a retry does not
                # duplicate rows (the original accumulated across attempts).
                datalist = []
                try:
                    rows = driver.find_element_by_id("jztable").find_elements(By.TAG_NAME, "tr")
                    for row in rows:
                        tds = row.find_elements(By.TAG_NAME, "td")
                        if len(tds) > 0:  # skip the header row (th only)
                            datalist.append((
                                str(uuid.uuid1()),
                                code,
                                tds[0].text,          # date
                                tds[1].text,          # net worth
                                tds[3].text[0:-1],    # growth rate, '%' stripped
                            ))
                    # Parameterized inserts — the original built the SQL by
                    # string concatenation (injection/quoting risk).
                    cursor.executemany(
                        "insert into found_netWorth_detail "
                        "(id, foundCode, date, netWorth, growthRate) values (?,?,?,?,?)",
                        datalist)
                    # The original used `foundCode==` with an unquoted code;
                    # a bound parameter keeps codes with leading zeros intact.
                    cursor.execute("UPDATE found_code set isGet='yes' where foundCode=?", (code,))
                    conn.commit()
                    print("保存完成")
                    break
                except Exception:
                    attempts += 1
                    time.sleep(1)
            else:
                # All three attempts failed: abort the whole process, as the
                # original intended (it called the nonexistent os.exits(0)).
                os._exit(0)
        finally:
            # Always release resources — the original leaked the connection
            # and, worse, never released the lock on failure (deadlocking
            # every other worker thread).
            conn.close()
            lock.release()
# Kick off the crawl for a single fund code.
def beginSpider(code):
    """Crawl every result page for one fund code using a pool of threads."""
    driver, total_page = initSpider(code)
    # One lock serializes access to the shared WebDriver across threads.
    lock = Lock()
    step = 10
    pages = range(1, int(total_page) + 1)
    # Split the page numbers into chunks of `step`, one worker thread each.
    chunks = [pages[start:start + step] for start in range(0, len(pages), step)]
    workers = [Thread(target=getData, args=(chunk, driver, lock, code)) for chunk in chunks]
    for worker in workers:
        worker.start()
    # Wait for every worker to finish before returning to the caller.
    for worker in workers:
        worker.join()
def mainSpider():
    """Fetch every fund code not yet crawled (isGet='no') and scrape each one.

    Opens its own connection for the SELECT; getData() opens separate
    connections for the inserts.
    """
    conn1 = sqlite3.connect(r'C:\AWork\python\workspace\supervised-reptile\found.db')
    try:
        cursor1 = conn1.cursor()
        cursor1.execute("SELECT foundCode from found_code where isGet='no'")
        for (row,) in cursor1.fetchall():
            beginSpider(row)
        conn1.commit()
    finally:
        # Release the connection even if a spider run raises — the original
        # leaked it on any exception.
        conn1.close()
    print("抓取完成")

if __name__ == "__main__":
    # Guard the entry point so importing this module does not start a crawl.
    mainSpider()