写在前面:本文仅用于技术分享和学习,并非以此牟利;如有侵权,请联系我删除。
因为需要模拟浏览器登录和执行js,所以本文用到了浏览器驱动,用的是谷歌浏览器驱动(chromedriver),当然也可以用火狐等等浏览器以及其驱动。
谷歌驱动环境搭建(点击我)
好了,直接上代码。如果你的 Python 环境中缺少下面用到的第三方库(lxml、requests、selenium、beautifulsoup4),记得先用 pip install 安装。
import sys
from lxml import etree
import time
import requests
import csv
from selenium import webdriver # 导入webdriver模块
from bs4 import BeautifulSoup
from time import sleep
class ChengxingspiderItem:
    """Record holding one fund's code and name.

    Iterating an instance yields ``fund_code`` followed by ``fund_name``,
    so ``list(item)`` produces a two-element row.
    """

    def __init__(self):
        self.fund_code = ""
        self.fund_name = ""
        # Cursor for the iterator protocol: 0 -> fund_code, 1 -> fund_name.
        self._i = 0

    def __iter__(self):
        # Rewind the cursor so every new iteration starts at fund_code.
        # Without this, a second list(item) / for-loop over the same
        # instance would be empty because the cursor stays exhausted.
        self._i = 0
        return self

    def __next__(self):
        if self._i == 0:
            self._i += 1
            return self.fund_code
        if self._i == 1:
            self._i += 1
            return self.fund_name
        raise StopIteration
def FormatHtml(_html,rows):
    """Append ``[fund_code, fund_name]`` pairs parsed from a result page.

    Args:
        _html: HTML text containing the table whose id is
            ``ctl00_cphMain_gridResult``.
        rows: list that receives one ``[fund_code, fund_name]`` entry per
            parsed table row; it is mutated in place.

    Returns:
        The same ``rows`` list that was passed in.
    """
    fund_xpath = etree.HTML(_html)
    trs = fund_xpath.xpath('//*[@id="ctl00_cphMain_gridResult"]/tbody/tr')
    # The first <tr> is skipped, as in the original loop (presumably the
    # header row). Each remaining row is queried with a relative XPath
    # instead of re-searching the whole document per row.
    for tr in trs[1:]:
        codes = tr.xpath('td[3]/a/text()')
        names = tr.xpath('td[4]/a/text()')
        if not codes or not names:
            # Row lacks the expected link text; skip it rather than
            # raising IndexError on an empty result list.
            continue
        item = ChengxingspiderItem()
        item.fund_code = codes[-1]
        item.fund_name = names[-1]
        rows.append(list(item))
    return rows
if __name__ == '__main__':
    # Script flow: log in to the Morningstar quick-rank page, apply filters,
    # walk the result pages, and dump fund code/name pairs to test.csv.
    options = webdriver.ChromeOptions()
    options.add_argument("--no-sandbox")
    # Path to the local chromedriver binary; adjust it to your own install.
    driver = webdriver.Chrome('C:/Program Files/Google/Chrome/Application/chromedriver.exe', chrome_options=options)
    rows = []
    try:
        _url = "http://cn.morningstar.com/quickrank/default.aspx"
        driver.get(_url)
        username = driver.find_element_by_id('emailTxt')
        password = driver.find_element_by_id('pwdValue')
        # Account credentials (replace the placeholders with your own).
        username.send_keys('这是你的账号')
        password.send_keys('这是你的密码')
        submit = driver.find_element_by_id('loginGo')
        submit.click()
        # Fixed wait for the login to complete; tune it to your network speed.
        sleep(3)
        # Apply the filter checkboxes.
        driver.find_element_by_id('ctl00_cphMain_cblStarRating_0').click()
        driver.find_element_by_id('ctl00_cphMain_cblStarRating5_0').click()
        driver.find_element_by_id('ctl00_cphMain_cblGroup_0').click()
        driver.find_element_by_id('ctl00_cphMain_cblCategory_0').click()
        # Run the search.
        driver.find_element_by_id('ctl00_cphMain_btnGo').click()
        countText = driver.find_element_by_id('ctl00_cphMain_TotalResultLabel').text
        # 25 results per page is assumed by the original arithmetic.
        # NOTE(review): this yields one extra iteration when the total is an
        # exact multiple of 25, and whether the pager argument is 0- or
        # 1-based is not visible here -- confirm against the site.
        count = int(countText)//25+1
        for i in range(count):
            js ="__doPostBack('ctl00$cphMain$AspNetPager1','"+str(i)+"')"
            driver.execute_script(js)
            # NOTE(review): page_source is read right after the postback; if
            # the grid refreshes asynchronously this may capture the previous
            # page -- confirm whether a wait is needed.
            _html = driver.page_source
            rows = FormatHtml(_html,rows)
    finally:
        # Always shut down the browser and chromedriver, even when a lookup
        # or the scrape fails, so no orphaned processes are left behind.
        driver.quit()
    print(rows)
    headers = ['fund_code', 'fund_name']
    # Writes test.csv relative to the current working directory (created or
    # overwritten). The encoding is explicit so non-ASCII fund names do not
    # depend on the platform default; the BOM helps spreadsheet tools detect
    # UTF-8.
    with open('test.csv', 'w', newline='', encoding='utf-8-sig') as f:
        f_csv = csv.writer(f)
        f_csv.writerow(headers)
        f_csv.writerows(rows)