!/usr/bin/python
#encoding=utf-8
__author__ = 'Administrator'
from bs4 import BeautifulSoup
import selenium
import sys
import urllib
import requests
import time
import re
if __name__ == "__main__":
import os
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
chromedriver = "/home/henson/Documents/pycharm/webdriver/chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver
driver = webdriver.Chrome(chromedriver)
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
driver.get('http://lib.cqvip.com/zk/search.aspx')
#inputElement = driver.find_element_by_xpath("//*[@id='b_Text0']")#//*[@id="b_Text0"]
inputElement = driver.find_element_by_name("b_Text0")
#inputElement = driver.find_element_by_onkeypress("EnteryKeyno(event)")
searchWord="大气"
inputElement.send_keys((searchWord))
driver.find_element_by_xpath("//*[@id='searchnormal']/form/div[3]/div/input[1]").click()
currentURL=driver.current_url
urlList=[]
localDir = '/home/henson/Downloads/paper'
#基金抓取
#version 1
#fund=driver.find_element_by_xpath("//*[@id='result_divlist']/dl/dd[4]")
# fund=driver.find_element_by_class_name("fund")
# print(fund.text)
req = urllib.request.Request(url=currentURL, headers=headers)
html = urllib.request.urlopen(req)
#version 2
soup = BeautifulSoup(html, "html.parser")
div=soup.find_all('div')
for aa in div:
dl =aa.find_all('dl')
for bb in dl:
dd=bb.find_all('dd')
print(dd)
#version 3
#for dl in soup.find_all('')
#div=soup.find_all('div')
#dl=div.find_all('dl')
#for abc in dl:
# dd = abc.find_all('dd')
# print(dd)
#version 4
for dl in soup.find_all('dl'):
for dd in dl.find_all('dd'):
fund=dd.find_all(class_="fund")
print(fund)
#version 5
for dl in soup.find_all('dl'):
for dd in dl.find_all('dd'):
print(dd.text)
print(soup.dd)
fund = driver.find_element_by_class_name("fund")
print(fund.text)
#version 6
dl=soup.find_all('dl')
for dd in dl:
fund = driver.find_element_by_class_name("fund")
print(fund.text)
print(dd.string)
#for tag in soup.find_all(re.compile("^dd")):
# print(tag)
#version 7
#fund=soup.find_all("dd",class_="fund")
#print(fund)
r=requests.get(currentURL)
print(currentURL)
#rr = urllib.request.urlopen(currentURL)
data=r.text
#print(data)
# link_list = re.findall(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')", data)
# link_list=re.findall(r"a href=./download/ target=\"_blank\" ",data)
#pattern = re.compile(r"<a href=.* target=\"_blank\" onclick=.*\s?.*<img src=.*\.jpg\" alt=.*title=\"\".*height=")
# res_list=pattern.findall(rr)
#for url in link_list:
# print(url)
#跳转问题
driver.find_element_by_xpath("//*[@id='result_divlist']/dl[2]/dt/span/a[2]").click()
#driver.current_window_handle
driver.window_handles
#time.sleep(60)
currentURL = driver.current_url
urlList = []
r = requests.get(currentURL)
print(currentURL)
Q1:写了7个版本的抓取本文,还是未能实现遍历出所有的dd中的class=”fund”的内容,总感觉自己思路没毛病,就是出不来2333,而且还参考了很多别人的代码,还是无力回天
Q2:跳转问题那里,正常情况.click()之后,currentURL = driver.current_url获取的应该是跳转之后的URL,为什么拿到的还是跳转之前的,1.看别人的解决方案似乎得让它sleep延迟一下再获取,但是没有效果。
2.看了API也有给它加handle的句柄,然而也是没有奏效。