爬虫之旅（三）-CSDN博客

本文链接：https://blog.csdn.net/hensonwells/article/details/76037745

!/usr/bin/python
#encoding=utf-8
__author__ = 'Administrator'
from  bs4 import  BeautifulSoup
import selenium
import sys
import urllib
import requests
import time
import re

if __name__ == "__main__":
    import os
    from selenium import webdriver
    from selenium.webdriver.support.ui import WebDriverWait

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}

    chromedriver = "/home/henson/Documents/pycharm/webdriver/chromedriver"
    os.environ["webdriver.chrome.driver"] = chromedriver
    driver = webdriver.Chrome(chromedriver)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    driver.get('http://lib.cqvip.com/zk/search.aspx')
    #inputElement = driver.find_element_by_xpath("//*[@id='b_Text0']")#//*[@id="b_Text0"]
    inputElement = driver.find_element_by_name("b_Text0")
    #inputElement = driver.find_element_by_onkeypress("EnteryKeyno(event)")

    searchWord="大气"
    inputElement.send_keys((searchWord))
    driver.find_element_by_xpath("//*[@id='searchnormal']/form/div[3]/div/input[1]").click()

    currentURL=driver.current_url
    urlList=[]
    localDir = '/home/henson/Downloads/paper'
     #基金抓取
     #version 1
    #fund=driver.find_element_by_xpath("//*[@id='result_divlist']/dl/dd[4]")
   # fund=driver.find_element_by_class_name("fund")
   # print(fund.text)

    req = urllib.request.Request(url=currentURL, headers=headers)
    html = urllib.request.urlopen(req)
    #version 2
    soup = BeautifulSoup(html, "html.parser")
    div=soup.find_all('div')
    for aa in div:
        dl =aa.find_all('dl')
        for bb in dl:
            dd=bb.find_all('dd')
            print(dd)

    #version 3
    #for dl in  soup.find_all('')
    #div=soup.find_all('div')
    #dl=div.find_all('dl')
    #for abc in dl:
     #   dd = abc.find_all('dd')
     #   print(dd)


     #version 4
    for dl in soup.find_all('dl'):
        for dd in dl.find_all('dd'):
            fund=dd.find_all(class_="fund")
            print(fund)
     #version 5
    for dl in soup.find_all('dl'):
        for dd in  dl.find_all('dd'):
            print(dd.text)
            print(soup.dd)
            fund = driver.find_element_by_class_name("fund")
            print(fund.text)
    #version 6
    dl=soup.find_all('dl')
    for dd in  dl:
        fund = driver.find_element_by_class_name("fund")
        print(fund.text)
        print(dd.string)
    #for tag in soup.find_all(re.compile("^dd")):
    #    print(tag)
     #version 7
    #fund=soup.find_all("dd",class_="fund")
    #print(fund)



    r=requests.get(currentURL)
    print(currentURL)

    #rr = urllib.request.urlopen(currentURL)
    data=r.text
    #print(data)
   # link_list = re.findall(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')", data)

   # link_list=re.findall(r"a href=./download/ target=\"_blank\" ",data)
    #pattern = re.compile(r"<a href=.* target=\"_blank\" onclick=.*\s?.*<img src=.*\.jpg\" alt=.*title=\"\".*height=")
   # res_list=pattern.findall(rr)
    #for url in link_list:
     #   print(url)


    #跳转问题
    driver.find_element_by_xpath("//*[@id='result_divlist']/dl[2]/dt/span/a[2]").click()
    #driver.current_window_handle
    driver.window_handles
    #time.sleep(60)
    currentURL = driver.current_url
    urlList = []
    r = requests.get(currentURL)
    print(currentURL)

Q1:写了7个版本的抓取本文，还是未能实现遍历出所有的dd中的class=”fund”的内容，总感觉自己思路没毛病，就是出不来2333，而且还参考了很多别人的代码，还是无力回天
Q2：跳转问题那里，正常情况.click()之后，currentURL = driver.current_url获取的应该是跳转之后的URL，为什么拿到的还是跳转之前的，1.看别人的解决方案似乎得让它sleep延迟一下再获取，但是没有效果。
2.看了API也有给它加handle的句柄，然而也是没有奏效。