#!/usr/bin/python
#encoding=utf-8
__author__ = 'Administrator'
import selenium
import sys
import urllib
import requests
import re
if __name__ == "__main__":
    import os
    from selenium import webdriver

    # Path to the local chromedriver binary; adjust per machine.
    chromedriver = "/home/henson/Documents/pycharm/webdriver/chromedriver"
    os.environ["webdriver.chrome.driver"] = chromedriver
    driver = webdriver.Chrome(chromedriver)
    try:
        # Load the search page FIRST; scrolling only makes sense once a
        # document exists (the original scrolled before driver.get()).
        driver.get('http://lib.cqvip.com/zk/search.aspx')
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Type the query into the search box and submit the form.
        inputElement = driver.find_element_by_name("b_Text0")
        searchWord = "大气"
        inputElement.send_keys(searchWord)
        driver.find_element_by_xpath(
            "//*[@id='searchnormal']/form/div[3]/div/input[1]").click()

        localDir = '/home/henson/Downloads/paper'  # download target (not used yet)

        # NOTE(review): requests.get() opens a brand-new HTTP session, so it
        # does NOT share the logged-in browser's cookies. If the site needs
        # the Selenium session, read driver.page_source instead — TODO confirm.
        data = requests.get(driver.current_url).text

        # Open one search result (hard-coded element id from the result list).
        driver.find_element_by_xpath("//*[@id='46744583']").click()

        # Collect every href attribute value from the page fetched above.
        link_list = re.findall(
            r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')", data)

        # Re-fetch AFTER navigating so `data` reflects the current page
        # (the original reused a response captured before the click).
        data = requests.get(driver.current_url).text
        print(data)

        # The download button carries TWO classes ("btns_a" and "down").
        # find_element_by_class_name() rejects compound names containing a
        # space, so a CSS selector is required here (original bug).
        driver.find_element_by_css_selector(".btns_a.down").click()

        # Fetch the page reached by the click and list its links.
        data = requests.get(driver.current_url).text
        link_list = re.findall(
            r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')", data)
        for url in link_list:
            print(url)
    finally:
        # Always release the browser process, even when a locator fails.
        driver.quit()
# Q1: 怎么匹配href里的字段?有些相同的element似乎只能通过正则来爬取了
# Q2: 按钮触发的链接怎么爬取?按照静态的爬取只能抓取到页面上能看到的链接,而子节点的链接却无法抓取
# Q3: 怎么抓取子节点的东西?
# Q4: 为什么 from bs4 import BeautifulSoup 失败?
# ps: chromedriver 确实很好用
# 以上就是这两天来的学习困惑,待解决