Crawler Journey (1)

#!/usr/bin/python
# encoding=utf-8
__author__ = 'Administrator'

import os
import re

import requests
from selenium import webdriver

if __name__ == "__main__":
    # Point Selenium at the local ChromeDriver binary
    chromedriver = "/home/henson/Documents/pycharm/webdriver/chromedriver"
    os.environ["webdriver.chrome.driver"] = chromedriver
    driver = webdriver.Chrome(chromedriver)
    # Open the CQVIP search page, then scroll to the bottom
    # (scrolling only makes sense after the page has loaded)
    driver.get('http://lib.cqvip.com/zk/search.aspx')
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Locate the search box by its name attribute; an XPath locator
    # such as //*[@id='b_Text0'] would also work
    inputElement = driver.find_element_by_name("b_Text0")

    # Type the query and click the search button
    searchWord = "大气"  # search term: "atmosphere"
    inputElement.send_keys(searchWord)
    driver.find_element_by_xpath("//*[@id='searchnormal']/form/div[3]/div/input[1]").click()

    currentURL = driver.current_url
    localDir = '/home/henson/Downloads/paper'  # target dir for downloaded papers (not used yet)

    # Re-fetch the result page with requests and pull every href out of
    # the raw HTML. NOTE: requests starts a fresh session, so it only sees
    # the static HTML, never anything rendered by JavaScript (see Q2 below)
    r = requests.get(currentURL)
    data = r.text
    link_list = re.findall(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')", data)

    # Several hand-written patterns aimed at the /download/ anchor and its
    # onclick/img markup were tried here; none of them matched (see Q1 below)

    # Click into one search result (the element id is specific to this
    # particular result list)
    driver.find_element_by_xpath("//*[@id='46744583']").click()

    # Fetch the detail page and dump its static HTML
    currentURL = driver.current_url
    r = requests.get(currentURL)
    data = r.text
    print(data)

    # find_element_by_class_name() does not accept compound class names
    # ("btns_a down" contains a space), so use a CSS selector instead
    driver.find_element_by_css_selector(".btns_a.down").click()

    data = r.text
    link_list = re.findall(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')", data)


    # More candidate patterns for the download link (class "down_link",
    # /download/confirm.aspx?..., full XPath) also failed to match

    for url in link_list:
        print(url)

Q1: How do I match the fields inside href? When several elements look identical, regex over the raw HTML seems like the only way to scrape them. (Two alternatives are sketched below.)
Q2: How do I scrape a link that is only triggered by a button? Static scraping only captures the links already visible in the page source; the links inside child nodes can't be reached. (The page_source sketch right after the code above is one way around this.)
Q3: How do I scrape the contents of child nodes? (See the relative-XPath sketch below.)
Q4: Why does from bs4 import BeautifulSoup fail? (See the install note below.)
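On Q1: instead of pattern-matching raw HTML, Selenium can hand the href back as an element attribute, which avoids the fragile lookbehind regex entirely. A minimal sketch (the "download" filter is only an assumption about what the target links contain):

# Read the href attribute off every anchor on the rendered page
for a in driver.find_elements_by_tag_name("a"):
    href = a.get_attribute("href")
    if href and "download" in href:  # assumed filter; adjust to the real URLs
        print(href)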
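On Q3: an XPath that starts with "." is evaluated relative to an element, so child nodes are reachable by calling find_elements on the parent first. A rough sketch (the locators here are hypothetical placeholders, the "op" class is just the one tried in an earlier attempt):

# Grab one result entry, then walk only its child anchors
parent = driver.find_element_by_class_name("op")  # hypothetical parent locator
children = parent.find_elements_by_xpath(".//a")  # "." = search inside parent only
for child in children:
    print(child.text, child.get_attribute("href"))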
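On Q4: the import usually fails because the package isn't installed, or because the old BeautifulSoup 3 package is installed instead of beautifulsoup4, which is the distribution that actually provides the bs4 module. Once installed, it also gives a cleaner answer to Q1:

# pip install beautifulsoup4   <- the package name; the module is bs4
from bs4 import BeautifulSoup

soup = BeautifulSoup(data, "html.parser")  # parse the (rendered) HTML
for a in soup.find_all("a", href=True):    # every anchor that has an href
    print(a["href"])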

PS: ChromeDriver really is very handy.
These are the puzzles from the last two days of study, still waiting to be solved.
