对 Python 不熟悉,本来想用爬虫框架,但看到要安装一堆东西,没那么多时间;看到 selenium 好像可以直接操作浏览器,以为不用写得这么麻烦,想不到这个还是很麻烦。做法是:先在电脑上用 QQ 登录,代码再自动进行网页登录并查找信息。过程中会遇到 405 问题。
# -*- coding: utf-8 -*-
from os import system, times

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from w3lib.html import remove_tags
class Chacha:
    """Drive a Firefox session to log in, search, and print result details.

    Typical use is ``start_login()`` -> ``get_links()`` -> ``get_text()`` for
    each collected link. All scraped values are printed to stdout; nothing is
    returned or stored apart from the ``links`` queue.

    Attributes:
        url: Start page passed to the constructor and opened by
            ``start_login``.
        driver: The Selenium Firefox WebDriver created in ``__init__``.
        links: Result-page URLs collected by ``get_links`` (one list per
            instance).
    """

    # Class-level placeholders; the real values are assigned per instance in
    # __init__. `links` is deliberately NOT created here, because a mutable
    # class attribute would be shared by every Chacha instance.
    driver = None
    url = ''

    def __init__(self, url):
        """Start Firefox and remember the start page.

        Args:
            url: Page that ``start_login`` will open.
        """
        self.url = url
        self.links = []
        optionsdata = webdriver.FirefoxOptions()
        # Uncomment to run without a visible browser window.
        # optionsdata.add_argument('-headless')
        self.driver = webdriver.Firefox(options=optionsdata)
        # Print the handle of the initial tab.
        handle = self.driver.current_window_handle
        print('tab1: ' + handle)

    def get_text(self, url, n):
        """Open ``url`` in a new tab and print the fields scraped from it.

        Args:
            url: Detail-page address to load.
            n: Index into ``driver.window_handles`` of the tab to use. If that
                index does not exist (for example because an earlier tab was
                closed after a timeout), the last handle is used instead.

        If the page's key element does not appear within 10 seconds, the tab
        is closed and the page is skipped. Other Selenium errors (such as a
        missing element further down the page) are not caught here.
        """
        # Open a blank tab for this page.
        self.driver.execute_script('window.open("","_blank");')
        handles = self.driver.window_handles
        try:
            target = handles[n]
        except IndexError:
            # The caller's counter ran past the open tabs; use the last
            # handle, which is normally the tab that was just opened.
            target = handles[-1]
        self.driver.switch_to.window(target)
        self.driver.get(url)
        try:
            # Selector for the logged-in page layout. When not logged in, the
            # original author used ".fc > .text-primary > .text-primary > a".
            element = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, ".f.ca > .val > span > span > a"))
            )
            nav_head = self.driver.find_element(By.CSS_SELECTOR, ".nav-head")
            # By-based lookup, consistent with every other locator in this
            # class; the find_element_by_* helpers are gone in Selenium 4.3+.
            nav_head.find_element(By.PARTIAL_LINK_TEXT, '基本信息').click()
            name = element.text
            # Phone number.
            phone = self.driver.find_element(
                By.CSS_SELECTOR, ".val > span:nth-child(2)").text
            self.driver.find_element(By.CSS_SELECTOR, ".active > h2").click()
            # Company name.
            company_name = self.driver.find_element(
                By.CSS_SELECTOR,
                ".ntable > tr:nth-child(1) > td:nth-child(4)").text
            # Row 9, column 2 (presumably the registered address - TODO
            # confirm). Fall back to the inner link when the cell text is
            # empty.
            zhuche = remove_tags(self.driver.find_element(
                By.CSS_SELECTOR, "tr:nth-child(9) > td:nth-child(2)").text)
            if zhuche == '':
                zhuche = self.driver.find_element(
                    By.CSS_SELECTOR,
                    "tr:nth-child(9) > td:nth-child(2)> a.text-dk").text
            report_address = ""
            business_scope = ""
            # Row 10 holds either the latest-annual-report address (in which
            # case the business scope is in row 11) or the business scope
            # itself; its label cell tells which.
            row10_label = self.driver.find_element(
                By.CSS_SELECTOR, "tr:nth-child(10) > .tb").text
            if row10_label == "最新年报地址":
                report_address = self.driver.find_element(
                    By.CSS_SELECTOR, "tr:nth-child(10) > td:nth-child(2)").text
                business_scope = self.driver.find_element(
                    By.CSS_SELECTOR, "tr:nth-child(11) > td:nth-child(2)").text
            elif row10_label == "经营范围":
                business_scope = self.driver.find_element(
                    By.CSS_SELECTOR, "tr:nth-child(10) > td:nth-child(2)").text
            print(name, phone, company_name, zhuche,
                  report_address, business_scope)
        except TimeoutException:
            # WebDriverWait raises Selenium's TimeoutException, not the
            # builtin TimeoutError. Close the tab that failed, then move to a
            # window that still exists so later calls (including the title
            # read below) have a valid context.
            print('timeout, skipped: ' + str(url))
            self.driver.close()
            self.driver.switch_to.window(self.driver.window_handles[0])
        finally:
            print(self.driver.title)

    def start_login(self):
        """Open the start page and click through the login controls.

        Clicks the navigation button, then the ``.btn-qq-d`` button
        (presumably QQ login - confirm against the page), then the fourth
        ``span`` inside the first iframe, and finally returns to the top-level
        document.

        Raises:
            selenium.common.exceptions.TimeoutException: If the navigation
                button is not clickable within 10 seconds.
        """
        self.driver.get(self.url)
        for handle in self.driver.window_handles:
            print(handle)
        try:
            element = WebDriverWait(self.driver, 10).until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "li:nth-child(10) > .navi-btn"))
            )
            element.click()
        # NOTE(review): a timed-out wait raises TimeoutException, which this
        # handler does not cover, so a timeout still propagates after the
        # title is printed. Handler kept as in the original.
        except (ValueError, ArithmeticError):
            print("程序发生了数字格式异常、算术异常之一")
        finally:
            print(self.driver.title)
        self.driver.find_element(
            By.CSS_SELECTOR, ".text-center > .btn-qq-d").click()
        # The next control lives inside the first iframe on the page.
        self.driver.switch_to.frame(0)
        self.driver.find_element(By.XPATH, "//span[4]").click()
        print(self.driver.title)
        self.driver.switch_to.default_content()

    def get_links(self, keyword="软件"):
        """Search for ``keyword`` and queue every result link in ``links``.

        Args:
            keyword: Text typed into the search box. Defaults to the value the
                original script hard-coded.

        Raises:
            selenium.common.exceptions.TimeoutException: If the search box or
                the result table does not appear within 10 seconds.
        """
        try:
            search_box = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.ID, "searchkey"))
            )
            print("搜索准备完成!")
        finally:
            print("----------")
        search_box.send_keys(keyword)
        self.driver.find_element(By.CSS_SELECTOR, ".index-searchbtn").click()
        self.driver.set_window_size(1550, 838)
        try:
            # Wait for the result table; the element itself is not needed.
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, ".ntable.ntable-list"))
            )
        finally:
            print(222)
        anchors = self.driver.find_elements(
            By.CSS_SELECTOR, "tr td .maininfo > a.title")
        for anchor in anchors:
            href = anchor.get_attribute("href")
            print(href)
            # Skip anchors without an href so get_text never receives None.
            if href:
                self.links.append(href)
# Script entry: log in, collect the search-result links, then visit each one.
cc = Chacha("https://www.qcc.com")
try:
    cc.start_login()
    print(cc)
    cc.get_links()
    # n is the tab index handed to get_text; tab 0 is the search page.
    n = 1
    while cc.links:
        link = cc.links.pop(0)
        cc.get_text(link, n)
        n = n + 1
finally:
    # Always end the browser session, even when a step above raised;
    # otherwise the Firefox window and its driver process are left running.
    cc.driver.quit()