In web scraping, things get hard as soon as you hit any kind of encryption: you either reverse the encryption or you simulate human actions. Below I share a small Selenium project. When you open the site's home page, a prompt like this appears, and the usual request-based approach won't get you the text data you want, so you have to simulate a human clicking the confirm button.

What we ultimately extract is the enterprise data shown in the page.

Straight to the code. It uses time.sleep intervals to wait for the data to load; in practice WebDriverWait is the better way to wait (see the sketch after the code). This is just a small demo I wrote to walk through the flow.
from selenium import webdriver
import time
import random
import csv
import codecs
# Load the enterprise-data home page
url = "http://42.123.101.210:8088/gzzhxt/"
browser = webdriver.Chrome()
browser.maximize_window()  # maximize the browser window
browser.get(url)
time.sleep(2)

search_btn = browser.find_element_by_css_selector('#aBtndiv_alert_form_')
search_btn.click()  # click the confirm button on the prompt
time.sleep(random.random() * 3)

qiye_Data = browser.find_element_by_css_selector('#enterprise_tab')
qiye_Data.click()  # click the "enterprise data" tab
time.sleep(random.random() * 4)

for page in range(5):  # save the first five pages of data
    print("Scraping page {0}".format(page + 1))
    browser.execute_script('window.scrollTo(0,document.body.scrollHeight);')  # scroll to the bottom so the table loads
    time.sleep(random.random() * 3 + 1)
    ls = browser.find_elements_by_xpath('//div[@id="divLstGrid__weblistenterpriseLib_Box__"]/table/tbody//tr')
    print('len:', len(ls))
    for each in ls:
        datas = []
        num = each.find_element_by_xpath("./td[2]/div").get_attribute("innerText")
        name = each.find_element_by_xpath("./td[3]/div").get_attribute("innerText")
        # empty cells become None ("" is falsy), without reading each cell twice
        daibiao = each.find_element_by_xpath("./td[4]/div").get_attribute("innerText") or None
        data = each.find_element_by_xpath("./td[5]/div").get_attribute("innerText") or None
        adress_city = each.find_element_by_xpath("./td[6]/div").get_attribute("innerText") or None
        adress_province = each.find_element_by_xpath("./td[7]/div").get_attribute("innerText") or None
        datas.append(num)
        datas.append(name)
        datas.append(daibiao)  # was printed but never saved in the original
        datas.append(data)
        datas.append(adress_city)
        datas.append(adress_province)
        print(num, name, daibiao, data, adress_city, adress_province)
        with codecs.open("./datas_ls.csv", "a", encoding="utf-8") as file:
            wr = csv.writer(file)
            wr.writerow(datas)
        time.sleep(random.random())
    # turn the page
    time.sleep(random.random() * 3)
    next_page_btn = browser.find_elements_by_css_selector('#weblistenterpriseLib_Box_btnNextPage')
    if len(next_page_btn) > 0:
        next_page_btn = next_page_btn[0]
        print('next page....')
        next_page_btn.click()
    else:
        break  # no next-page button, so this was the last page
browser.quit()
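
As noted above, WebDriverWait is more robust than fixed time.sleep calls: it polls the DOM and returns as soon as the condition holds. Below is a minimal sketch of the confirm-click and the row lookup rewritten with explicit waits; the selectors are the same ones used above, and the 10-second timeout is an arbitrary choice. It uses the find_element(By, ...) style, which is also what Selenium 4 requires now that the find_element_by_* helpers are removed.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()
browser.get("http://42.123.101.210:8088/gzzhxt/")
wait = WebDriverWait(browser, 10)  # poll the DOM for up to 10 seconds

# Block until the confirm button is actually clickable, then click it.
search_btn = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#aBtndiv_alert_form_')))
search_btn.click()

# The same pattern replaces the sleeps before reading the table:
# wait until at least one row is present in the grid.
rows = wait.until(EC.presence_of_all_elements_located(
    (By.XPATH, '//div[@id="divLstGrid__weblistenterpriseLib_Box__"]/table/tbody//tr')))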
To keep the data in a friendlier format, convert the CSV to Excel (I covered this in an earlier post).
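
A minimal sketch of the conversion with pandas, assuming pandas and openpyxl are installed; the column names here are just the variable names from the script above, since the CSV is written without a header row:

import pandas as pd

# Read the headerless CSV produced by the scraper and write it out as .xlsx.
df = pd.read_csv("./datas_ls.csv", header=None,
                 names=["num", "name", "daibiao", "data", "adress_city", "adress_province"])
df.to_excel("./datas_ls.xlsx", index=False)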
