The data on our internal company website can't be downloaded, of all things, so the only option was to scrape it myself. Luckily the pages are static and fairly simple; the only mildly annoying part was getting past the captcha. The code is as follows:
import re
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import pandas as pd
from tqdm import tqdm
from urllib import request
from selenium.webdriver.common.action_chains import ActionChains  # mouse-event helpers (imported but not used below)
import warnings
warnings.filterwarnings("ignore")

def loginSys(username, password):
    '''
    Log in.
    Use webdriver to drive the browser and operate the page directly;
    Chrome is used here.
    '''
    print('Start logging in')
    target = 'https://home.sudiyi.cn/'
    driver = webdriver.Chrome()
    driver.get(target)
    driver.implicitly_wait(0.1)
    vcode = input("vcode:")  # the captcha is recognised manually and typed in here
    driver.find_element_by_name('username').send_keys(username)  # locate the username field and fill it in
    time.sleep(0.2)  # short pause to mimic a real user and lower the chance of being detected
    driver.find_element_by_name('password').send_keys(password)  # locate the password field and fill it in
    time.sleep(0.2)
    driver.find_element_by_name('captcha').send_keys(vcode)  # locate the captcha field and fill it in
    driver.implicitly_wait(0.2)
    driver.find_element_by_class_name("foot").click()  # click the login button
    driver.implicitly_wait(10)
    data = sniffData(driver, target)
    pd.DataFrame(data).to_csv(r'C:\Users\lenovo\Desktop\kuaidiyuan.csv')
    # scraping finished, close the browser
    time.sleep(0.2)
    driver.close()

def sniffData(driver, target):
    # Wait a moment before parsing, otherwise what gets parsed is still the login page.
    # selenium is only used to get into the site; once we have the cookies it is no longer
    # needed, so avoid using it for further page navigation.
    total = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    }
    cookies = {'user_name': 'chenrui+%E9%99%88%E7%9D%BF',
               'sudiyi.cas': 'BAh7DEkiD3Nlc3Npb25faWQGOgZFVEkiRWZhZjA4ZTA3OGQ5NmZiMTM2NTdj%0AZTYzMjE3YWJkZWQzNGI0MjNhYzU5YjE5Y2NhMTM4YWQ3NjdlMmVjNTdlOWUG%0AOwBGSSIIdWlkBjsARmkDj6wESSIJbmFtZQY7AEZJIgvpmYjnnb8GOwBUSSIN%0AdXNlcm5hbWUGOwBGSSIMY2hlbnJ1aQY7AFRJIgt0aWNrZXQGOwBGSSIlNjYz%0AY2Y2M2EwZTI1ZWEwM2MyOGFjZWY4MGEzYWRjMTQGOwBGSSILYXZhdGFyBjsA%0ARkkiLGh0dHA6Ly9jZG4uY20uc3VkaXlpLmNuL2Nhcy9kZWZhdWx0LnBuZwY7%0AAFRJIg1tb2RfbGlzdAY7AEZbCnsJOgdpZGkGOgluYW1lSSITdXNlcl9jZW50%0AZXJfbXMGOwBUOgxuYW1lX3poSSIX55So5oi3566h55CG57O757ufBjsAVDoI%0AdXJsSSIaaHR0cDovL3VjbXMuc3VkaXlpLmNuBjsAVHsJOwZpCDsHSSIVc3Vk%0AaXlpX29wZXJhdGlvbgY7AFQ7CEkiF%2Bi%2FkOe7tOeuoeeQhuezu%2Be7nwY7AFQ7%0ACUkiGWh0dHA6Ly9vbXMuc3VkaXlpLmNuBjsAVHsJOwZpGTsHSSIdc3VkaXlp%0AX2J1c2luZXNzX2FuYWx5c2lzBjsAVDsISSIa5paw57uP6JCl5YiG5p6Q57O7%0A57ufBjsAVDsJSSIbaHR0cHM6Ly9iYXNzLnN1ZGl5aS5jbgY7AFR7CTsGaRs7%0AB0kiDG5ld19ibXMGOwBUOwhJIhrmlrDkuJrliqHnrqHnkIbns7vnu58GOwBU%0AOwlJIiNodHRwOi8vbmV3Ym1zLnN1ZGl5aS5jbi9hZG1pbi8GOwBUewk7Bmkc%0AOwdJIgxNT05JVE9SBjsAVDsISSId5Lia5Yqh5pWw5o2u5a6e5pe255yL5p2%2F%0ABjsAVDsJSSIeaHR0cHM6Ly9tb25pdG9yLnN1ZGl5aS5jbgY7AFQ%3D%0A--015a53c1d5b7d35804393315fa41d88617c50db4'}
    for j in tqdm(range(1, 72853)):
        target_url = 'https://ucms.sudiyi.cn/admin/couriers?page=' + str(j)
        try:
            req = requests.get(url=target_url, headers=headers, cookies=cookies, verify=False)
        except Exception as e:
            print('Problem in the first request:', e)
            pd.DataFrame(total).to_csv(r'C:\Users\lenovo\Desktop\kuaidiyuan.csv')  # save what we have before bailing out
            exit()
        else:
            req.encoding = 'utf-8'
            html = req.text
            div_bf = BeautifulSoup(html, "lxml")
            a = BeautifulSoup(str(div_bf.find_all(class_='table table-bordered')), "lxml")
            td_ = a.find_all('td')
            for i in range(10, 201, 10):  # 20 courier rows per page, 10 <td> cells per row
                dict_ = {}
                list_one = td_[i - 10:i]
                # print(list_one)
                dict_['姓名'] = str(list_one[1].string)    # name
                dict_['手机号'] = str(list_one[2].string)   # phone number
                dict_['快递品牌'] = str(list_one[4].string)  # courier brand
                dict_['城市'] = str(list_one[5].string)     # city
                target_url_last = 'https://ucms.sudiyi.cn' + BeautifulSoup(str(list_one[9]), "lxml").find_all('a')[0].get('href')
                # print(target_url_last)
                try:
                    req = requests.get(url=target_url_last, headers=headers, cookies=cookies, verify=False)
                except Exception as e:
                    print('Problem in the second request:', e)
                    pd.DataFrame(total).to_csv(r'C:\Users\lenovo\Desktop\kuaidiyuan.csv')
                    exit()
                else:
                    req.encoding = 'utf-8'
                    html = req.text
                    div_bf = BeautifulSoup(html, "lxml")
                    shenfenzheng = div_bf.find_all(text=re.compile(r"\d{17}\S"))  # ID-card number: 17 digits plus a final digit or X
                    if shenfenzheng:
                        shenfenzheng = shenfenzheng[0]
                    else:
                        shenfenzheng = '数据缺失'  # data missing
                    dict_['身份证号'] = shenfenzheng  # ID-card number
                    # print(dict_)
                    total.append(dict_)
    return total

if __name__ == "__main__":
    username = "nicaicaikan"
    password = "wobucai"
    loginSys(username, password)
Selenium is only used to get past the login step, and since there is a fair amount of page-hopping once inside the site, we switch to plain requests plus BeautifulSoup to fetch and parse the pages after logging in.
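In the script above the cookies dictionary was pasted in by hand from the browser. A more general approach (a minimal sketch, not part of the original script; session_from_driver is a hypothetical helper) is to read the cookies straight out of the logged-in Selenium session and hand them to a requests.Session:

import requests

def session_from_driver(driver):
    """Copy cookies from a logged-in Selenium driver into a requests session.

    Assumes `driver` has already completed the login flow shown above.
    """
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    })
    for cookie in driver.get_cookies():  # list of dicts like {'name': ..., 'value': ..., ...}
        session.cookies.set(cookie['name'], cookie['value'])
    return session

# usage, right after loginSys() has clicked the login button:
# session = session_from_driver(driver)
# resp = session.get('https://ucms.sudiyi.cn/admin/couriers?page=1', verify=False)

This way the cookie values never have to be copied manually, and they stay valid for whatever session the browser actually established.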
The results look like this:
Beautiful Soup supports the HTML parser in the Python standard library as well as several third-party parsers. If a third-party parser isn't installed, it falls back to Python's default parser; the lxml parser is more powerful and faster, so installing it is recommended.
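The parser is simply the second argument to the BeautifulSoup constructor. A tiny illustration (assuming lxml has been installed with pip install lxml; the sample HTML is made up):

from bs4 import BeautifulSoup

html = "<table class='table table-bordered'><tr><td>demo</td></tr></table>"

soup_default = BeautifulSoup(html, "html.parser")  # standard-library parser, no extra install
soup_lxml = BeautifulSoup(html, "lxml")            # third-party lxml parser, faster on large pages

print(soup_default.td.string, soup_lxml.td.string)  # both yield the same content: demo demo

Both objects expose the same API; only the parsing backend (and its speed) differs.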
With Python's default parser the program would have taken roughly 394 hours to finish; switching to the lxml parser brings that down to about 284 hours. Still very slow, but that cuts the run time by roughly 28%.
As you can see, the speed is painful: roughly 284 hours to finish the crawl, which is far too slow to be useful. So the next step is to use multithreading together with lxml to speed things up.
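As a preview, here is a minimal sketch of how the page fetching could be parallelised with a thread pool. It is only an illustration under assumptions: the same headers and cookies dictionaries as in sniffData, and hypothetical helper names (fetch_page, crawl_concurrently); the BeautifulSoup parsing step would still be applied to each returned page.

from concurrent.futures import ThreadPoolExecutor, as_completed
import requests

def fetch_page(j, headers, cookies):
    """Download one listing page; parsing is left to the caller."""
    url = 'https://ucms.sudiyi.cn/admin/couriers?page=' + str(j)
    resp = requests.get(url, headers=headers, cookies=cookies, verify=False, timeout=30)
    resp.encoding = 'utf-8'
    return j, resp.text

def crawl_concurrently(headers, cookies, last_page=72852, workers=16):
    """Fetch all listing pages with a thread pool instead of a sequential loop."""
    results = {}
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = [pool.submit(fetch_page, j, headers, cookies) for j in range(1, last_page + 1)]
        for future in as_completed(futures):
            j, html = future.result()
            results[j] = html  # hand each page to the BeautifulSoup parsing step afterwards
    return results

Because the work is dominated by waiting on HTTP responses, threads help a lot here even with the GIL; how many workers the server tolerates is something to test carefully.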