企查查之企业信息查询自动化

最新推荐文章于 2024-08-08 07:35:57 发布

°千山

最新推荐文章于 2024-08-08 07:35:57 发布

阅读量5.6k

点赞数 6

分类专栏： Python 文章标签： python selenium chrome 数据库

本文链接：https://blog.csdn.net/Tangjes/article/details/106386304

版权

Python 专栏收录该内容

1 篇文章 0 订阅

订阅专栏

1、今天主管提了个需求，需要查询上海市公布的一批高新技术企业的基本信息，名单大概有2890个，名单地址：
http://stcsm.sh.gov.cn/gk/tzgq/gqgg/bsgqgg/jtgq/153798.htm

2、下载打开一看，pdf格式是这样子的，112页，这也不能直接用啊，

3、打开Chrome,搜索pdf 转 excel，成功将pdf 文件转换成了112个excel 表
继续搜索：python将Excel 表合并，csdn上大佬教程很多，天下代码一大抄，

不抄白不抄。

成功合并Excel表，整理丢到数据库，考虑实现方式。

4、理论上实现的方式：

登录网页版本实现的方式: 只要不停的检索公司名称即可得到公司信息

但是只有会员才能拿到全部信息，好的，去某宝花一块钱买一天的会员

近3000条信息，手工查询肯定不行，想想上代码吧。

5、代码实现，理论上人通过浏览器也可以实现，

模拟浏览器登录---脚本自动填充搜索---下载信息---退出，一顿操作猛如虎，开始上代码：

# coding=utf-8

import time
from selenium.webdriver import Chrome
from selenium.webdriver import ActionChains                 #模仿鼠标动作的
from selenium.webdriver import ChromeOptions

def login(username='13785607088', password='123456A'):
    """Open qcc.com, switch to password login and fill in the credentials.

    Starts Chrome with the ``enable-automation`` switch excluded, stores the
    browser in the module-level ``driver`` and types the account and password
    into the login form. The login button itself is not clicked here.

    Args:
        username: Account typed into the ``nameNormal`` input. The default is
            the value that used to be hard-coded in this function.
        password: Password typed into the ``pwdNormal`` input. The default is
            the value that used to be hard-coded in this function.
            NOTE(review): credentials should not live in source; pass them in
            from the caller instead of relying on these defaults.

    Returns:
        The element with id ``nc_2_n1z`` (the slider handle to be dragged).
    """
    global driver
    # Local import: the ``find_element_by_*`` helpers were removed in
    # Selenium 4.3, while ``find_element(By..., ...)`` works on 3.x and 4.x.
    from selenium.webdriver.common.by import By

    # Exclude the switch that makes Chrome show the "controlled by automated
    # test software" banner.
    option = ChromeOptions()
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    driver = Chrome(options=option)

    # Open the qcc.com landing page.
    driver.get('https://www.qcc.com/?utm_source=baidu&utm_medium=cpc&utm_term=pzsy')

    # Click the login entry in the page header.
    driver.find_element(By.XPATH, "//div/ul/li[14]/a/span").click()
    time.sleep(1)

    # NOTE(review): the value is discarded; this only reads the current window
    # handle and does not switch to anything.
    driver.current_window_handle

    # Switch the login panel to password login.
    driver.find_element(By.XPATH, '//*[@id="verifyLoginPanel"]/div[1]/div[2]').click()

    # Locate the account input and type the account.
    account_input = driver.find_element(By.ID, 'nameNormal')
    time.sleep(1.2)
    account_input.send_keys(username)

    # Locate the password input and type the password.
    password_input = driver.find_element(By.ID, 'pwdNormal')
    time.sleep(1)
    password_input.send_keys(password)

    # Locate the slider handle and hand it back to the caller.
    return driver.find_element(By.ID, 'nc_2_n1z')

def get_track(distance):
    """Build a list of integer step offsets that together cover *distance*.

    The motion is simulated in fixed time slices: it accelerates until 3/5 of
    the distance has been covered and decelerates afterwards.

    Each step is derived from the rounded cumulative position rather than by
    rounding every move on its own, and the final position is clamped to
    *distance*. The steps therefore sum to exactly ``round(distance)`` instead
    of drifting or overshooting.

    Args:
        distance: Total distance to cover.

    Returns:
        A list of non-negative ints; empty when *distance* is not positive.
    """
    track = []                      # integer offsets, one per time slice
    current = 0                     # exact (unrounded) position so far
    emitted = 0                     # sum of the offsets already in ``track``
    mid = distance * 3 / 5          # switch from accelerating to decelerating
    t = 0.2                         # length of one time slice
    v = 1                           # initial velocity
    while current < distance:
        # Accelerate at 4 before the threshold, decelerate at -3 after it.
        a = 4 if current < mid else -3
        v0 = v                      # velocity at the start of this slice
        v = v0 + a * t              # velocity at the end of this slice
        move = v0 * t + 1 / 2 * a * t * t
        # Clamp so the last slice lands exactly on the target.
        current = min(current + move, distance)
        step = round(current) - emitted
        track.append(step)
        emitted += step
    return track

def move_to_gap(slider, tracks):
    """Drag *slider* along *tracks*, release it and click the login button.

    Uses the module-level ``driver`` for every action.

    Args:
        slider: The element to press and drag.
        tracks: Iterable of horizontal offsets, applied one after another.
    """
    # Local import: ``find_element_by_id`` was removed in Selenium 4.3, while
    # ``find_element(By.ID, ...)`` works on 3.x and 4.x.
    from selenium.webdriver.common.by import By

    # Press and hold the slider.
    ActionChains(driver).click_and_hold(slider).perform()
    # Apply each offset as its own action so the drag happens step by step.
    for x in tracks:
        ActionChains(driver).move_by_offset(xoffset=x, yoffset=0).perform()
    time.sleep(0.5)
    ActionChains(driver).release().perform()
    time.sleep(1)
    # Click the login button.
    driver.find_element(By.ID, 'user_login_normal').click()

if __name__ == '__main__':
    # Fill in the login form first (this yields the slider element), then
    # build a track covering a distance of 308 and drag the slider along it.
    slider_element = login()
    drag_track = get_track(308)
    move_to_gap(slider_element, drag_track)

结果被企查查登录页面打脸，selenium 调用Chrome会被检测，滑块验证不通过

百度了一下，各种方式都试了，不太好解决。最后修改了一下代码，偷懒改用页面扫码登录。登录成功页面：

上代码：
'''from utils import config 
   from utils import del_Label 
   from utils import myslq_client
'''
其中上面这三行是连接数据库的基本配置文件，按照个人的喜好自己配置即可
完整代码：
# coding=utf-8

import time
from lxml import etree
from utils import config
from utils import del_Label
from utils import myslq_client
from selenium.webdriver import Chrome
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.keys import Keys


def _first_text(tree, path):
    """Return the first node matched by *path* in *tree*, or '' if none match."""
    hits = tree.xpath(path)
    return hits[0] if hits else ''


def getDetailInfo(data, k, count):
    """Extract the first search-result row from a parsed page and print it.

    A missing field yields an empty string instead of raising ``IndexError``,
    so a query without results no longer aborts the caller's whole batch.

    Args:
        data: Parsed page exposing ``.xpath()`` (the caller passes the result
            of ``etree.HTML``).
        k: The company name that was searched for; used only in the progress
            message.
        count: Counter supplied by the caller; the progress message prints
            ``count + 1``.

    Returns:
        None. The collected record and a progress line are printed.
    """
    # Every field lives in the third cell of the first result row.
    cell = '//*[@id="search-result"]/tr[1]/td[3]'

    # The name is split across several <em> text nodes; glue them together
    # and drop any commas.
    name = "".join(data.xpath(cell + '/a/em/text()')).replace(",", '')

    # NOTE(review): field meanings below are inferred from the variable names
    # (legal representative, registered capital, founding date) — confirm
    # against the page.
    faRen = _first_text(data, cell + '/p[1]/a/text()')
    zhuCe = _first_text(data, cell + '/p[1]/span[1]/text()')
    Ctime = _first_text(data, cell + '/p[1]/span[2]/text()')

    mail = "".join(data.xpath(cell + '/p[2]/text()'))
    mail = del_Label.Deltag2(str(mail))
    mail = "".join(mail)

    phone = ''.join(data.xpath(cell + '/p[2]/span/text()'))
    phone = del_Label.Deltag2(str(phone))

    address = ''.join(data.xpath(cell + '/p[3]/text()'))
    address = del_Label.Deltag2(str(address))

    record = {
        "name": name,
        "faRen": faRen,
        "zhuCe": zhuCe,
        "Ctime": Ctime,
        "mail": mail,
        "phone": phone,
        "address": address,
    }
    count = count + 1
    # config.qiChaCha(record)  would store the record in the MySQL database
    print(record)
    print("当前采集第{count}条，当前查询公司是：{company}".format(count=count, company=k))

def login():
    """Log in to qcc.com by QR code, then look up a batch of company names.

    Starts Chrome (stored in the module-level ``driver``), opens the login
    panel and pauses so the user can scan the QR code with the app. It then
    searches one hard-coded company, followed by the names returned by the
    ``companys`` query. Each result page is parsed with lxml and passed to
    ``getDetailInfo``.

    Any exception is printed and ends the run; nothing is returned.
    """
    global driver
    # Local import: the ``find_element_by_*`` helpers were removed in
    # Selenium 4.3, while ``find_element(By..., ...)`` works on 3.x and 4.x.
    from selenium.webdriver.common.by import By

    try:
        # Exclude the switch that makes Chrome show the "controlled by
        # automated test software" banner.
        option = ChromeOptions()
        option.add_experimental_option('excludeSwitches', ['enable-automation'])
        driver = Chrome(options=option)
        # Open qcc.com.
        driver.get('https://www.qcc.com/?utm_source=baidu&utm_medium=cpc&utm_term=pzsy')
        # Click the login entry in the page header.
        driver.find_element(By.XPATH, "//div/ul/li[14]/a/span").click()
        time.sleep(1)
        # NOTE(review): the value is discarded; this only reads the current
        # window handle and does not switch to anything.
        driver.current_window_handle
        driver.find_element(By.XPATH, '//*[@id="verifyLoginPanel"]/div[1]/div[2]').click()
        # Pause so the user can log in by scanning the QR code with the app.
        time.sleep(15)

        # After login: type the company into the home-page search box and
        # press Enter.
        search_box = driver.find_element(By.XPATH, '//*[@id="searchkey"]')
        search_box.clear()
        k = "上海欧朔智能包装科技有限公司"
        search_box.send_keys(k)
        search_box.send_keys(Keys.ENTER)
        # Grab the current page and parse it.
        # NOTE(review): page_source is read right after Enter with no wait —
        # confirm the result page has loaded by this point.
        s = driver.page_source
        data = etree.HTML(s)
        count = 0
        getDetailInfo(data, k, count)

        # NOTE(review): as above, this read has no effect on the session.
        driver.current_window_handle

        # Fetch the company names to look up from the database.
        # NOTE(review): neither the cursor nor the connection is closed here;
        # confirm what ``ff_task()`` returns before adding cleanup.
        db = myslq_client.ff_task()
        cursor = db.cursor()
        sql1 = "SELECT name  FROM companys limit 1,20;"
        cursor.execute(sql1)
        result = cursor.fetchall()
        count = 0
        for item in result:
            k = item[0]
            # Names of four characters or fewer are skipped.
            if len(k) <= 4:
                continue
            time.sleep(1)
            # The result page uses a different search box (id ``headerKey``).
            company_box = driver.find_element(By.ID, 'headerKey')
            company_box.clear()
            time.sleep(1)
            company_box.send_keys(k)
            company_box.send_keys(Keys.ENTER)
            # Grab the current page and parse it.
            s = driver.page_source
            data = etree.HTML(s)
            count += 1
            getDetailInfo(data, k, count)
    except Exception as e:
        # Top-level boundary of the script: report the failure and stop.
        print(e)
if __name__ == "__main__":
    # Script entry point: run the login and lookup flow.
    login()

程序运行结果：