爬取企查查公司URL

最新推荐文章于 2023-10-07 11:30:20 发布

凤枭香

最新推荐文章于 2023-10-07 11:30:20 发布

阅读量504

点赞数 1

分类专栏：爬虫 | 文章标签：爬虫 html python

本文链接：https://blog.csdn.net/yangzheng_520/article/details/120138048

版权

爬虫专栏收录该内容

3 篇文章 0 订阅

订阅专栏

import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By

# Module-level accumulator: login() appends every scraped company URL to it.
a = list()
def _sign_in(driver):
    """Open the qcc.com login page and submit the password-login form."""
    driver.delete_all_cookies()
    driver.get("https://www.qcc.com/weblogin?back=%2F")
    time.sleep(10)  # fixed wait after loading the login page

    panel = '/html/body/div[1]/div[3]/div/div[2]'
    form = panel + '/div[3]/form'

    # Click the "password login" link.
    driver.find_element(By.XPATH, panel + '/div[1]/div[2]/a').click()
    time.sleep(1)

    # Type the account name and password (placeholders; fill in real values).
    # NOTE(review): credentials are hard-coded in source.
    driver.find_element(By.XPATH, form + '/div[1]/input').send_keys("199......")
    driver.find_element(By.XPATH, form + '/div[2]/input').send_keys("13.......")

    # Drag the slider captcha handle 308 px to the right.
    slider = driver.find_element(By.XPATH, form + '/div[3]/div/div/div[1]/span')
    (
        ActionChains(driver)
        .click_and_hold(slider)
        .move_by_offset(xoffset=308, yoffset=0)
        .release()
        .perform()
    )

    time.sleep(2)
    # Click the login button.
    driver.find_element(By.XPATH, form + '/div[4]/button/strong').click()
    time.sleep(0.5)


def _collect_company_links(driver):
    """Walk the search result pages and append each company link to ``a``."""
    # One search URL template per keyword; "{}" is the result page number.
    url_a = [
        'https://www.qcc.com/web/search?key=%E6%B7%98%E5%AE%9D&p={}',  # Taobao
        'https://www.qcc.com/web/search?key=%E5%A9%9A%E5%BA%86&p={}',  # wedding services
        'https://www.qcc.com/web/search?key=%E6%8A%A5%E7%A4%BE&p={}',  # newspaper
        'https://www.qcc.com/web/search?key=%E7%A7%91%E6%8A%80&p={}',  # technology
        'https://www.qcc.com/web/search?key=%E7%94%B5%E5%AD%90&p={}',  # electronics
        'https://www.qcc.com/web/search?key=%E7%94%9F%E6%B4%BB&p={}',  # daily life
    ]
    row_xpath = ('/html/body/div[1]/div[2]/div[2]/div[4]/div/div[2]/div/table'
                 '/tr[{}]/td[3]/div/a[1]')

    num = 1  # running counter used only for the progress printout
    for template in url_a:
        for page in range(1, 6):
            driver.get(template.format(page))
            for row in range(1, 20):
                try:
                    link = driver.find_element(By.XPATH, row_xpath.format(row))
                except NoSuchElementException:
                    # This table row does not exist (or has no link); skip it.
                    continue
                href = link.get_attribute("href")
                print('第{}条----->>>'.format(num), href)
                num += 1
                a.append(href)
            time.sleep(5)  # pause between result pages


def login(driver):
    """Log in to qcc.com, scrape company URLs and save them to a CSV file.

    Signs in through the password-login form, then visits result pages 1-5
    of each hard-coded search keyword, appending the ``href`` of every
    company link found to the module-level list ``a``. Finally writes ``a``
    to ``E:\\movieinfo3.csv`` and closes the browser window.

    Args:
        driver: A Selenium WebDriver instance; it is closed on success.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If a login-form
            element cannot be located. Missing result rows are skipped
            rather than raised.
    """
    _sign_in(driver)
    _collect_company_links(driver)

    da = pd.DataFrame(a)
    # Raw string: "\m" is not a valid escape sequence in a normal literal.
    da.to_csv(r'E:\movieinfo3.csv')
    driver.close()


def main():
    """Start a configured Chrome browser and run the scrape once.

    The browser is always shut down afterwards, even when ``login`` raises.
    """
    # Container for Chrome start-up settings.
    option = webdriver.ChromeOptions()
    # Drop the "enable-automation" switch so the browser looks less like an
    # automated session.
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Hide the webdriver automation flag from page scripts.
    option.add_argument("--disable-blink-features=AutomationControlled")
    # Works around the "DevToolsActivePort file doesn't exist" start-up error.
    option.add_argument("--no-sandbox")
    # Was "--disable-dev-usage", which is not a Chrome switch; the usual
    # companion of --no-sandbox is --disable-dev-shm-usage.
    option.add_argument("--disable-dev-shm-usage")
    # Content setting 2 = block: do not load images.
    option.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})

    # NOTE(review): `executable_path` is the Selenium 3 way of pointing at
    # chromedriver; newer Selenium releases expect a Service object instead.
    # Confirm against the installed Selenium version.
    driver = webdriver.Chrome(executable_path=r"E:\chromedriver\chromedriver.exe", options=option)
    try:
        driver.set_page_load_timeout(15)  # page-load timeout in seconds
        login(driver)
    finally:
        # Always release the browser and the chromedriver process.
        driver.quit()



if __name__ == '__main__':
    # Browser-like request headers. Nothing visible in this file reads them;
    # kept as in the original script.
    headers = {}
    headers['user-agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36'
    main()

凤枭香

关注

1
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
爬取企查查公司URL

import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver import ActionChains
a = []
def login(driver):
    driver.delete_all_cookies()
    url = "https://www.qcc.com/weblogin?back=%2F" #https://www.qcc.com/weblogin?back=%2F
复制链接

扫一扫

专栏目录