企查查网站爬取
输入为包含企业名称的 CSV 文件,输出为包含对应企业名称、资金、地址、专利、范围、行业范围、股东、软著等信息的 CSV 文件。
# -*- coding: utf-8 -*-
from selenium import webdriver
import time
import sys,os
import imp
import csv
import pandas as pd
import re
from PIL import Image
from time import sleep
from io import BytesIO
from selenium.webdriver.common.action_chains import ActionChains
import time,random
# Module-level browser setup: everything below runs at import time and
# leaves a live Chrome session in the global `driver`, already pointed at the
# login page.

# Re-import the `sys` module.
# NOTE(review): this looks like a leftover of the Python 2
# `reload(sys); sys.setdefaultencoding(...)` idiom; the `imp` module is
# deprecated in Python 3 -- confirm whether this call is still needed.
imp.reload(sys)
# Chrome options: exclude the "enable-automation" switch and turn off the
# automation extension.
# NOTE(review): presumably done so the site is less likely to detect an
# automated browser -- confirm.
options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
# Starts the Chrome session that the rest of the script drives.
driver = webdriver.Chrome(options=options)
# Register a script via the DevTools command
# Page.addScriptToEvaluateOnNewDocument; the script redefines
# `navigator.webdriver` with a getter that returns `undefined`.
# This is issued before the first navigation below.
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
})
# Open the site's login page.
driver.get('https://www.qichacha.com/user_login')
def qichacha(inc_list,outputfile,city,username,password) :
tag = driver.find_element_by_xpath('//*[@id="normalLogin"]') #登录
tag.click()
tag = driver.find_element_by_xpath('//*[@class="btn-weibo m-l-xs"]') #微博登录
tag.click()
# 将用户名、密码注入
driver.find_element_by_id('userId').send_keys(username) #账号
driver.find_element_by_id('passwd').send_keys(password) #密码
time.sleep(3) # 休眠,人工完成验证步骤,等待程序单击“登录”
btn = driver.find_element_by_xpath('//*[@id="outer"]/div/div[2]/form/div/div[2]/div/p/a[1]').click() # 微博二维码扫描登录
time.sleep(5)
count = 0
#################################################
for i in range(len(inc_list)):
fid = inc_list["FID"][i]
txt = inc_list["name"][i]
if city in txt:
txts = txt
else:
txts = city+txt
time.sleep