爬取企查查数据

最新推荐文章于 2024-06-07 09:46:02 发布
wuqi356
最新推荐文章于 2024-06-07 09:46:02 发布
阅读量1.3w
点赞数 6
分类专栏： python
本文链接：https://blog.csdn.net/wuqi356/article/details/97131962
版权
python 专栏收录该内容
14 篇文章 0 订阅
订阅专栏
爬取企查查数据用的代码，搜索信息，企查查最多爬100条，没有vip的话。
1.注意点就是会因为请求多的话，会导致请求不了。改cookies吧
#-*- coding-8 -*-
import requests
import lxml
import sys
from bs4 import BeautifulSoup
import xlwt
import time
import urllib
import random
from pyquery import PyQuery as pq
 
def get_user_agent():
    user_agent_list = ["Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"]
    uer_agent = random.choice(user_agent_list)
    return uer_agent

def get_ip():
    list = ["112.1.22.111", "200.34.98.11", "99.200.23.10","122.234.143.15","122.234.143.17","122.234.143.63",'1.0.1.0',
'1.0.2.0',
'1.0.8.0',
'1.0.32.0',
'1.1.0.0',
'1.1.2.0',
'1.1.4.0',
'1.1.8.0',
'1.1.16.0',
'1.1.32.0',
'1.2.0.0',
'1.2.2.0',
'1.2.5.0',
'1.2.6.0',
'1.2.8.0',
'1.2.16.0',
'1.2.32.0',
'1.2.64.0',
'1.3.0.0',
'1.4.1.0',
'1.4.2.0',
'1.4.4.0',
'1.4.8.0',
'1.4.16.0',
'1.4.32.0',
'1.4.64.0',
'1.8.0.0',
'1.8.64.0',
'1.8.96.0',
'1.8.100.0',
'1.8.112.0',
'1.8.128.0',
'1.8.144.0',
'1.8.148.0',
'1.8.154.0',
'1.8.156.0',
'1.8.160.0',
'1.8.192.0',
'1.8.224.0',
'1.8.244.0',
'1.8.248.0',
'1.10.0.0',
'1.10.8.0',
'1.10.11.0',
'1.10.12.0',
'1.10.16.0',
'1.10.32.0',
'1.10.64.0',
'1.12.0.0',
'1.24.0.0',
'1.45.0.0',
'1.48.0.0',
'1.56.0.0',
'1.68.0.0',
'1.80.0.0',
'1.116.0.0',
'1.180.0.0',
'1.184.0.0',
'1.188.0.0',
'1.192.0.0',
'1.202.0.0',
'1.204.0.0',
'1.213.105.0',
'12.118.130.0',
'12.126.40.0',
'14.0.0.0',
'14.0.12.0',
'14.1.0.0',
'14.1.24.0',
'14.1.108.0',
'14.16.0.0',
'14.102.128.0',
'14.102.180.0',
'14.103.0.0',
'14.104.0.0',
'14.112.0.0',
'14.130.0.0',
'14.134.0.0',
'14.144.0.0',
'14.192.56.0',
'14.192.76.0',
'14.196.0.0',
'14.204.0.0',
'14.208.0.0',
'20.134.160.0',
'20.139.160.0',
'27.0.128.0',
'27.0.160.0',
'27.0.188.0',
'27.8.0.0',
'27.16.0.0',
'27.34.232.0',
'27.36.0.0',
'27.40.0.0',
'27.50.40.0',
'27.50.128.0',
'27.54.72.0',
'27.54.152.0',
'27.54.192.0',
'27.98.208.0',
'27.98.224.0',
'27.99.128.0',
'27.103.0.0',
'27.106.128.0',
'27.106.204.0',
'27.109.32.0',
'27.109.124.0',
'27.112.0.0',
'27.112.80.0',
'27.112.112.0',
'27.113.128.0',
'27.115.0.0',
'27.116.44.0',
'27.121.72.0',
'27.121.120.0',
'27.128.0.0',
'27.131.220.0',
'27.144.0.0',
'27.148.0.0',
'27.152.0.0',
'27.184.0.0',
'27.192.0.0',
'27.224.0.0',
'36.0.0.0',
'36.0.16.0',
'36.0.32.0',
'36.0.64.0',
'36.0.128.0',
'36.1.0.0',
'36.4.0.0',
'36.16.0.0',
'36.32.0.0',
'36.36.0.0',
'36.37.0.0',
'36.37.36.0',
'36.37.39.0',
'36.37.40.0',
'36.37.48.0',
'36.40.0.0',
'36.48.0.0',
'36.51.0.0',
'36.51.128.0',
'36.51.192.0',
'36.51.224.0',
'36.51.240.0',
'36.51.248.0',
'36.51.252.0',
'36.56.0.0',
'36.96.0.0',
'36.128.0.0',
'36.192.0.0',
'36.248.0.0',
'36.254.0.0',
'36.255.116.0',
'36.255.128.0',
'36.255.164.0',
'36.255.172.0',
'36.255.176.0',
'39.0.0.0',
'39.0.2.0',
'39.0.4.0',
'39.0.8.0',
'39.0.16.0',
'39.0.32.0',
'39.0.64.0',
'39.0.128.0',
'39.64.0.0',
'39.96.0.0',
'39.104.0.0',
'39.108.0.0',
'39.128.0.0',
'40.0.176.0',
'40.0.247.0',
'40.0.248.0',
'40.0.252.0',
'40.0.255.0',
'40.72.0.0',
'40.125.128.0',
'40.126.64.0',
'40.198.10.0',
'40.198.16.0',
'40.198.24.0',
'40.251.225.0',
'40.251.227.0',
'42.0.0.0',
'42.0.8.0',
'42.0.16.0',
'42.0.24.0',
'42.0.32.0',
'42.0.128.0',
'42.0.160.0',
'42.0.176.0',
'42.0.184.0',
'42.0.186.0',
'42.0.188.0',
'42.0.192.0',
'42.0.208.0',
'42.0.216.0',
'42.0.220.0',
'42.0.223.0',
'42.0.224.0',
'42.1.0.0',
'42.1.32.0',
'42.1.48.0',
'42.1.56.0',
'42.4.0.0',
'42.48.0.0',
'42.56.0.0',
'42.62.0.0',
'42.62.128.0',
'42.62.160.0',
'42.62.180.0',
'42.62.184.0',
'42.63.0.0',
'42.80.0.0',
'42.83.64.0',
'42.83.80.0',
'42.83.88.0',
'42.83.96.0',
'42.83.128.0',
'42.83.134.0',
'42.83.140.0',
'42.83.142.0',
'42.83.144.0',
'42.83.160.0',
'42.83.192.0',
'42.84.0.0',
'42.88.0.0',
'42.96.64.0',
'42.96.96.0',
'42.96.108.0',
'42.96.112.0',
'42.96.128.0',
'42.97.0.0',
'42.99.0.0',
'42.99.64.0',
'42.99.96.0',
'42.99.112.0',
'42.99.120.0',
'42.100.0.0',
'42.120.0.0',
'42.122.0.0',
'42.123.0.0',
'42.123.36.0',
'42.123.40.0',
'42.123.48.0',
'42.123.64.0',
'42.123.128.0',
'42.123.160.0',
'42.123.164.0',
'42.123.166.0',
'42.123.168.0',
'42.123.176.0',
'42.123.192.0',
'42.128.0.0',
'42.156.0.0',
'42.156.36.0',
'42.156.40.0',
'42.156.48.0',
'42.156.64.0',
'42.156.128.0',
'42.157.0.0',
'42.158.0.0',
'42.160.0.0',
'42.176.0.0',
'42.184.0.0',
'42.186.0.0',
'42.187.0.0',
'42.187.64.0',
'42.187.96.0',
'42.187.112.0',
'42.187.120.0',
'42.187.128.0',
'42.192.0.0',
'42.201.0.0',
'42.202.0.0',
'42.204.0.0',
'42.208.0.0',
'42.224.0.0',
'42.240.0.0',
'42.242.0.0',
'42.244.0.0',
'42.248.0.0',
'43.224.12.0',
'43.224.24.0',
'43.224.44.0',
'43.224.52.0',
'43.224.56.0',
'43.224.64.0',
'43.224.72.0',
'43.224.80.0',
'43.224.100.0',
'43.224.144.0',
'43.224.160.0',
'43.224.176.0',
'43.224.184.0',
'43.224.200.0',
'43.224.208.0',
'43.224.216.0',
'43.224.240.0',
'43.225.76.0',
'43.225.84.0',
'43.225.120.0',
'43.225.180.0',
'43.225.184.0',
'43.225.208.0',
'43.225.216.0',
'43.225.224.0',
'43.225.240.0',
'43.225.252.0',
'43.226.32.0',
'43.226.64.0',
'43.226.96.0',
'43.226.112.0',
'43.226.120.0',
'43.226.128.0',
'43.226.160.0',
'43.226.236.0',
'43.226.240.0',
'43.227.0.0',
'43.227.8.0',
'43.227.32.0',
'43.227.64.0',
'43.227.104.0',
'43.227.136.0',
'43.227.144.0',
'43.227.152.0',
'43.227.160.0',
'43.227.176.0',
'43.227.188.0',
'43.227.192.0',
'43.227.232.0',
'43.227.248.0',
'43.228.0.0',
'43.228.64.0',
'43.228.76.0',
'43.228.100.0',
'43.228.116.0',
'43.228.132.0',
'43.228.136.0',
'43.228.148.0',
'43.228.152.0',
'43.228.188.0',
'43.229.40.0',
'43.229.56.0',
'43.229.96.0',
'43.229.136.0',
'43.229.168.0',
'43.229.176.0',
'43.229.192.0',
'43.229.216.0',
'43.229.232.0',
'43.230.20.0',
'43.230.32.0',
'43.230.68.0',
'43.230.72.0',
'43.230.84.0',
'43.230.124.0',
'43.230.220.0',
'43.230.224.0',
'43.231.12.0',
'43.231.32.0',
'43.231.80.0',
'43.231.96.0',
'43.231.136.0',
'43.231.144.0',
'43.231.160.0',
'43.231.176.0',
'43.236.0.0',
'43.238.0.0',
'43.239.0.0',
'43.239.32.0',
'43.239.48.0',
'43.239.116.0',
'43.239.120.0',
'43.239.172.0',
'43.240.0.0',
'43.240.56.0',
'43.240.68.0',
'43.240.72.0',
'43.240.84.0',
'43.240.124.0',
'43.240.128.0',
'43.240.136.0',
'43.240.156.0',
'43.240.160.0',
'43.240.192.0',
'43.240.240.0',
'43.241.0.0',
'43.241.16.0',
'43.241.48.0',
'43.241.76.0',
'43.241.80.0',
'43.241.112.0',
'43.241.168.0',
'43.241.176.0',
'43.241.184.0',
'43.241.208.0',
'43.241.224.0',
'43.241.240.0',
'43.241.248.0',
'43.242.8.0',
'43.242.16.0',
'43.242.48.0',
'43.242.64.0',
'43.242.72.0',
'43.242.80.0',
'43.242.96.0',
'43.242.144.0',
'43.242.160.0',
'43.242.180.0'
]
    ip = random.choice(list)
    return ip


def craw(url,key_word,x):
#    if x == 0:
#        re = 'http://www.qichacha.com/search?key='+key_word
#    else:
#        re = 'https://www.qichacha.com/search?key={}#p:{}&'.format(key_word,x-1)
    re = r'https://www.qichacha.com/search?key='+key_word
    headers = {
            'Host':'www.qichacha.com',
            'Connection': 'keep-alive',
            'Accept':r'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'X-Requested-With': 'XMLHttpRequest',
            'User-Agent':get_user_agent(),
            'Referer': re,
            'X-Forwarded-For': get_ip(),
            'Accept-Encoding':'gzip, deflate, br',
            'Accept-Language':'zh-CN,zh;q=0.9',
            'Cookie':r'QCCSESSID=ii39oo0rtltvogj0bq5ur1mul3; zg_did=%7B%22did%22%3A%20%2216c07be94cb9e-05ecf112418431-454c092b-1fa400-16c07be94cc690%22%7D; zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201563498026192%2C%22updated%22%3A%201563498026197%2C%22info%22%3A%201563498026194%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22www.qichacha.com%22%7D; hasShow=1',
            }
 
    try:
        response = requests.get(url,headers = headers)
        if response.status_code != 200:
            response.encoding = 'utf-8'
            print(response.status_code)
            print('ERROR')    
        soup = pq(response.text)
        com_all_info = soup.find(".m_srchList")
        
    except Exception:
        print('请求都不让，这企查查是想逆天吗？？？')
    try:
        com_all_info_array = com_all_info.find("tr")
        print('开始爬取数据，请勿打开excel')
        for tr in com_all_info_array.items():
            mtxs = tr.find(".m-t-xs")
            temp_g_name = tr.find(".ma_h1").text()    #获取公司名
            temp_g_tag = tr.find("p").eq(3).text().split(u"品牌/产品：")    #获取公司标签
            print(temp_g_tag)
            temp_r_name = mtxs.eq(0).find("a").text()    #获取法人名
            temp_g_money = mtxs.eq(0).find(".m-l").eq(0).text().replace(u"注册资本：", "")   #获取注册资本
            temp_g_date = mtxs.eq(0).find(".m-l").eq(1).text().replace(u"成立日期：", "")    #获取公司注册时间
            two = mtxs.eq(1).text().split(u"电话：")
            temp_r_email = two[0]   #获取法人Email
            temp_r_phone = two[1]    #获取法人手机号
            temp_g_addr = mtxs.eq(2).text()    #获取公司地址
            temp_g_state = tr.find(".nstatus").text()  #获取公司状态
                
            g_name_list.append(temp_g_name)
            g_tag_list.append(temp_g_tag)
            r_name_list.append(temp_r_name)
            g_money_list.append(temp_g_money)
            g_date_list.append(temp_g_date)
            r_email_list.append(temp_r_email)
            r_phone_list.append(temp_r_phone)
            g_addr_list.append(temp_g_addr)
            g_state_list.append(temp_g_state)
             
#            except Exception:
#                print('错误！')
    except Exception:
        print('好像被拒绝访问了呢...请稍后再试叭...')
         
if __name__ == '__main__':
    global g_name_list
    global g_tag_list
    global r_name_list
    global g_money_list
    global g_date_list
    global r_email_list
    global r_phone_list
    global g_addr_list
    global g_state_list
     
    g_name_list=[]
    g_tag_list=[]
    r_name_list=[]
    g_money_list=[]
    g_date_list=[]
    r_email_list=[]
    r_phone_list=[]
    g_addr_list=[]
    g_state_list=[]
    print(g_name_list)
    key_word = input('请输入您想搜索的关键词：')
    num = int(input('请输入您想检索的次数：'))+1
    sleep_time = int(input('请输入每次检索延时的秒数：'))
     
    key_word = urllib.parse.quote(key_word)
     
    print('正在搜索，请稍后')
     
    for x in range(1,num):
        url = r'https://www.qichacha.com/search_index?key={}&ajaxflag=1&p={}&'.format(key_word,x)
        s1 = craw(url,key_word,x)
        time.sleep(sleep_time)
    workbook = xlwt.Workbook()
    #创建sheet对象，新建sheet
    sheet1 = workbook.add_sheet('企查查数据', cell_overwrite_ok=True)
    #---设置excel样式---
    #初始化样式
    style = xlwt.XFStyle()
    #创建字体样式
    font = xlwt.Font()
    font.name = '仿宋'
#    font.bold = True #加粗
    #设置字体
    style.font = font
    #使用样式写入数据
    print('正在存储数据，请勿打开excel')
    #向sheet中写入数据
    name_list = ['公司名字','品牌/产品','法定法人','注册资本','成立日期','法人邮箱','法人电话','公司地址','公司状态']
    for cc in range(0,len(name_list)):
        sheet1.write(0,cc,name_list[cc],style)
    for i in range(0,len(g_name_list)):
        print(g_name_list[i])
        sheet1.write(i+1,0,g_name_list[i],style)#公司名字
        sheet1.write(i+1,1,g_tag_list[i],style)#公司标签
        sheet1.write(i+1,2,r_name_list[i],style)#法定法人
        sheet1.write(i+1,3,g_money_list[i],style)#注册资本
        sheet1.write(i+1,4,g_date_list[i],style)#成立日期
        sheet1.write(i+1,5,r_email_list[i],style)#法人邮箱
        sheet1.write(i+1,6,r_phone_list[i],style)#法人电话
        sheet1.write(i+1,7,g_addr_list[i],style)#公司地址
        sheet1.write(i+1,8,g_state_list[i],style)#公司状态
    #保存excel文件，有同名的直接覆盖
    workbook.save(r"D:\test .xls")
    print('保存完毕~')
wuqi356
关注
6
点赞
踩
33

收藏

觉得还不错? 一键收藏
0
评论
爬取企查查数据

爬取企查查数据用的代码，搜索信息，企查查最多爬100条，没有vip的话。1.注意点就是会因为请求多的话，会导致请求不了。改cookies吧#-*- coding-8 -*-import requestsimport lxmlimport sysfrom bs4 import BeautifulSoupimport xlwtimport timeimport urllib...
复制链接

扫一扫