爬取企查查数据

爬取企查查数据用的代码,搜索信息,企查查最多爬100条,没有vip的话。

1.注意点就是会因为请求多的话,会导致请求不了。改cookies吧 

#-*- coding-8 -*-
import requests
import lxml
import sys
from bs4 import BeautifulSoup
import xlwt
import time
import urllib
import random
from pyquery import PyQuery as pq
 
def get_user_agent():
    user_agent_list = ["Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"]
    uer_agent = random.choice(user_agent_list)
    return uer_agent

def get_ip():
    list = ["112.1.22.111", "200.34.98.11", "99.200.23.10","122.234.143.15","122.234.143.17","122.234.143.63",'1.0.1.0',
'1.0.2.0',
'1.0.8.0',
'1.0.32.0',
'1.1.0.0',
'1.1.2.0',
'1.1.4.0',
'1.1.8.0',
'1.1.16.0',
'1.1.32.0',
'1.2.0.0',
'1.2.2.0',
'1.2.5.0',
'1.2.6.0',
'1.2.8.0',
'1.2.16.0',
'1.2.32.0',
'1.2.64.0',
'1.3.0.0',
'1.4.1.0',
'1.4.2.0',
'1.4.4.0',
'1.4.8.0',
'1.4.16.0',
'1.4.32.0',
'1.4.64.0',
'1.8.0.0',
'1.8.64.0',
'1.8.96.0',
'1.8.100.0',
'1.8.112.0',
'1.8.128.0',
'1.8.144.0',
'1.8.148.0',
'1.8.154.0',
'1.8.156.0',
'1.8.160.0',
'1.8.192.0',
'1.8.224.0',
'1.8.244.0',
'1.8.248.0',
'1.10.0.0',
'1.10.8.0',
'1.10.11.0',
'1.10.12.0',
'1.10.16.0',
'1.10.32.0',
'1.10.64.0',
'1.12.0.0',
'1.24.0.0',
'1.45.0.0',
'1.48.0.0',
'1.56.0.0',
'1.68.0.0',
'1.80.0.0',
'1.116.0.0',
'1.180.0.0',
'1.184.0.0',
'1.188.0.0',
'1.192.0.0',
'1.202.0.0',
'1.204.0.0',
'1.213.105.0',
'12.118.130.0',
'12.126.40.0',
'14.0.0.0',
'14.0.12.0',
'14.1.0.0',
'14.1.24.0',
'14.1.108.0',
'14.16.0.0',
'14.102.128.0',
'14.102.180.0',
'14.103.0.0',
'14.104.0.0',
'14.112.0.0',
'14.130.0.0',
'14.134.0.0',
'14.144.0.0',
'14.192.56.0',
'14.192.76.0',
'14.196.0.0',
'14.204.0.0',
'14.208.0.0',
'20.134.160.0',
'20.139.160.0',
'27.0.128.0',
'27.0.160.0',
'27.0.188.0',
'27.8.0.0',
'27.16.0.0',
'27.34.232.0',
'27.36.0.0',
'27.40.0.0',
'27.50.40.0',
'27.50.128.0',
'27.54.72.0',
'27.54.152.0',
'27.54.192.0',
'27.98.208.0',
'27.98.224.0',
'27.99.128.0',
'27.103.0.0',
'27.106.128.0',
'27.106.204.0',
'27.109.32.0',
'27.109.124.0',
'27.112.0.0',
'27.112.80.0',
'27.112.112.0',
'27.113.128.0',
'27.115.0.0',
'27.116.44.0',
'27.121.72.0',
'27.121.120.0',
'27.128.0.0',
'27.131.220.0',
'27.144.0.0',
'27.148.0.0',
'27.152.0.0',
'27.184.0.0',
'27.192.0.0',
'27.224.0.0',
'36.0.0.0',
'36.0.16.0',
'36.0.32.0',
'36.0.64.0',
'36.0.128.0',
'36.1.0.0',
'36.4.0.0',
'36.16.0.0',
'36.32.0.0',
'36.36.0.0',
'36.37.0.0',
'36.37.36.0',
'36.37.39.0',
'36.37.40.0',
'36.37.48.0',
'36.40.0.0',
'36.48.0.0',
'36.51.0.0',
'36.51.128.0',
'36.51.192.0',
'36.51.224.0',
'36.51.240.0',
'36.51.248.0',
'36.51.252.0',
'36.56.0.0',
'36.96.0.0',
'36.128.0.0',
'36.192.0.0',
'36.248.0.0',
'36.254.0.0',
'36.255.116.0',
'36.255.128.0',
'36.255.164.0',
'36.255.172.0',
'36.255.176.0',
'39.0.0.0',
'39.0.2.0',
'39.0.4.0',
'39.0.8.0',
'39.0.16.0',
'39.0.32.0',
'39.0.64.0',
'39.0.128.0',
'39.64.0.0',
'39.96.0.0',
'39.104.0.0',
'39.108.0.0',
'39.128.0.0',
'40.0.176.0',
'40.0.247.0',
'40.0.248.0',
'40.0.252.0',
'40.0.255.0',
'40.72.0.0',
'40.125.128.0',
'40.126.64.0',
'40.198.10.0',
'40.198.16.0',
'40.198.24.0',
'40.251.225.0',
'40.251.227.0',
'42.0.0.0',
'42.0.8.0',
'42.0.16.0',
'42.0.24.0',
'42.0.32.0',
'42.0.128.0',
'42.0.160.0',
'42.0.176.0',
'42.0.184.0',
'42.0.186.0',
'42.0.188.0',
'42.0.192.0',
'42.0.208.0',
'42.0.216.0',
'42.0.220.0',
'42.0.223.0',
'42.0.224.0',
'42.1.0.0',
'42.1.32.0',
'42.1.48.0',
'42.1.56.0',
'42.4.0.0',
'42.48.0.0',
'42.56.0.0',
'42.62.0.0',
'42.62.128.0',
'42.62.160.0',
'42.62.180.0',
'42.62.184.0',
'42.63.0.0',
'42.80.0.0',
'42.83.64.0',
'42.83.80.0',
'42.83.88.0',
'42.83.96.0',
'42.83.128.0',
'42.83.134.0',
'42.83.140.0',
'42.83.142.0',
'42.83.144.0',
'42.83.160.0',
'42.83.192.0',
'42.84.0.0',
'42.88.0.0',
'42.96.64.0',
'42.96.96.0',
'42.96.108.0',
'42.96.112.0',
'42.96.128.0',
'42.97.0.0',
'42.99.0.0',
'42.99.64.0',
'42.99.96.0',
'42.99.112.0',
'42.99.120.0',
'42.100.0.0',
'42.120.0.0',
'42.122.0.0',
'42.123.0.0',
'42.123.36.0',
'42.123.40.0',
'42.123.48.0',
'42.123.64.0',
'42.123.128.0',
'42.123.160.0',
'42.123.164.0',
'42.123.166.0',
'42.123.168.0',
'42.123.176.0',
'42.123.192.0',
'42.128.0.0',
'42.156.0.0',
'42.156.36.0',
'42.156.40.0',
'42.156.48.0',
'42.156.64.0',
'42.156.128.0',
'42.157.0.0',
'42.158.0.0',
'42.160.0.0',
'42.176.0.0',
'42.184.0.0',
'42.186.0.0',
'42.187.0.0',
'42.187.64.0',
'42.187.96.0',
'42.187.112.0',
'42.187.120.0',
'42.187.128.0',
'42.192.0.0',
'42.201.0.0',
'42.202.0.0',
'42.204.0.0',
'42.208.0.0',
'42.224.0.0',
'42.240.0.0',
'42.242.0.0',
'42.244.0.0',
'42.248.0.0',
'43.224.12.0',
'43.224.24.0',
'43.224.44.0',
'43.224.52.0',
'43.224.56.0',
'43.224.64.0',
'43.224.72.0',
'43.224.80.0',
'43.224.100.0',
'43.224.144.0',
'43.224.160.0',
'43.224.176.0',
'43.224.184.0',
'43.224.200.0',
'43.224.208.0',
'43.224.216.0',
'43.224.240.0',
'43.225.76.0',
'43.225.84.0',
'43.225.120.0',
'43.225.180.0',
'43.225.184.0',
'43.225.208.0',
'43.225.216.0',
'43.225.224.0',
'43.225.240.0',
'43.225.252.0',
'43.226.32.0',
'43.226.64.0',
'43.226.96.0',
'43.226.112.0',
'43.226.120.0',
'43.226.128.0',
'43.226.160.0',
'43.226.236.0',
'43.226.240.0',
'43.227.0.0',
'43.227.8.0',
'43.227.32.0',
'43.227.64.0',
'43.227.104.0',
'43.227.136.0',
'43.227.144.0',
'43.227.152.0',
'43.227.160.0',
'43.227.176.0',
'43.227.188.0',
'43.227.192.0',
'43.227.232.0',
'43.227.248.0',
'43.228.0.0',
'43.228.64.0',
'43.228.76.0',
'43.228.100.0',
'43.228.116.0',
'43.228.132.0',
'43.228.136.0',
'43.228.148.0',
'43.228.152.0',
'43.228.188.0',
'43.229.40.0',
'43.229.56.0',
'43.229.96.0',
'43.229.136.0',
'43.229.168.0',
'43.229.176.0',
'43.229.192.0',
'43.229.216.0',
'43.229.232.0',
'43.230.20.0',
'43.230.32.0',
'43.230.68.0',
'43.230.72.0',
'43.230.84.0',
'43.230.124.0',
'43.230.220.0',
'43.230.224.0',
'43.231.12.0',
'43.231.32.0',
'43.231.80.0',
'43.231.96.0',
'43.231.136.0',
'43.231.144.0',
'43.231.160.0',
'43.231.176.0',
'43.236.0.0',
'43.238.0.0',
'43.239.0.0',
'43.239.32.0',
'43.239.48.0',
'43.239.116.0',
'43.239.120.0',
'43.239.172.0',
'43.240.0.0',
'43.240.56.0',
'43.240.68.0',
'43.240.72.0',
'43.240.84.0',
'43.240.124.0',
'43.240.128.0',
'43.240.136.0',
'43.240.156.0',
'43.240.160.0',
'43.240.192.0',
'43.240.240.0',
'43.241.0.0',
'43.241.16.0',
'43.241.48.0',
'43.241.76.0',
'43.241.80.0',
'43.241.112.0',
'43.241.168.0',
'43.241.176.0',
'43.241.184.0',
'43.241.208.0',
'43.241.224.0',
'43.241.240.0',
'43.241.248.0',
'43.242.8.0',
'43.242.16.0',
'43.242.48.0',
'43.242.64.0',
'43.242.72.0',
'43.242.80.0',
'43.242.96.0',
'43.242.144.0',
'43.242.160.0',
'43.242.180.0'
]
    ip = random.choice(list)
    return ip


def craw(url,key_word,x):
#    if x == 0:
#        re = 'http://www.qichacha.com/search?key='+key_word
#    else:
#        re = 'https://www.qichacha.com/search?key={}#p:{}&'.format(key_word,x-1)
    re = r'https://www.qichacha.com/search?key='+key_word
    headers = {
            'Host':'www.qichacha.com',
            'Connection': 'keep-alive',
            'Accept':r'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'X-Requested-With': 'XMLHttpRequest',
            'User-Agent':get_user_agent(),
            'Referer': re,
            'X-Forwarded-For': get_ip(),
            'Accept-Encoding':'gzip, deflate, br',
            'Accept-Language':'zh-CN,zh;q=0.9',
            'Cookie':r'QCCSESSID=ii39oo0rtltvogj0bq5ur1mul3; zg_did=%7B%22did%22%3A%20%2216c07be94cb9e-05ecf112418431-454c092b-1fa400-16c07be94cc690%22%7D; zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201563498026192%2C%22updated%22%3A%201563498026197%2C%22info%22%3A%201563498026194%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22www.qichacha.com%22%7D; hasShow=1',
            }
 
    try:
        response = requests.get(url,headers = headers)
        if response.status_code != 200:
            response.encoding = 'utf-8'
            print(response.status_code)
            print('ERROR')    
        soup = pq(response.text)
        com_all_info = soup.find(".m_srchList")
        
    except Exception:
        print('请求都不让,这企查查是想逆天吗???')
    try:
        com_all_info_array = com_all_info.find("tr")
        print('开始爬取数据,请勿打开excel')
        for tr in com_all_info_array.items():
            mtxs = tr.find(".m-t-xs")
            temp_g_name = tr.find(".ma_h1").text()    #获取公司名
            temp_g_tag = tr.find("p").eq(3).text().split(u"品牌/产品:")    #获取公司标签
            print(temp_g_tag)
            temp_r_name = mtxs.eq(0).find("a").text()    #获取法人名
            temp_g_money = mtxs.eq(0).find(".m-l").eq(0).text().replace(u"注册资本:", "")   #获取注册资本
            temp_g_date = mtxs.eq(0).find(".m-l").eq(1).text().replace(u"成立日期:", "")    #获取公司注册时间
            two = mtxs.eq(1).text().split(u"电话:")
            temp_r_email = two[0]   #获取法人Email
            temp_r_phone = two[1]    #获取法人手机号
            temp_g_addr = mtxs.eq(2).text()    #获取公司地址
            temp_g_state = tr.find(".nstatus").text()  #获取公司状态
                
            g_name_list.append(temp_g_name)
            g_tag_list.append(temp_g_tag)
            r_name_list.append(temp_r_name)
            g_money_list.append(temp_g_money)
            g_date_list.append(temp_g_date)
            r_email_list.append(temp_r_email)
            r_phone_list.append(temp_r_phone)
            g_addr_list.append(temp_g_addr)
            g_state_list.append(temp_g_state)
             
#            except Exception:
#                print('错误!')
    except Exception:
        print('好像被拒绝访问了呢...请稍后再试叭...')
         
if __name__ == '__main__':
    global g_name_list
    global g_tag_list
    global r_name_list
    global g_money_list
    global g_date_list
    global r_email_list
    global r_phone_list
    global g_addr_list
    global g_state_list
     
    g_name_list=[]
    g_tag_list=[]
    r_name_list=[]
    g_money_list=[]
    g_date_list=[]
    r_email_list=[]
    r_phone_list=[]
    g_addr_list=[]
    g_state_list=[]
    print(g_name_list)
    key_word = input('请输入您想搜索的关键词:')
    num = int(input('请输入您想检索的次数:'))+1
    sleep_time = int(input('请输入每次检索延时的秒数:'))
     
    key_word = urllib.parse.quote(key_word)
     
    print('正在搜索,请稍后')
     
    for x in range(1,num):
        url = r'https://www.qichacha.com/search_index?key={}&ajaxflag=1&p={}&'.format(key_word,x)
        s1 = craw(url,key_word,x)
        time.sleep(sleep_time)
    workbook = xlwt.Workbook()
    #创建sheet对象,新建sheet
    sheet1 = workbook.add_sheet('企查查数据', cell_overwrite_ok=True)
    #---设置excel样式---
    #初始化样式
    style = xlwt.XFStyle()
    #创建字体样式
    font = xlwt.Font()
    font.name = '仿宋'
#    font.bold = True #加粗
    #设置字体
    style.font = font
    #使用样式写入数据
    print('正在存储数据,请勿打开excel')
    #向sheet中写入数据
    name_list = ['公司名字','品牌/产品','法定法人','注册资本','成立日期','法人邮箱','法人电话','公司地址','公司状态']
    for cc in range(0,len(name_list)):
        sheet1.write(0,cc,name_list[cc],style)
    for i in range(0,len(g_name_list)):
        print(g_name_list[i])
        sheet1.write(i+1,0,g_name_list[i],style)#公司名字
        sheet1.write(i+1,1,g_tag_list[i],style)#公司标签
        sheet1.write(i+1,2,r_name_list[i],style)#法定法人
        sheet1.write(i+1,3,g_money_list[i],style)#注册资本
        sheet1.write(i+1,4,g_date_list[i],style)#成立日期
        sheet1.write(i+1,5,r_email_list[i],style)#法人邮箱
        sheet1.write(i+1,6,r_phone_list[i],style)#法人电话
        sheet1.write(i+1,7,g_addr_list[i],style)#公司地址
        sheet1.write(i+1,8,g_state_list[i],style)#公司状态
    #保存excel文件,有同名的直接覆盖
    workbook.save(r"D:\test .xls")
    print('保存完毕~')

 

  • 6
    点赞
  • 33
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
### 回答1: 要使用Python爬取企查查数据,可以按照以下步骤进行操作: 1. 导入所需的Python库,如requests、beautifulsoup和pandas。 ``` import requests from bs4 import BeautifulSoup import pandas as pd ``` 2. 构造请求URL,可以通过企查查网站检索得到相应公司的URL,或者直接拼接URL。 ``` url = "https://www.qichacha.com/company_search?key=关键词" ``` 3. 发送HTTP请求,获取网页内容。 ``` headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"} response = requests.get(url, headers=headers) ``` 4. 使用BeautifulSoup解析网页内容,提取所需数据。 ``` soup = BeautifulSoup(response.text, 'html.parser') # 定位到要提取的数据的HTML标签 data = soup.find_all('div', class_='search_list')[0].find_all('tr') ``` 5. 将提取的数据存储到DataFrame中,方便后续处理和分析。 ``` records = [] for tr in data: record = [] for td in tr.find_all('td'): record.append(td.text.strip()) records.append(record) df = pd.DataFrame(records) ``` 6. 对DataFrame进行必要的数据清洗和处理。 ``` df.columns = df.iloc[0] # 将第一行作为列名 df = df[1:] # 去除第一行数据 ``` 7. 可以选择将处理后的数据保存到本地文件,或者进行进一步的分析和可视化展示。 ``` df.to_csv('企查查数据.csv', index=False) ``` 注意:爬取网站数据要遵守相关法律法规和网站的使用条款,尊重网站规则并使用爬虫技术进行合法合规的数据获取。 ### 回答2: Python可以使用多种库进行网页数据爬取,其中比较常用的是BeautifulSoup和Selenium。 首先,我们需要安装相应的库,可以使用pip install进行安装。 ```python pip install beautifulsoup4 pip install selenium ``` 然后,我们需要导入所需的库。 ```python from selenium import webdriver from bs4 import BeautifulSoup import time ``` 接着,我们需要设置webdriver的路径,并实例化一个浏览器对象。 ```python # 设置webdriver的路径 driver_path = "chromedriver_path/chromedriver" # 实例化一个浏览器对象,这里以Chrome为例 browser = webdriver.Chrome(executable_path=driver_path) ``` 接下来,我们需要打开企查查的网页并进行相关操作,这里以搜索公司信息为例。 ```python # 打开企查查网页 url = "https://www.qcc.com" browser.get(url) # 找到搜索框并输入关键词 search_input = browser.find_element_by_id("headerKey") search_input.send_keys("公司名") # 找到搜索按钮并点击 search_button = browser.find_element_by_id("index-getData") search_button.click() # 等待页面加载 time.sleep(5) ``` 页面加载完成后,我们可以通过BeautifulSoup来解析所需的数据。 ```python # 获取页面源代码 page_source = browser.page_source # 使用BeautifulSoup解析数据 soup = BeautifulSoup(page_source, "html.parser") # 找到所需的数据,并进行相应的处理 data = soup.find_all("div", class_="result-list") ``` 最后,我们需要关闭浏览器。 ```python browser.quit() ``` 这样,我们就完成了使用Python爬取企查查数据的过程。可以根据需求对代码进行相应的修改和优化。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值