Web Scraping in Practice: Tianyancha Company Information


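The script below reads a list of company names from an Excel sheet, resolves each name to a Tianyancha company id via the site search, and then collects business-registration, risk, operating, tax-credit-rating, certificate, construction-qualification and judicial information from the company pages and the capi endpoints. Results are written back to info.json after every company, and companies that fail on the first pass are retried once at the end.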
# -*- coding: utf-8 -*-
import urllib.request
import re
from bs4 import BeautifulSoup
import pandas as pd
import os
import json
import numpy as np
import time

class SpiderSupplier:
    
    # Initialize: build the Tianyancha search URL and look up the company id
    def __init__(self, company):
        url = 'https://www.tianyancha.com/search?key={}'.format(company)
        self.url = urllib.request.quote(url, safe=";/?:@&=+$,", encoding='utf-8')

        self.company_id = None
        for i in range(10):
            try:
                # The first search result links to /company/<id>; take the id from the href
                self.company_id = \
                    self.get_soup(self.url).find_all("a", class_="index_alink__zcia5 link-click")[0].get('href').split('/')[-1]
                print(company, "  company_id fetched on attempt %d" % (i + 1))
                break
            except Exception:
                print(company, "  company_id not available, attempt %d failed, retrying..." % (i + 1))
                time.sleep(60)
                continue
        self.info = {}

    # Fetch a URL and return its BeautifulSoup, retrying on connection errors
    def get_soup(self, url):

        while True:
            try:
                # Replace 'COOKIES' with one or more real logged-in Tianyancha cookie strings
                cookie = np.random.choice(
                    ['COOKIES']
                )
                usr_agent = np.random.choice(
                    [
                        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
                    ]
                )

                head = {
                    'User-Agent': usr_agent,
                    # Round-trip through latin-1 so non-ASCII cookie characters survive the HTTP header
                    'cookie': cookie.encode('utf-8').decode('latin1')}

                request = urllib.request.Request(url, headers=head)
                response = urllib.request.urlopen(request)
                break
            except Exception:
                print(url, "   connection failed, retrying...")
                time.sleep(5)
                continue

        html = response.read().decode('utf-8')
        soup = BeautifulSoup(html, 'html.parser')
        return soup

    # Return the value, or an empty string if it cannot be read
    def get_else(self, rule):
        try:
            return rule
        except Exception:
            return ''

    # Return the index where the leading text ends and the digits begin
    def split_text_num(self, s):
        for idx in range(len(s)):
            if s[idx].isdigit():
                return idx
        return len(s)

    # Parse the basic business-registration info embedded in the page's __NEXT_DATA__ JSON
    def get_business_info(self, soup, info=None):
        if info is None:
            info = {}
        text = json.loads(soup.find_all('script', {'id':'__NEXT_DATA__'})[0].text)['props']['pageProps']['dehydratedState']['queries'][0]['state']['data']['data']

        # Copy each field of interest; fall back to '' when it is missing
        for i in ['regStatus', 
                 'emailList',
                 'companyShowBizTypeName',
                'regCapitalLabel',
                 'companyProfilePlainText',
                 'approvedTime',
                 'industry2017',
                 'businessScope',
                 'taxNumber',
                  'regCapitalCurrency',
                 'regCapitalAmount',
                  'taxQualification',
                 'name',
                 'baseInfo',
                 'regCapital',
                 'staffNumRange',
                 'industry',
                 'legalPersonName',
                 'regNumber',
                 'creditCode',
                 'fromTime',
                 'socialStaffNum',
                 'companyOrgType',
                 'taxAddress',
                 'actualCapital',
                 'estiblishTime',
                 'taxBankAccount',
                 'regLocation']:
            try:
                # Timestamp fields come back in milliseconds; convert to YYYY-MM-DD
                if 'Time' in i:
                    info[i] = time.strftime("%Y-%m-%d", time.localtime(int(text[i] / 1000)))
                    continue
                info[i] = text[i]
            except Exception:
                info[i] = ''
        return info

    def get_risk_info(self, soup, info):
        # Risk items render as "<4-character label><count>"; split label and value
        for i in soup.find_all("div", {"class": 'Risk_risk-item__G6j6A'}):
            i_text = i.text
            if i_text[:4]:
                info[i_text[:4]] = self.get_else(i_text[4:])
        return info

    def get_manage_info(self, soup, info):
        # Operating info: each tab label is "<text><number>"; split at the first digit
        for i in soup.find_all('a', {'class': 'index_tag-nav-item__JZafL'}):
            i_text = i.text
            if i_text != '':
                idx = self.split_text_num(i_text)
                info[i_text[:idx]] = self.get_else(i_text[idx:])
        return info

    def get_sifa_info(self, soup, info):
        # Judicial info: same tab structure as the operating page
        for i in soup.find_all("a", {"class": 'index_tag-nav-item__JZafL'}):
            i_text = i.text
            if i_text != '':
                idx = self.split_text_num(i_text)
                info[i_text[:idx]] = self.get_else(i_text[idx:])
        return info

    # Run the full crawl for this company
    def run(self):
        # Business registration info
        business_info = self.get_soup('https://www.tianyancha.com/company/{}'.format(self.company_id))
        self.info = self.get_business_info(business_info, self.info)

        # Risk info
        risk_soup = self.get_soup('https://www.tianyancha.com/company/{}/jingxian'.format(self.company_id))
        self.info = self.get_risk_info(risk_soup, self.info)

        # Operating info
        run_soup = self.get_soup('https://www.tianyancha.com/company/{}/jingzhuang'.format(self.company_id))
        self.info = self.get_manage_info(run_soup, self.info)

        # Tax credit rating (the capi endpoint returns JSON, so the "soup" is parsed as plain text)
        tax_soup = self.get_soup(
            'https://capi.tianyancha.com/cloud-business-state/v3/ar/taxcred?gid={}&pageSize=10&pageNum=1'.format(
                self.company_id))
        tax_info = json.loads(str(tax_soup))
        try:
            self.info['税务评级'] = tax_info['data']['items'][0]['grade']
        except Exception:
            self.info['税务评级'] = ''

        # Certificates (keep only those still valid past 2023-05-01)
        cet_soup = self.get_soup(
            'https://capi.tianyancha.com/cloud-business-state/certificate/list?graphId={}&pageSize=10&pageNum=1&type='.format(
                self.company_id))
        cet_info = json.loads(str(cet_soup))
        try:
            self.info['资格证书'] = [i['certificateName'] + ":" + i['certNo'] for i in cet_info['data']['resultList'] if
                                 i['endDate'] > "2023-05-01"]
        except Exception:
            self.info['资格证书'] = ''

        # Construction qualifications
        qual_soup = self.get_soup(
            'https://capi.tianyancha.com/cloud-company-background/construct/getQualificationList.json?gid={}&pageNum=1&pageSize=10&type='.format(
                self.company_id))
        try:
            qual_info = [i['certificateNum'] + '-' + i['qualificationName']
                         for i in json.loads(str(qual_soup))['data']['result']]
            self.info['建筑资质'] = qual_info
        except Exception:
            self.info['建筑资质'] = ''

        # Judicial info
        sifa_soup = self.get_soup('https://www.tianyancha.com/company/{}/sifa'.format(self.company_id))
        self.info = self.get_sifa_info(sifa_soup, self.info)

        return self.info

if __name__ == '__main__':
    df = pd.read_excel('客户清单pd.xlsx', index_col=0)
    # Seed info.json so later reads always find a valid JSON object
    with open('info.json', 'w', encoding='utf-8') as f:
        json.dump({'西安大地工程检测有限公司': "1"}, f, ensure_ascii=False)

    fail_list = []

    for idx in df['签约对方'].index:
        try:
            print('*' * 20, idx, '-', df['签约对方'][idx], '*' * 20)
            s = SpiderSupplier(df['签约对方'][idx])
            a = s.run()
            print(idx, '-', df['签约对方'][idx], 'parsed successfully')
        except Exception:
            print(idx, '-', df['签约对方'][idx], 'failed to parse')
            fail_list.append(df['签约对方'][idx])
            a = None

        # Re-read, update and rewrite info.json after every company so progress is never lost
        with open('info.json', 'r', encoding='utf-8') as f:
            data = json.load(f)

        data[df['签约对方'][idx]] = a

        with open('info.json', 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False)

    # Retry the companies that failed on the first pass
    for idx in range(len(fail_list)):
        try:
            print('*' * 20, idx, '-', fail_list[idx], '*' * 20)
            s = SpiderSupplier(fail_list[idx])
            a = s.run()
            print(idx, '-', fail_list[idx], 'parsed successfully')
        except Exception:
            print(idx, '-', fail_list[idx], 'failed to parse')
            a = None

        with open('info.json', 'r', encoding='utf-8') as f:
            data = json.load(f)

        data[fail_list[idx]] = a

        with open('info.json', 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False)
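Once both loops finish, info.json maps every company name from the 签约对方 column to its scraped info dict, or None for companies that failed both passes. Below is a minimal post-processing sketch for flattening that file into a spreadsheet, assuming the crawl has completed; the output name 企业信息汇总.xlsx is illustrative, and writing .xlsx requires openpyxl.

import json
import pandas as pd

# Load the crawl results and keep only entries that are dicts
# (this skips None failures and the seed entry written before the crawl)
with open('info.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
rows = {name: info for name, info in data.items() if isinstance(info, dict)}

# One row per company, one column per scraped field
result = pd.DataFrame.from_dict(rows, orient='index')
result.index.name = '签约对方'

# Excel cells cannot hold Python lists, so join list-valued fields (e.g. 资格证书) into strings
result = result.applymap(lambda v: '; '.join(map(str, v)) if isinstance(v, list) else v)
result.to_excel('企业信息汇总.xlsx')  # hypothetical output filename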