Scraping company information from Tianyancha (天眼查) with Python
IP: uses Xiaoxiang proxy (小象代理) IPs
Company names are read from name.txt; the scraped results are written to data.csv
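For reference, name.txt is expected to hold one company name per line, and the script creates data.csv with the header row below (column order matches the dict built in thread_work):
companyName,exist,qccUrl,regStatus,estiblishTime,regLocation,province,companyOrgType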
import requests
from lxml import etree
import queue
import threading
from termcolor import cprint
import csv
import os
def new_session():
    session = requests.session()
    while True:
        try:
            # https://www.qcc.com/  https://www.tianyancha.com/
            session.get(url='https://www.tianyancha.com/', headers=headers, timeout=(2, 2))
            cprint('new session', 'yellow')
            return session
        except Exception:
            cprint('network error', 'yellow')
def get_html(session, url, flag):
    while True:
        try:
            res = session.get(url=url, headers=headers, timeout=(5, 5), proxies=proxies).content.decode('utf-8', errors='ignore')
            cprint(res, 'red')  # debug: dump the raw response body
            if flag in res:
                return session, res
            # flag not found in the page, so the session is probably blocked: start a fresh one
            session = new_session()
            cprint('error response', 'red')
        except Exception:
            cprint('network timeout', 'yellow')
def analyze_out(res, name, dic):
    # Parse the search-result page; only fill the outer fields when the first hit matches the name exactly
    tree = etree.HTML(res)
    cprint(tree.text, 'red')  # debug
    # dizhi = tree.xpath('/html/body/div/div[2]/div/div[2]/section/main/div[2]/div[2]/div[1]/div/div[2]/div[2]/div[1]/div[1]/a/@href')
    # cprint("dizhi" + dizhi, 'green')
    table = tree.xpath('//table[@class="ntable ntable-list"]/tr')
    if not table:
        return dic
    item = table[0]
    res_name = ''.join(item.xpath('.//span[@class="copy-title"]/a//text()'))
    if res_name != name:
        return dic
    dic['exist'] = '是'
    dic['qccUrl'] = (item.xpath('/html/body/div/div[2]/div/div[2]/section/main/div[2]/div[2]/div[1]/div/div[2]/div[2]/div[1]/div[1]/a/@href') + [''])[0]
    dic['regStatus'] = (item.xpath('.//span[@class="copy-title"]/span/text()') + [''])[0]
    dic['estiblishTime'] = (item.xpath('.//span[@class="f"]/span/text()') + [''] * 2)[1]
    dic['regLocation'] = ''.join(item.xpath('.//span[@class="copy-value address-map"]//text()'))
    return dic
def analyze_ins(res, dic):
    # Parse the company detail page for industry (所属行业) and region (所属地区)
    tree = etree.HTML(res)
    tds = tree.xpath('//div[@class="cominfo-normal"]/table/tr/td')
    i = 0
    for td in tds:
        if '所属行业' in td.xpath('./text()'):
            dic['companyOrgType'] = (tds[i + 1].xpath('./text()') + [''])[0].strip()
        if '所属地区' in td.xpath('./text()'):
            dic['province'] = (tds[i + 1].xpath('./text()') + [''])[0]
        i += 1
    return dic
def thread_work(num):
    global totalNum
    session = new_session()
    while True:
        try:
            name = nameQueue.get_nowait()
        except queue.Empty:
            break
        cprint(name, 'red')
        dic = {
            'companyName': name,
            'exist': '否',
            'qccUrl': '',
            'regStatus': '',
            'estiblishTime': '',
            'regLocation': '',
            'province': '',
            'companyOrgType': ''
        }
        url = f'https://www.tianyancha.com/search?key={name}'
        flag = f'<title>{name}_相关搜索结果-天眼查</title>'
        # url = f'https://www.qcc.com/search?key={name}'
        # flag = f'<title>{name}_相关搜索结果-天眼查</title>'
        cprint('fetching search page', 'red')
        session, res = get_html(session, url, flag)
        dic = analyze_out(res, name, dic)
        if dic['exist'] == '是' and dic['qccUrl']:
            url = dic['qccUrl']
            flag = f'<title>{name} -企查查</title>'
            session, res = get_html(session, url, flag)
            dic = analyze_ins(res, dic)
        with writeLock:  # one shared lock so CSV rows from different threads do not interleave
            writer.writerow(list(dic.values()))
            totalNum += 1
            cprint(f'thread: {num} finishNum: {totalNum} data: {dic}', 'green')
if __name__ == '__main__':
    threadNum = 5
    totalNum = 0
    writeLock = threading.Lock()  # shared by all worker threads when writing CSV rows
    cprint(f'program starts with {threadNum} threads', 'green')
    ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
    headers = {'User-Agent': ua}
    # 小象代理 (Xiaoxiang proxy)
    proxyMeta = 'http://%(user)s:%(pass)s@%(host)s:%(port)s' % {
        'host': 'http-short.xiaoxiangdaili.com',
        'port': 10010,
        'user': '1006461268607258624',
        'pass': 'RMnMPVHr'
    }
    proxies = {
        'http': proxyMeta,
        'https': proxyMeta
    }
    name_list = [name.strip() for name in open(r'C:\Users\Xiao\Desktop\qicc_news-master\src\企业查询\name.txt', encoding='utf-8').readlines() if name.strip()]
    cprint(name_list, 'red')
    if not os.path.exists('data.csv'):
        open('data.csv', 'w', encoding='utf-8-sig').write(
            'companyName,exist,qccUrl,regStatus,estiblishTime,regLocation,province,companyOrgType\n')
    writer = csv.writer(open('data.csv', 'a', encoding='utf-8-sig', newline=''))
    nameQueue = queue.Queue()
    for name in name_list:
        nameQueue.put(name)
    threadList = []
    for n in range(threadNum):
        t = threading.Thread(target=thread_work, args=(n + 1,))
        threadList.append(t)
        t.start()
    for t in threadList:
        t.join()
    cprint('program finished', 'green')
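Below is a minimal, optional sketch for checking that the Xiaoxiang short-term proxy credentials still work before launching the full crawl. It is not part of the script above: httpbin.org is only an assumed test endpoint, and the host/port/user/pass are the same placeholders hard-coded in the main block.

# Sanity-check the proxy: if it works, the returned origin IP should differ from your own public IP.
import requests

proxyMeta = 'http://%(user)s:%(pass)s@%(host)s:%(port)s' % {
    'host': 'http-short.xiaoxiangdaili.com',
    'port': 10010,
    'user': '1006461268607258624',
    'pass': 'RMnMPVHr'
}
proxies = {'http': proxyMeta, 'https': proxyMeta}

try:
    # httpbin.org/ip simply echoes the IP the request came from (assumed test endpoint)
    r = requests.get('https://httpbin.org/ip', proxies=proxies, timeout=(5, 5))
    print(r.status_code, r.text)
except requests.RequestException as e:
    print('proxy check failed:', e)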